Example No. 1
    def executeInternal(self, apmon, *args, **kwargs):
        #Marco: I guess these value errors only happen for development instances
        if 'task' not in kwargs:
            raise ValueError("No task specified.")
        self.task = kwargs['task']
        if 'tm_taskname' not in self.task:
            raise ValueError("No taskname specified")
        self.workflow = self.task['tm_taskname']
        if 'user_proxy' not in self.task:
            raise ValueError("No proxy provided")
        self.proxy = self.task['user_proxy']

        self.logger.info("About to kill workflow: %s." % self.workflow)

        self.workflow = str(self.workflow)
        if not WORKFLOW_RE.match(self.workflow):
            raise Exception("Invalid workflow name.")

        # Query HTCondor for information about running jobs and update Dashboard appropriately
        if self.task['tm_collector']:
            self.backendurls['htcondorPool'] = self.task['tm_collector']
        loc = HTCondorLocator.HTCondorLocator(self.backendurls)

        address = ""
        try:
            self.schedd, address = loc.getScheddObjNew(self.task['tm_schedd'])
        except Exception as exp:
            msg  = "The CRAB server backend was not able to contact the Grid scheduler."
            msg += " Please try again later."
            msg += " If the error persists send an e-mail to %s." % (FEEDBACKMAIL)
            msg += " Message from the scheduler: %s" % (str(exp))
            self.logger.exception("%s: %s" % (self.workflow, msg))
            raise TaskWorkerException(msg)

        try:
            hostname = socket.getfqdn()
        except:
            hostname = ''

        const = 'CRAB_ReqName =?= %s && TaskType=?="Job"' % HTCondorUtils.quote(self.workflow)
        try:
            for ad in list(self.schedd.xquery(const, ['CRAB_Id', 'CRAB_Retry'])):
                if ('CRAB_Id' not in ad) or ('CRAB_Retry' not in ad):
                    continue
                jobid = str(ad.eval('CRAB_Id'))
                jobretry = str(ad.eval('CRAB_Retry'))
                jinfo = {'broker': hostname,
                         'bossId': jobid,
                         'StatusValue': 'killed',
                        }
                insertJobIdSid(jinfo, jobid, self.workflow, jobretry)
                self.logger.info("Sending kill info to Dashboard: %s" % str(jinfo))
                apmon.sendToML(jinfo)
        except:
            self.logger.exception("Failed to notify Dashboard of job kills") #warning

        # Note that we can not send kills for jobs not in queue at this time; we'll need the
        # DAG FINAL node to be fixed and the node status to include retry number.
        return self.killAll(const)
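
A small, self-contained sketch (not taken from the original code) of the ClassAd constraint built above. It assumes that classad.quote behaves like HTCondorUtils.quote, i.e. that both return a properly quoted ClassAd string literal; the workflow name is made up.

import classad

workflow = '170101_120000:username_crab_mytask'   # hypothetical workflow name
const = 'CRAB_ReqName =?= %s && TaskType=?="Job"' % classad.quote(workflow)
print(const)
# CRAB_ReqName =?= "170101_120000:username_crab_mytask" && TaskType=?="Job"
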
Example No. 2
    def update_dashboard(self, crab_retry):
        """
        Need a doc string here.
        """
        if not self.task_ad:
            return
        params = {'tool': 'crab3',
                  'SubmissionType': 'crab3',
                  'JSToolVersion': '3.3.0',
                  'tool_ui': os.environ.get('HOSTNAME', ''),
                  'scheduler': 'GLIDEIN',
                  'GridName': self.task_ad['CRAB_UserDN'],
                  'ApplicationVersion': self.task_ad['CRAB_JobSW'],
                  'taskType': self.task_ad.get("CRAB_DashboardTaskType", 'analysistest'),
                  'vo': 'cms',
                  'CMSUser': self.task_ad['CRAB_UserHN'],
                  'user': self.task_ad['CRAB_UserHN'],
                  'taskId': self.task_ad['CRAB_ReqName'],
                  'datasetFull': self.task_ad['DESIRED_CMSDataset'],
                  'resubmitter': self.task_ad['CRAB_UserHN'],
                  'exe': 'cmsRun',
                  'broker': self.backend,
                  'bossId': str(self.job_id),
                  'localId': '',
                  'SyncGridJobId': 'https://glidein.cern.ch/%s/%s' % (self.job_id, self.task_ad['CRAB_ReqName'].replace("_", ":")),
                 }

        if not self.userWebDirPrx:
            storage_rules = htcondor.param['CRAB_StorageRules']
            self.userWebDirPrx = getWebdirForDb(
                str(self.task_ad.get('CRAB_ReqName')), storage_rules)

        self.logger.info("User web dir: %s", self.userWebDirPrx)

        insertJobIdSid(params, self.job_id, self.task_ad['CRAB_ReqName'],
                       crab_retry)
Example No. 3
    def update_dashboard(self, crab_retry):
        """
        Need a doc string here.
        """
        if not self.task_ad:
            return
        params = {'tool': 'crab3',
                  'SubmissionType': 'crab3',
                  'JSToolVersion': '3.3.0',
                  'tool_ui': os.environ.get('HOSTNAME', ''),
                  'scheduler': 'GLIDEIN',
                  'GridName': self.task_ad['CRAB_UserDN'],
                  'ApplicationVersion': self.task_ad['CRAB_JobSW'],
                  'taskType': self.task_ad.get("CRAB_DashboardTaskType", 'analysistest'),
                  'vo': 'cms',
                  'CMSUser': self.task_ad['CRAB_UserHN'],
                  'user': self.task_ad['CRAB_UserHN'],
                  'taskId': self.task_ad['CRAB_ReqName'],
                  'datasetFull': self.task_ad['DESIRED_CMSDataset'],
                  'resubmitter': self.task_ad['CRAB_UserHN'],
                  'exe': 'cmsRun',
                  'broker': self.backend,
                  'bossId': str(self.job_id),
                  'localId': '',
                  'SyncGridJobId': 'https://glidein.cern.ch/%s/%s' % (self.job_id, self.task_ad['CRAB_ReqName'].replace("_", ":")),
                 }

        storage_rules = htcondor.param['CRAB_StorageRules']
        userWebDir = getWebdirForDb(str(self.task_ad.get('CRAB_ReqName')), storage_rules)

        userWebDirPrx = ""
        try:
            with open('proxied_webdir') as fd:
                proxied_webdir = fd.read()
            userWebDirPrx = proxied_webdir
        except IOError as e:
            self.logger.error(("'I/O error(%s): %s', when looking for the proxied_webdir file. Might be normal"
                         " if the schedd does not have a proxiedurl in the REST external config." % (e.errno, e.strerror)))

        self.logger.info("User web dir proxy: " + userWebDirPrx)
        self.logger.info("web dir: " + userWebDir)

        if userWebDirPrx:
            setDashboardLogs(params, userWebDirPrx, self.job_id, crab_retry)
        elif userWebDir:
            setDashboardLogs(params, userWebDir, self.job_id, crab_retry)
        else:
            print("Not setting dashboard logfiles as I cannot find CRAB_UserWebDir nor CRAB_UserWebDirPrx.")

        insertJobIdSid(params, self.job_id, self.task_ad['CRAB_ReqName'], crab_retry)
        apmon = ApmonIf()
        self.logger.debug("Dashboard task info: %s" % str(params))
        apmon.sendToML(params)
        apmon.free()
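
Every example on this page delegates the Dashboard job identifiers to insertJobIdSid, whose body is not shown here. The sketch below only mirrors the 'https://glidein.cern.ch/<job_id>/<task>' convention already visible in the SyncGridJobId parameter above; the field names it sets and their exact composition are assumptions, not the real implementation.

def insert_job_id_sid_sketch(params, job_id, req_name, crab_retry):
    # Hypothetical helper illustrating the Dashboard ID scheme used above.
    # The real insertJobIdSid is defined elsewhere in the CRAB code base.
    sid = 'https://glidein.cern.ch/%s/%s' % (job_id, req_name.replace("_", ":"))
    params['sid'] = sid                                       # assumed field name
    params['jobId'] = '%s_%s_%s' % (job_id, sid, crab_retry)  # assumed composition
    return params
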
Example No. 4
    def createSubdag(self, splitterResult, **kwargs):

        startjobid = 0
        dagSpecs = []

        if hasattr(self.config.TaskWorker, 'stageoutPolicy'):
            kwargs['task']['stageoutpolicy'] = ",".join(
                self.config.TaskWorker.stageoutPolicy)
        else:
            kwargs['task']['stageoutpolicy'] = "local,remote"

        ## In the future this parameter may be set by the user in the CRAB configuration
        ## file and we would take it from the Task DB.
        kwargs['task']['numautomjobretries'] = getattr(self.config.TaskWorker,
                                                       'numAutomJobRetries', 2)

        info = self.makeJobSubmit(kwargs['task'])

        outfiles = kwargs['task']['tm_outfiles'] + kwargs['task'][
            'tm_tfile_outfiles'] + kwargs['task']['tm_edm_outfiles']

        os.chmod("CMSRunAnalysis.sh", 0o755)

        # This config setting acts as a global black list
        global_blacklist = set(self.getBlacklistedSites())
        self.logger.debug("CRAB site blacklist: %s" % (list(global_blacklist)))

        # This is needed for Site Metrics
        # It should not block any site for Site Metrics and if needed for other activities
        # self.config.TaskWorker.ActivitiesToRunEverywhere = ['hctest', 'hcdev']
        if hasattr(self.config.TaskWorker, 'ActivitiesToRunEverywhere') and \
                   kwargs['task']['tm_activity'] in self.config.TaskWorker.ActivitiesToRunEverywhere:
            global_blacklist = set()
            self.logger.debug("Ignoring the CRAB site blacklist.")

        sitead = classad.ClassAd()
        siteinfo = {'group_sites': {}, 'group_datasites': {}}

        blocksWithNoLocations = set()

        siteWhitelist = set(kwargs['task']['tm_site_whitelist'])
        siteBlacklist = set(kwargs['task']['tm_site_blacklist'])
        self.logger.debug("Site whitelist: %s" % (list(siteWhitelist)))
        self.logger.debug("Site blacklist: %s" % (list(siteBlacklist)))

        if siteWhitelist & global_blacklist:
            msg = "The following sites from the user site whitelist are blacklisted by the CRAB server: %s." % (
                list(siteWhitelist & global_blacklist))
            msg += " Since the CRAB server blacklist has precedence, these sites are not considered in the user whitelist."
            self.uploadWarning(msg, kwargs['task']['user_proxy'],
                               kwargs['task']['tm_taskname'])
            self.logger.warning(msg)

        if siteBlacklist & siteWhitelist:
            msg = "The following sites appear in both the user site blacklist and whitelist: %s." % (
                list(siteBlacklist & siteWhitelist))
            msg += " Since the whitelist has precedence, these sites are not considered in the blacklist."
            self.uploadWarning(msg, kwargs['task']['user_proxy'],
                               kwargs['task']['tm_taskname'])
            self.logger.warning(msg)

        ignoreLocality = kwargs['task']['tm_ignore_locality'] == 'T'
        self.logger.debug("Ignore locality: %s" % (ignoreLocality))

        for jobgroup in splitterResult:
            jobs = jobgroup.getJobs()

            blocks = set()
            for job in jobs:
                for inputfile in job['input_files']:
                    blocks.add(inputfile['block'])
            self.logger.debug("Blocks: %s" % list(blocks))

            if not jobs:
                locations = set()
            else:
                locations = set(jobs[0]['input_files'][0]['locations'])
            self.logger.debug("Locations: %s" % (list(locations)))

            ## Discard the blocks that have no locations. This can happen when a block is
            ## still open in PhEDEx. Newly created datasets from T0 (at least) have a large
            ## chance of having some block which is closed in DBS but not in PhEDEx.
            ## Open blocks in PhEDEx can have a location; it is WMCore that is returning no
            ## location.
            ## This is how a block is constructed during data taking:
            ## 1) an open block in T0 is injected in PhEDEx;
            ## 2) files are being added to the block in T0;
            ## 3) data are transferred by PhEDEx if a subscription is present;
            ## 4) once the block is finished:
            ##   a) the block is inserted into DBS as a closed block (before this, DBS has
            ##      no knowledge about the block);
            ##   b) block is closed in PhEDEx.
            if not locations and not ignoreLocality:
                blocksWithNoLocations = blocksWithNoLocations.union(blocks)
                continue

            if ignoreLocality:
                sbj = SiteDB.SiteDBJSON({
                    "key": self.config.TaskWorker.cmskey,
                    "cert": self.config.TaskWorker.cmscert
                })
                try:
                    possiblesites = set(sbj.getAllCMSNames())
                except Exception as ex:
                    msg = "The CRAB3 server backend could not contact SiteDB to get the list of all CMS sites."
                    msg += " This could be a temporary SiteDB glitch."
                    msg += " Please try to submit a new task (resubmit will not work)"
                    msg += " and contact the experts if the error persists."
                    msg += "\nError reason: %s" % (
                        str(ex)
                    )  #TODO add the sitedb url so the user can check themselves!
                    raise TaskWorker.WorkerExceptions.TaskWorkerException(msg)
            else:
                possiblesites = locations
            ## At this point 'possiblesites' should never be empty.
            self.logger.debug("Possible sites: %s" % (list(possiblesites)))

            ## Apply the global site blacklist.
            availablesites = possiblesites - global_blacklist

            ## TODO: The messages below do not clarify that only the part of the
            ## dataset that passed the lumi-mask/run-range selection matters here.

            ## Abort the submission of the task if (part of?) the dataset is available only
            ## on sites that are blacklisted by the CRAB server.
            ## Or should we submit at least the jobs on the part of the dataset that
            ## survives the blacklisting? Comment S.Belforte Sep,2015: So far DDM policy
            ## is to replicate entire datasets, not scatter them around. Once we will have
            ## very large datasets that can happen, but it is not the case now.
            if not availablesites:
                msg = "The CRAB server backend refuses to send jobs to the Grid scheduler."
                msg += " No site available for submission of task %s" % (
                    kwargs['task']['tm_taskname'])
                msg += "\n\t\t\t\tThe sites available for submission of task %s are blacklisted by the CRAB3 server." % (
                    kwargs['task']['tm_taskname'])
                msg += "\n\t\t\t\tThis is the list of in principle available sites: %s" % (
                    list(possiblesites))
                msg += "\n\t\t\t\tThis is the list of sites that are blacklisted by the CRAB3 server: %s" % (
                    list(global_blacklist))
                raise TaskWorker.WorkerExceptions.NoAvailableSite(msg)

            ## Abort the submission of the task if (part of?) the dataset is available only
            ## on sites that are removed after applying the user site blacklist/whitelist.
            ## Or should we submit at least the jobs on the part of the dataset that
            ## survives the blacklisting/whitelisting? (See S.Belforte comment above.)
            # NOTE: User can still shoot themselves in the foot with the resubmit blacklist
            # However, this is the last chance we have to warn the users about an impossible task at submit time.
            available = set(availablesites)
            if siteWhitelist:
                available &= siteWhitelist
                if not available:
                    msg = "The CRAB server backend refuses to send jobs to the Grid scheduler."
                    msg += " You put %s as site whitelist," % (
                        list(siteWhitelist))
                    msg += " but the input dataset '%s' can only be accessed at these sites: %s." % (
                        kwargs['task']['tm_input_dataset'],
                        list(availablesites))
                    msg += " Please check your site whitelist."
                    raise TaskWorker.WorkerExceptions.NoAvailableSite(msg)
            available -= (siteBlacklist - siteWhitelist)
            if not available:
                msg = "The CRAB server backend refuses to send jobs to the Grid scheduler."
                msg += " You put %s as site blacklist," % (list(siteBlacklist -
                                                                siteWhitelist))
                msg += " when the input dataset '%s' can actually only be accessed at these sites: %s." % (
                    kwargs['task']['tm_input_dataset'], list(availablesites))
                msg += " Please check in DAS the locations of the input dataset."
                msg += " Hint: the ignoreLocality option might help."
                raise TaskWorker.WorkerExceptions.NoAvailableSite(msg)

            availablesites = [str(i) for i in availablesites]
            datasites = jobs[0]['input_files'][0]['locations']
            self.logger.info("Resulting available sites: %s" %
                             (list(availablesites)))

            if siteWhitelist or siteBlacklist:
                msg = "The site whitelist and blacklist will be applied by the pre-job."
                msg += " This is expected to result in DESIRED_SITES = %s" % (
                    list(available))
                self.logger.debug(msg)

            jobgroupDagSpecs, startjobid = self.makeDagSpecs(
                kwargs['task'], sitead, siteinfo, jobgroup,
                list(blocks)[0], availablesites, datasites, outfiles,
                startjobid)
            dagSpecs += jobgroupDagSpecs

        if not dagSpecs:
            msg = "No jobs created for task %s." % (
                kwargs['task']['tm_taskname'])
            if blocksWithNoLocations:
                msg = "The CRAB server backend refuses to send jobs to the Grid scheduler."
                msg += " No locations found for dataset '%s'" % (
                    kwargs['task']['tm_input_dataset'])
                msg += " (or at least for the part of the dataset that passed the lumi-mask and/or run-range selection)."
            raise TaskWorker.WorkerExceptions.NoAvailableSite(msg)
        if blocksWithNoLocations:
            msg = "The following blocks from dataset '%s' were skipped," % (
                kwargs['task']['tm_input_dataset'])
            msg += " because they have no locations: %s." % (sorted(
                list(blocksWithNoLocations)))
            self.uploadWarning(msg, kwargs['task']['user_proxy'],
                               kwargs['task']['tm_taskname'])
            self.logger.warning(msg)

        ## Write down the DAG as needed by DAGMan.
        dag = DAG_HEADER % {
            'resthost': kwargs['task']['resthost'],
            'resturiwfdb': kwargs['task']['resturinoapi'] + '/workflowdb'
        }
        for dagSpec in dagSpecs:
            dag += DAG_FRAGMENT % dagSpec

        ## Create a tarball with all the job lumi files.
        run_and_lumis_tar = tarfile.open("run_and_lumis.tar.gz", "w:gz")
        ## Also creating a tarball with the dataset input files.
        ## Each .txt file in the tarball contains a list of dataset files to be used for the job.
        input_files_tar = tarfile.open("input_files.tar.gz", "w:gz")
        for dagSpec in dagSpecs:
            job_lumis_file = 'job_lumis_' + str(dagSpec['count']) + '.json'
            job_input_file_list = 'job_input_file_list_' + str(
                dagSpec['count']) + '.txt'
            with open(job_lumis_file, "w") as fd:
                fd.write(str(dagSpec['runAndLumiMask']))
            with open(job_input_file_list, "w") as fd:
                fd.write(str(dagSpec['inputFiles']))
            run_and_lumis_tar.add(job_lumis_file)
            input_files_tar.add(job_input_file_list)
            os.remove(job_lumis_file)
            os.remove(job_input_file_list)
        run_and_lumis_tar.close()
        input_files_tar.close()

        ## Save the DAG into a file.
        with open("RunJobs.dag", "w") as fd:
            fd.write(dag)

        with open("site.ad", "w") as fd:
            fd.write(str(sitead))

        with open("site.ad.json", "w") as fd:
            json.dump(siteinfo, fd)

        task_name = kwargs['task'].get('CRAB_ReqName',
                                       kwargs['task'].get('tm_taskname', ''))
        userdn = kwargs['task'].get('CRAB_UserDN',
                                    kwargs['task'].get('tm_user_dn', ''))

        info["jobcount"] = len(dagSpecs)
        maxpost = getattr(self.config.TaskWorker, 'maxPost', 20)
        if maxpost == -1:
            maxpost = info['jobcount']
        elif maxpost == 0:
            maxpost = int(max(20, info['jobcount'] * .1))
        info['maxpost'] = maxpost

        if info.get('faillimit') == None:
            info['faillimit'] = -1
            #if info['jobcount'] > 200
            #    info['faillimit'] = 100
            #else:
            #    info['faillimit'] = -1
        elif info.get('faillimit') < 0:
            info['faillimit'] = -1

        # Info for ML:
        target_se = ''
        max_len_target_se = 900
        for site in map(str, availablesites):
            if len(target_se) > max_len_target_se:
                target_se += ',Many_More'
                break
            if len(target_se):
                target_se += ','
            target_se += site
        ml_info = info.setdefault('apmon', [])
        for idx in range(1, info['jobcount'] + 1):
            taskid = kwargs['task']['tm_taskname']
            jinfo = {
                'broker': os.environ.get('HOSTNAME', ''),
                'bossId': str(idx),
                'TargetSE': target_se,
                'localId': '',
                'StatusValue': 'pending',
            }
            insertJobIdSid(jinfo, idx, taskid, 0)
            ml_info.append(jinfo)

        # When running in standalone mode, we want to record the number of jobs in the task
        if ('CRAB_ReqName' in kwargs['task']) and ('CRAB_UserDN'
                                                   in kwargs['task']):
            const = 'TaskType =?= \"ROOT\" && CRAB_ReqName =?= "%s" && CRAB_UserDN =?= "%s"' % (
                task_name, userdn)
            cmd = "condor_qedit -const '%s' CRAB_JobCount %d" % (const,
                                                                 len(dagSpecs))
            self.logger.debug("+ %s" % cmd)
            status, output = commands.getstatusoutput(cmd)
            if status:
                self.logger.error(output)
                self.logger.error("Failed to record the number of jobs.")
                return 1

        return info, splitterResult
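
The whitelist/blacklist handling above reduces to a few set operations in which the user whitelist takes precedence over the user blacklist. A self-contained illustration with made-up site names:

availablesites = {'T2_IT_Bari', 'T2_DE_DESY', 'T2_US_MIT'}   # made-up example sites
siteWhitelist = {'T2_DE_DESY', 'T2_US_MIT'}
siteBlacklist = {'T2_US_MIT'}

available = set(availablesites)
if siteWhitelist:
    available &= siteWhitelist                    # keep only whitelisted sites
available -= (siteBlacklist - siteWhitelist)      # the whitelist wins over the blacklist
print(sorted(available))                          # ['T2_DE_DESY', 'T2_US_MIT']
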
Example No. 5
    def createSubdag(self, splitterResult, **kwargs):

        startjobid = 0
        dagSpecs = []

        if hasattr(self.config.TaskWorker, 'stageoutPolicy'):
            kwargs['task']['stageoutpolicy'] = ",".join(self.config.TaskWorker.stageoutPolicy)
        else:
            kwargs['task']['stageoutpolicy'] = "local,remote"

        ## In the future this parameter may be set by the user in the CRAB configuration
        ## file and we would take it from the Task DB.
        kwargs['task']['numautomjobretries'] = getattr(self.config.TaskWorker, 'numAutomJobRetries', 2)

        info = self.makeJobSubmit(kwargs['task'])

        outfiles = kwargs['task']['tm_outfiles'] + kwargs['task']['tm_tfile_outfiles'] + kwargs['task']['tm_edm_outfiles']

        os.chmod("CMSRunAnalysis.sh", 0o755)

        # This config setting acts as a global black list
        global_blacklist = set(self.getBlacklistedSites())
        self.logger.debug("CRAB site blacklist: %s" % (list(global_blacklist)))

        # This is needed for Site Metrics
        # It should not block any site for Site Metrics and if needed for other activities
        # self.config.TaskWorker.ActivitiesToRunEverywhere = ['hctest', 'hcdev']
        if hasattr(self.config.TaskWorker, 'ActivitiesToRunEverywhere') and \
                   kwargs['task']['tm_activity'] in self.config.TaskWorker.ActivitiesToRunEverywhere:
            global_blacklist = set()
            self.logger.debug("Ignoring the CRAB site blacklist.")

        sitead = classad.ClassAd()
        siteinfo = {'group_sites': {}, 'group_datasites': {}}

        blocksWithNoLocations = set()

        siteWhitelist = set(kwargs['task']['tm_site_whitelist'])
        siteBlacklist = set(kwargs['task']['tm_site_blacklist'])
        self.logger.debug("Site whitelist: %s" % (list(siteWhitelist)))
        self.logger.debug("Site blacklist: %s" % (list(siteBlacklist)))

        if siteWhitelist & global_blacklist:
            msg  = "The following sites from the user site whitelist are blacklisted by the CRAB server: %s." % (list(siteWhitelist & global_blacklist))
            msg += " Since the CRAB server blacklist has precedence, these sites are not considered in the user whitelist."
            self.uploadWarning(msg, kwargs['task']['user_proxy'], kwargs['task']['tm_taskname'])
            self.logger.warning(msg)

        if siteBlacklist & siteWhitelist:
            msg  = "The following sites appear in both the user site blacklist and whitelist: %s." % (list(siteBlacklist & siteWhitelist))
            msg += " Since the whitelist has precedence, these sites are not considered in the blacklist."
            self.uploadWarning(msg, kwargs['task']['user_proxy'], kwargs['task']['tm_taskname'])
            self.logger.warning(msg)

        ignoreLocality = kwargs['task']['tm_ignore_locality'] == 'T'
        self.logger.debug("Ignore locality: %s" % (ignoreLocality))

        for jobgroup in splitterResult:
            jobs = jobgroup.getJobs()

            blocks = set()
            for job in jobs:
                for inputfile in job['input_files']:
                    blocks.add(inputfile['block'])
            self.logger.debug("Blocks: %s" % list(blocks))

            if not jobs:
                locations = set()
            else:
                locations = set(jobs[0]['input_files'][0]['locations'])
            self.logger.debug("Locations: %s" % (list(locations)))

            ## Discard the blocks that have no locations. This can happen when a block is
            ## still open in PhEDEx. Newly created datasets from T0 (at least) have a large
            ## chance of having some block which is closed in DBS but not in PhEDEx.
            ## Open blocks in PhEDEx can have a location; it is WMCore that is returning no
            ## location.
            ## This is how a block is constructed during data taking:
            ## 1) an open block in T0 is injected in PhEDEx;
            ## 2) files are being added to the block in T0;
            ## 3) data are transferred by PhEDEx if a subscription is present;
            ## 4) once the block is finished:
            ##   a) the block is inserted into DBS as a closed block (before this, DBS has
            ##      no knowledge about the block);
            ##   b) block is closed in PhEDEx.
            if not locations and not ignoreLocality:
                blocksWithNoLocations = blocksWithNoLocations.union(blocks)
                continue

            if ignoreLocality:
                sbj = SiteDB.SiteDBJSON({"key": self.config.TaskWorker.cmskey,
                                         "cert": self.config.TaskWorker.cmscert})
                try:
                    possiblesites = set(sbj.getAllCMSNames())
                except Exception as ex:
                    msg  = "The CRAB3 server backend could not contact SiteDB to get the list of all CMS sites."
                    msg += " This could be a temporary SiteDB glitch."
                    msg += " Please try to submit a new task (resubmit will not work)"
                    msg += " and contact the experts if the error persists."
                    msg += "\nError reason: %s" % (str(ex)) #TODO add the sitedb url so the user can check themselves!
                    raise TaskWorker.WorkerExceptions.TaskWorkerException(msg)
            else:
                possiblesites = locations
            ## At this point 'possiblesites' should never be empty.
            self.logger.debug("Possible sites: %s" % (list(possiblesites)))

            ## Apply the global site blacklist.
            availablesites = possiblesites - global_blacklist

            ## TODO: The messages below do not clarify that only the part of the
            ## dataset that passed the lumi-mask/run-range selection matters here.

            ## Abort the submission of the task if (part of?) the dataset is available only
            ## on sites that are blacklisted by the CRAB server.
            ## Or should we submit at least the jobs on the part of the dataset that
            ## survives the blacklisting? Comment S.Belforte Sep,2015: So far DDM policy
            ## is to replicate entire datasets, not scatter them around. Once we will have
            ## very large datasets that can happen, but it is not the case now.
            if not availablesites:
                msg  = "The CRAB server backend refuses to send jobs to the Grid scheduler."
                msg += " No site available for submission of task %s" % (kwargs['task']['tm_taskname'])
                msg += "\n\t\t\t\tThe sites available for submission of task %s are blacklisted by the CRAB3 server." % (kwargs['task']['tm_taskname'])
                msg += "\n\t\t\t\tThis is the list of in principle available sites: %s" % (list(possiblesites))
                msg += "\n\t\t\t\tThis is the list of sites that are blacklisted by the CRAB3 server: %s" % (list(global_blacklist))
                raise TaskWorker.WorkerExceptions.NoAvailableSite(msg)

            ## Abort the submission of the task if (part of?) the dataset is available only
            ## on sites that are removed after applying the user site blacklist/whitelist.
            ## Or should we submit at least the jobs on the part of the dataset that
            ## survives the blacklisting/whitelisting? (See S.Belforte comment above.)
            # NOTE: User can still shoot themselves in the foot with the resubmit blacklist
            # However, this is the last chance we have to warn the users about an impossible task at submit time.
            available = set(availablesites)
            if siteWhitelist:
                available &= siteWhitelist
                if not available:
                    msg  = "The CRAB server backend refuses to send jobs to the Grid scheduler."
                    msg += " You put %s as site whitelist," % (list(siteWhitelist))
                    msg += " but the input dataset '%s' can only be accessed at these sites: %s." % (kwargs['task']['tm_input_dataset'], list(availablesites))
                    msg += " Please check your site whitelist."
                    raise TaskWorker.WorkerExceptions.NoAvailableSite(msg)
            available -= (siteBlacklist - siteWhitelist)
            if not available:
                msg  = "The CRAB server backend refuses to send jobs to the Grid scheduler."
                msg += " You put %s as site blacklist," % (list(siteBlacklist - siteWhitelist))
                msg += " when the input dataset '%s' can actually only be accessed at these sites: %s." % (kwargs['task']['tm_input_dataset'], list(availablesites))
                msg += " Please check in DAS the locations of the input dataset."
                msg += " Hint: the ignoreLocality option might help."
                raise TaskWorker.WorkerExceptions.NoAvailableSite(msg)

            availablesites = [str(i) for i in availablesites]
            datasites = jobs[0]['input_files'][0]['locations']
            self.logger.info("Resulting available sites: %s" % (list(availablesites)))

            if siteWhitelist or siteBlacklist:
                msg  = "The site whitelist and blacklist will be applied by the pre-job."
                msg += " This is expected to result in DESIRED_SITES = %s" % (list(available))
                self.logger.debug(msg)

            jobgroupDagSpecs, startjobid = self.makeDagSpecs(kwargs['task'], sitead, siteinfo, jobgroup, list(blocks)[0], availablesites, datasites, outfiles, startjobid)
            dagSpecs += jobgroupDagSpecs

        if not dagSpecs:
            msg = "No jobs created for task %s." % (kwargs['task']['tm_taskname'])
            if blocksWithNoLocations:
                msg  = "The CRAB server backend refuses to send jobs to the Grid scheduler."
                msg += " No locations found for dataset '%s'" % (kwargs['task']['tm_input_dataset'])
                msg += " (or at least for the part of the dataset that passed the lumi-mask and/or run-range selection)."
            raise TaskWorker.WorkerExceptions.NoAvailableSite(msg)
        if blocksWithNoLocations:
            msg  = "The following blocks from dataset '%s' were skipped," % (kwargs['task']['tm_input_dataset'])
            msg += " because they have no locations: %s." % (sorted(list(blocksWithNoLocations)))
            self.uploadWarning(msg, kwargs['task']['user_proxy'], kwargs['task']['tm_taskname'])
            self.logger.warning(msg)

        ## Write down the DAG as needed by DAGMan.
        dag = DAG_HEADER % {'resthost': kwargs['task']['resthost'], 'resturiwfdb': kwargs['task']['resturinoapi'] + '/workflowdb'}
        for dagSpec in dagSpecs:
            dag += DAG_FRAGMENT % dagSpec

        ## Create a tarball with all the job lumi files.
        run_and_lumis_tar = tarfile.open("run_and_lumis.tar.gz", "w:gz")
        ## Also creating a tarball with the dataset input files.
        ## Each .txt file in the tarball contains a list of dataset files to be used for the job.
        input_files_tar = tarfile.open("input_files.tar.gz", "w:gz")
        for dagSpec in dagSpecs:
            job_lumis_file = 'job_lumis_'+ str(dagSpec['count']) +'.json'
            job_input_file_list = 'job_input_file_list_' + str(dagSpec['count']) + '.txt'
            with open(job_lumis_file, "w") as fd:
                fd.write(str(dagSpec['runAndLumiMask']))
            with open(job_input_file_list, "w") as fd:
                fd.write(str(dagSpec['inputFiles']))
            run_and_lumis_tar.add(job_lumis_file)
            input_files_tar.add(job_input_file_list)
            os.remove(job_lumis_file)
            os.remove(job_input_file_list)
        run_and_lumis_tar.close()
        input_files_tar.close()

        ## Save the DAG into a file.
        with open("RunJobs.dag", "w") as fd:
            fd.write(dag)

        with open("site.ad", "w") as fd:
            fd.write(str(sitead))

        with open("site.ad.json", "w") as fd:
            json.dump(siteinfo, fd)

        task_name = kwargs['task'].get('CRAB_ReqName', kwargs['task'].get('tm_taskname', ''))
        userdn = kwargs['task'].get('CRAB_UserDN', kwargs['task'].get('tm_user_dn', ''))

        info["jobcount"] = len(dagSpecs)
        maxpost = getattr(self.config.TaskWorker, 'maxPost', 20)
        if maxpost == -1:
            maxpost = info['jobcount']
        elif maxpost == 0:
            maxpost = int(max(20, info['jobcount']*.1))
        info['maxpost'] = maxpost

        if info.get('faillimit') == None:
            info['faillimit'] = -1
            #if info['jobcount'] > 200
            #    info['faillimit'] = 100
            #else:
            #    info['faillimit'] = -1
        elif info.get('faillimit') < 0:
            info['faillimit'] = -1

        # Info for ML:
        target_se = ''
        max_len_target_se = 900
        for site in map(str, availablesites):
            if len(target_se) > max_len_target_se:
                target_se += ',Many_More'
                break
            if len(target_se):
                target_se += ','
            target_se += site
        ml_info = info.setdefault('apmon', [])
        for idx in range(1, info['jobcount']+1):
            taskid = kwargs['task']['tm_taskname']
            jinfo = {'broker': os.environ.get('HOSTNAME', ''),
                     'bossId': str(idx),
                     'TargetSE': target_se,
                     'localId': '',
                     'StatusValue': 'pending',
                    }
            insertJobIdSid(jinfo, idx, taskid, 0)
            ml_info.append(jinfo)

        # When running in standalone mode, we want to record the number of jobs in the task
        if ('CRAB_ReqName' in kwargs['task']) and ('CRAB_UserDN' in kwargs['task']):
            const = 'TaskType =?= \"ROOT\" && CRAB_ReqName =?= "%s" && CRAB_UserDN =?= "%s"' % (task_name, userdn)
            cmd = "condor_qedit -const '%s' CRAB_JobCount %d" % (const, len(dagSpecs))
            self.logger.debug("+ %s" % cmd)
            status, output = commands.getstatusoutput(cmd)
            if status:
                self.logger.error(output)
                self.logger.error("Failed to record the number of jobs.")
                return 1

        return info, splitterResult
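
DAG_HEADER and DAG_FRAGMENT are module-level templates that are not reproduced on this page. The stand-in below only shows how the %-style substitution with each dagSpec dictionary assembles the DAGMan file, assuming that every dagSpec carries at least a 'count' field (the same field used above for the per-job lumi and input-file names).

# Simplified stand-in for DAG_FRAGMENT; the real template contains more VARS.
DAG_FRAGMENT_SKETCH = (
    'JOB Job%(count)s Job.submit\n'
    'VARS Job%(count)s count="%(count)s"\n'
)

dag = ''
for dagSpec in [{'count': 1}, {'count': 2}]:   # hypothetical dagSpecs
    dag += DAG_FRAGMENT_SKETCH % dagSpec
print(dag)
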
Example No. 6
    def createSubdag(self, splitterResult, **kwargs):

        startjobid = kwargs.get('startjobid', 0)
        subjob = kwargs.get('subjob', None)
        stage = kwargs.get('stage', 'conventional')
        self.logger.debug('starting createSubdag, kwargs are:')
        self.logger.debug(str(kwargs))
        dagSpecs = []
        subdags = []

        if hasattr(self.config.TaskWorker, 'stageoutPolicy'):
            kwargs['task']['stageoutpolicy'] = ",".join(self.config.TaskWorker.stageoutPolicy)
        else:
            kwargs['task']['stageoutpolicy'] = "local,remote"

        ## In the future this parameter may be set by the user in the CRAB configuration
        ## file and we would take it from the Task DB.
        kwargs['task']['numautomjobretries'] = getattr(self.config.TaskWorker, 'numAutomJobRetries', 2)

        kwargs['task']['max_runtime'] = kwargs['task']['tm_split_args'].get('seconds_per_job', -1)
        if kwargs['task']['tm_split_algo'] == 'Automatic' and stage == 'conventional':
            kwargs['task']['max_runtime'] = getattr(self.config.TaskWorker, 'splittingPilotRuntime', 15 * 60)
            kwargs['task']['completion_jobs'] = getattr(self.config.TaskWorker, 'completionJobs', False)
            outfiles = []
            stage = 'probe'
        if stage == 'process' and not kwargs['task']['completion_jobs']:
            kwargs['task']['max_runtime'] = -1

        if stage == 'probe':
            parent = None
            startjobid = -1
        else:
            parent = startjobid

        info = self.makeJobSubmit(kwargs['task'])

        outfiles = kwargs['task']['tm_outfiles'] + kwargs['task']['tm_tfile_outfiles'] + kwargs['task']['tm_edm_outfiles']

        os.chmod("CMSRunAnalysis.sh", 0o755)

        # This config setting acts as a global black list
        global_blacklist = set(self.getBlacklistedSites())
        self.logger.debug("CRAB site blacklist: %s" % (list(global_blacklist)))

        # This is needed for Site Metrics
        # It should not block any site for Site Metrics and if needed for other activities
        # self.config.TaskWorker.ActivitiesToRunEverywhere = ['hctest', 'hcdev']
        # The other case where the blacklist is ignored is if the user set this explicitly in their configuration
        if self.isGlobalBlacklistIgnored(kwargs) or (hasattr(self.config.TaskWorker, 'ActivitiesToRunEverywhere') and \
                   kwargs['task']['tm_activity'] in self.config.TaskWorker.ActivitiesToRunEverywhere):
            global_blacklist = set()
            self.logger.debug("Ignoring the CRAB site blacklist.")

        sitead = classad.ClassAd()
        siteinfo = {'group_sites': {}, 'group_datasites': {}}

        blocksWithNoLocations = set()
        blocksWithBannedLocations = set()
        allblocks = set()

        siteWhitelist = set(kwargs['task']['tm_site_whitelist'])
        siteBlacklist = set(kwargs['task']['tm_site_blacklist'])
        self.logger.debug("Site whitelist: %s" % (list(siteWhitelist)))
        self.logger.debug("Site blacklist: %s" % (list(siteBlacklist)))

        if siteWhitelist & global_blacklist:
            msg  = "The following sites from the user site whitelist are blacklisted by the CRAB server: %s." % (list(siteWhitelist & global_blacklist))
            msg += " Since the CRAB server blacklist has precedence, these sites are not considered in the user whitelist."
            self.uploadWarning(msg, kwargs['task']['user_proxy'], kwargs['task']['tm_taskname'])
            self.logger.warning(msg)

        if siteBlacklist & siteWhitelist:
            msg  = "The following sites appear in both the user site blacklist and whitelist: %s." % (list(siteBlacklist & siteWhitelist))
            msg += " Since the whitelist has precedence, these sites are not considered in the blacklist."
            self.uploadWarning(msg, kwargs['task']['user_proxy'], kwargs['task']['tm_taskname'])
            self.logger.warning(msg)

        ignoreLocality = kwargs['task']['tm_ignore_locality'] == 'T'
        self.logger.debug("Ignore locality: %s" % (ignoreLocality))

        for jobgroup in splitterResult[0]:
            self.logger.error(dir(jobgroup))
            self.logger.error(type(jobgroup))
            jobs = jobgroup.getJobs()

            jgblocks = set() #job group blocks
            for job in jobs:
                for inputfile in job['input_files']:
                    jgblocks.add(inputfile['block'])
                    allblocks.add(inputfile['block'])
            self.logger.debug("Blocks: %s" % list(jgblocks))

            if not jobs:
                locations = set()
            else:
                locations = set(jobs[0]['input_files'][0]['locations'])
            self.logger.debug("Locations: %s" % (list(locations)))

            ## Discard the jgblocks that have no locations. This can happen when a block is
            ## still open in PhEDEx. Newly created datasets from T0 (at least) have a large
            ## chance of having some block which is closed in DBS but not in PhEDEx.
            ## Open jgblocks in PhEDEx can have a location; it is WMCore that is returning no
            ## location.
            ## This is how a block is constructed during data taking:
            ## 1) an open block in T0 is injected in PhEDEx;
            ## 2) files are being added to the block in T0;
            ## 3) data are transferred by PhEDEx if a subscription is present;
            ## 4) once the block is finished:
            ##   a) the block is inserted into DBS as a closed block (before this, DBS has
            ##      no knowledge about the block);
            ##   b) block is closed in PhEDEx.
            if not locations and not ignoreLocality:
                blocksWithNoLocations = blocksWithNoLocations.union(jgblocks)
                continue

            if ignoreLocality:
                sbj = SiteDB.SiteDBJSON({"key": self.config.TaskWorker.cmskey,
                                         "cert": self.config.TaskWorker.cmscert})
                try:
                    possiblesites = set(sbj.getAllCMSNames())
                except Exception as ex:
                    msg  = "The CRAB3 server backend could not contact SiteDB to get the list of all CMS sites."
                    msg += " This could be a temporary SiteDB glitch."
                    msg += " Please try to submit a new task (resubmit will not work)"
                    msg += " and contact the experts if the error persists."
                    msg += "\nError reason: %s" % (str(ex)) #TODO add the sitedb url so the user can check themselves!
                    raise TaskWorker.WorkerExceptions.TaskWorkerException(msg)
            else:
                possiblesites = locations
            ## At this point 'possiblesites' should never be empty.
            self.logger.debug("Possible sites: %s" % (list(possiblesites)))

            ## Apply the global site blacklist.
            availablesites = possiblesites - global_blacklist

            ## See https://github.com/dmwm/CRABServer/issues/5241
            ## for a discussion about blocksWithBannedLocations
            if not availablesites:
                blocksWithBannedLocations = blocksWithBannedLocations.union(jgblocks)
                continue

            # NOTE: User can still shoot themselves in the foot with the resubmit blacklist
            # However, this is the last chance we have to warn the users about an impossible task at submit time.
            available = set(availablesites)
            if siteWhitelist:
                available &= siteWhitelist
                if not available:
                    blocksWithBannedLocations = blocksWithBannedLocations.union(jgblocks)
            available -= (siteBlacklist - siteWhitelist)
            if not available:
                blocksWithBannedLocations = blocksWithBannedLocations.union(jgblocks)
                continue

            availablesites = [str(i) for i in availablesites]
            datasites = jobs[0]['input_files'][0]['locations']
            self.logger.info("Resulting available sites: %s" % (list(availablesites)))

            if siteWhitelist or siteBlacklist:
                msg  = "The site whitelist and blacklist will be applied by the pre-job."
                msg += " This is expected to result in DESIRED_SITES = %s" % (list(available))
                self.logger.debug(msg)

            jobgroupDagSpecs, startjobid = self.makeDagSpecs(kwargs['task'], sitead, siteinfo, jobgroup, list(jgblocks)[0], availablesites, datasites, outfiles, startjobid, subjob=subjob, stage=stage)
            dagSpecs += jobgroupDagSpecs

        def getBlacklistMsg():
            # Initialize tmp so the function also works when the global blacklist is empty.
            tmp = ""
            if len(global_blacklist) != 0:
                tmp += " Global CRAB3 blacklist is %s.\n" % global_blacklist
            if len(siteBlacklist) != 0:
                tmp += " User blacklist is %s.\n" % siteBlacklist
            if len(siteWhitelist) != 0:
                tmp += " User whitelist is %s.\n" % siteWhitelist
            return tmp

        if not dagSpecs:
            msg = "No jobs created for task %s." % (kwargs['task']['tm_taskname'])
            if blocksWithNoLocations or blocksWithBannedLocations:
                msg  = "The CRAB server backend refuses to send jobs to the Grid scheduler. "
                msg += "No locations found for dataset '%s'. " % (kwargs['task']['tm_input_dataset'])
                msg += "(or at least for the part of the dataset that passed the lumi-mask and/or run-range selection).\n"
            if blocksWithBannedLocations:
                msg += " Found %s (out of %s) blocks present only at blacklisted sites." % (len(blocksWithBannedLocations), len(allblocks))
                msg += getBlacklistMsg()
            raise TaskWorker.WorkerExceptions.NoAvailableSite(msg)
        msg = "Some blocks from dataset '%s' were skipped " % (kwargs['task']['tm_input_dataset'])
        if blocksWithNoLocations:
            msgBlocklist = sorted(list(blocksWithNoLocations))[:10] + ['...']
            msg += " because they have no locations.\n List is (first 10 elements only): %s.\n" % msgBlocklist
        if blocksWithBannedLocations:
            msg += " because they are only present at blacklisted sites.\n List is: %s.\n" % (sorted(list(blocksWithBannedLocations)))
            msg += getBlacklistMsg()
        if blocksWithNoLocations or blocksWithBannedLocations:
            msg += (" Dataset processing will be incomplete because %s (out of %s) blocks are only present at blacklisted site(s)" %
                (len(blocksWithNoLocations)+len(blocksWithBannedLocations), len(allblocks)))
            self.uploadWarning(msg, kwargs['task']['user_proxy'], kwargs['task']['tm_taskname'])
            self.logger.warning(msg)

        ## Write down the DAG as needed by DAGMan.
        dag = DAG_HEADER.format(
                nodestate='' if not parent else '.{0}'.format(parent),
                resthost=kwargs['task']['resthost'],
                resturiwfdb=kwargs['task']['resturinoapi'] + '/workflowdb')
        if stage == 'probe':
            # We want only one probe job
            dagSpecs = dagSpecs[:1]
        for dagSpec in dagSpecs:
            dag += DAG_FRAGMENT.format(**dagSpec)
            if stage == 'probe' or (stage == 'process' and kwargs['task']['completion_jobs']):
                dag += SUBDAG_FRAGMENT.format(**dagSpec)
                subdag = "RunJobs{0}.subdag".format(dagSpec['count'])
                with open(subdag, "w") as fd:
                    fd.write("")
                subdags.append(subdag)

        ## Create a tarball with all the job lumi files.
        with getLock('splitting_data'):
            self.logger.debug("Acquired lock on run and lumi tarball")

            try:
                tempDir = tempfile.mkdtemp()
                tempDir2 = tempfile.mkdtemp()

                try:
                    tfd = tarfile.open('run_and_lumis.tar.gz', 'r:gz')
                    tfd.extractall(tempDir)
                    tfd.close()
                except (tarfile.ReadError, IOError):
                    self.logger.debug("First iteration: creating run and lumi from scratch")
                try:
                    tfd2 = tarfile.open('input_files.tar.gz', 'r:gz')
                    tfd2.extractall(tempDir2)
                    tfd2.close()
                except (tarfile.ReadError, IOError):
                    self.logger.debug("First iteration: creating inputfiles from scratch")
                tfd = tarfile.open('run_and_lumis.tar.gz', 'w:gz')
                tfd2 = tarfile.open('input_files.tar.gz', 'w:gz')
                for dagSpec in dagSpecs:
                    job_lumis_file = os.path.join(tempDir, 'job_lumis_'+ str(dagSpec['count']) +'.json')
                    with open(job_lumis_file, "w") as fd:
                        fd.write(str(dagSpec['runAndLumiMask']))
                    ## Also creating a tarball with the dataset input files.
                    ## Each .txt file in the tarball contains a list of dataset files to be used for the job.
                    job_input_file_list = os.path.join(tempDir2, 'job_input_file_list_'+ str(dagSpec['count']) +'.txt')
                    with open(job_input_file_list, "w") as fd2:
                        fd2.write(str(dagSpec['inputFiles']))
            finally:
                tfd.add(tempDir, arcname='')
                tfd.close()
                shutil.rmtree(tempDir)
                tfd2.add(tempDir2, arcname='')
                tfd2.close()
                shutil.rmtree(tempDir2)

        if stage in ('probe', 'conventional'):
            name = "RunJobs.dag"
            ## Cache data discovery
            with open("datadiscovery.pkl", "wb") as fd:
                pickle.dump(splitterResult[1], fd)

            ## Cache task information
            with open("taskinformation.pkl", "wb") as fd:
                pickle.dump(kwargs['task'], fd)
        else:
            name = "RunJobs{0}.subdag".format(parent)

        if stage != 'tail':
            ## Cache site information
            with open("site.ad", "w") as fd:
                fd.write(str(sitead))

            with open("site.ad.json", "w") as fd:
                json.dump(siteinfo, fd)

        ## Save the DAG into a file.
        with open(name, "w") as fd:
            fd.write(dag)

        task_name = kwargs['task'].get('CRAB_ReqName', kwargs['task'].get('tm_taskname', ''))
        userdn = kwargs['task'].get('CRAB_UserDN', kwargs['task'].get('tm_user_dn', ''))

        info["jobcount"] = len(dagSpecs)
        maxpost = getattr(self.config.TaskWorker, 'maxPost', 20)
        if maxpost == -1:
            maxpost = info['jobcount']
        elif maxpost == 0:
            maxpost = int(max(20, info['jobcount']*.1))
        info['maxpost'] = maxpost

        # Normalize the fail limit: unset or negative values both mean "no limit" (-1).
        if info.get('faillimit') is None:
            info['faillimit'] = -1
            #if info['jobcount'] > 200:
            #    info['faillimit'] = 100
            #else:
            #    info['faillimit'] = -1
        elif info.get('faillimit') < 0:
            info['faillimit'] = -1

        # Info for ML:
        target_se = ''
        max_len_target_se = 900
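        # Build a comma-separated list of candidate sites for the Dashboard; once the
        # string grows beyond ~900 characters it is truncated and a 'Many_More' marker
        # is appended.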
        for site in map(str, availablesites):
            if len(target_se) > max_len_target_se:
                target_se += ',Many_More'
                break
            if len(target_se):
                target_se += ','
            target_se += site
        ml_info = info.setdefault('apmon', [])
        shift = 0 if stage == 'probe' else 1
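        # Dashboard job ids start at 0 for the probe stage and at 1 otherwise.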
        for idx in range(shift, info['jobcount']+shift):
            taskid = kwargs['task']['tm_taskname']
            jinfo = {'broker': os.environ.get('HOSTNAME', ''),
                     'bossId': str(idx),
                     'TargetSE': target_se,
                     'localId': '',
                     'StatusValue': 'pending',
                    }
            insertJobIdSid(jinfo, idx, taskid, 0)
            ml_info.append(jinfo)

        # When running in standalone mode, we want to record the number of jobs in the task
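        # The condor_qedit below sets CRAB_JobCount on the task's DAGMan (ROOT) job in the
        # schedd queue, presumably so that monitoring tools can read the total number of jobs.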
        if ('CRAB_ReqName' in kwargs['task']) and ('CRAB_UserDN' in kwargs['task']):
            const = 'TaskType =?= \"ROOT\" && CRAB_ReqName =?= "%s" && CRAB_UserDN =?= "%s"' % (task_name, userdn)
            cmd = "condor_qedit -const '%s' CRAB_JobCount %d" % (const, len(dagSpecs))
            self.logger.debug("+ %s" % cmd)
            status, output = commands.getstatusoutput(cmd)
            if status:
                self.logger.error(output)
                self.logger.error("Failed to record the number of jobs.")
                return 1

        return info, splitterResult, subdags
Exemplo n.º 9
0
    def killTransfers(self, apmon):
        self.logger.info("About to kill transfers from workflow %s." % self.workflow)
        asourl = self.task.get('tm_asourl', None)
        #Let's default asodb to asynctransfer; for old tasks this is empty!
        #Probably tm_asodb is always there and the .get() is not necessary, but let's not assume that.
        asodb = self.task.get('tm_asodb', 'asynctransfer') or 'asynctransfer'

        if not asourl:
            self.logger.info("ASO URL not set; will not kill transfers")
            return False

        try:
            hostname = socket.getfqdn()
        except:
            hostname = ''

        server = CMSCouch.CouchServer(dburl=asourl, ckey=self.proxy, cert=self.proxy)
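        # The user proxy doubles as both key and certificate for CouchDB authentication.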
        try:
            db = server.connectDatabase(asodb)
        except Exception as ex:
            msg = "Error while connecting to the asynctransfer CouchDB"
            self.logger.exception(msg)
            raise TaskWorkerException(msg)
        self.queryKill = {'reduce':False, 'key':self.workflow, 'include_docs': True}
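        # The 'forKill' view, keyed by the workflow name, returns the transfer documents
        # that are still eligible for killing.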
        try:
            filesKill = db.loadView('AsyncTransfer', 'forKill', self.queryKill)['rows']
        except Exception as ex:
            msg = "Error while querying the asynctransfer CouchDB 'forKill' view"
            self.logger.exception(msg)
            raise TaskWorkerException(msg)
        if not filesKill:
            self.logger.warning('Found no transfer files to kill')
        for idt in filesKill:
            now = str(datetime.datetime.now())
            id = idt['value']
            data = {
                'end_time': now,
                'state': 'killed',
                'last_update': time.time(),
                'retry': now,
               }
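            # The document is updated through the 'updateJobs' update handler, with the new
            # state passed as URL-encoded query parameters.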
            updateUri = "/%s/_design/AsyncTransfer/_update/updateJobs/%s?%s" % (db.name, id, urllib.urlencode(data))
            jobid = idt.get('jobid')
            jobretry = idt.get('job_retry_count')
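            # Unless the whole task is being killed, skip transfers that do not belong to
            # one of the jobs selected for killing.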
            if not self.task['kill_all'] and jobid not in self.task['kill_ids']:
                continue
            self.logger.info("Killing transfer %s (job ID %s; job retry %s)." % (id, str(jobid), str(jobretry)))
            jobid = str(jobid)
            jobretry = str(jobretry)
            if jobid and jobretry is not None:
                jinfo = {'broker': hostname,
                         'bossId': jobid,
                         'StatusValue': 'killed',
                        }
                insertJobIdSid(jinfo, jobid, self.workflow, jobretry)
                self.logger.info("Sending kill info to Dashboard: %s" % str(jinfo))
                apmon.sendToML(jinfo)
            try:
                db.makeRequest(uri = updateUri, type = "PUT", decode = False)
            except Exception as ex:
                msg = "Error updating document in CouchDB: "
                msg += str(ex)
                msg += str(traceback.format_exc())
                raise TaskWorkerException(msg)
        return True
Exemplo n.º 10
0
    def killTransfers(self, apmon):
        self.logger.info("About to kill transfers from workflow %s." % self.workflow)
        ASOURL = self.task.get('tm_asourl', None)
        if not ASOURL:
            self.logger.info("ASO URL not set; will not kill transfers")
            return False

        try:
            hostname = socket.getfqdn()
        except:
            hostname = ''

        server = CMSCouch.CouchServer(dburl=ASOURL, ckey=self.proxy, cert=self.proxy)
        try:
            db = server.connectDatabase('asynctransfer')
        except Exception as ex:
            msg = "Error while connecting to the asynctransfer CouchDB"
            self.logger.exception(msg)
            raise TaskWorkerException(msg)
        self.queryKill = {'reduce':False, 'key':self.workflow, 'include_docs': True}
        try:
            filesKill = db.loadView('AsyncTransfer', 'forKill', self.queryKill)['rows']
        except Exception as ex:
            msg = "Error while querying the asynctransfer CouchDB 'forKill' view"
            self.logger.exception(msg)
            raise TaskWorkerException(msg)
        if not filesKill:
            self.logger.warning('Found no transfer files to kill')
        for idt in filesKill:
            now = str(datetime.datetime.now())
            id = idt['value']
            data = {
                'end_time': now,
                'state': 'killed',
                'last_update': time.time(),
                'retry': now,
               }
            updateUri = "/%s/_design/AsyncTransfer/_update/updateJobs/%s?%s" % (db.name, id, urllib.urlencode(data))
            jobid = idt.get('jobid')
            jobretry = idt.get('job_retry_count')
            if not self.task['kill_all'] and jobid not in self.task['kill_ids']:
                continue
            self.logger.info("Killing transfer %s (job ID %s; job retry %s)." % (id, str(jobid), str(jobretry)))
            jobid = str(jobid)
            jobretry = str(jobretry)
            if jobid and jobretry is not None:
                jinfo = {'broker': hostname,
                         'bossId': jobid,
                         'StatusValue': 'killed',
                        }
                insertJobIdSid(jinfo, jobid, self.workflow, jobretry)
                self.logger.info("Sending kill info to Dashboard: %s" % str(jinfo))
                apmon.sendToML(jinfo)
            try:
                db.makeRequest(uri = updateUri, type = "PUT", decode = False)
            except Exception as ex:
                msg = "Error updating document in CouchDB: "
                msg += str(ex)
                msg += str(traceback.format_exc())
                raise TaskWorkerException(msg)
        return True