Example #1
    def recordInCouch(self, jobs, newstate, oldstate, updatesummary = False):
        """
        _recordInCouch_

        Record relevant job information in couch. If the job does not yet exist
        in couch, it will be saved as a separate document.  If the job has a FWJR
        attached, that will be saved as a separate document.
        """
        if not self._connectDatabases():
            logging.error('Databases not connected properly')
            return

        timestamp = int(time.time())
        couchRecordsToUpdate = []

        for job in jobs:
            couchDocID = job.get("couch_record", None)

            if newstate == "new":
                oldstate = "none"

            if job.get("site_cms_name", None):
                if newstate == "executing":
                    jobLocation = job["site_cms_name"]
                else:
                    jobLocation = "Agent"
            else:
                jobLocation = "Agent"

            if couchDocID is None:
                jobDocument = {}
                jobDocument["_id"] = str(job["id"])
                job["couch_record"] = jobDocument["_id"]
                jobDocument["jobid"] = job["id"]
                jobDocument["workflow"] = job["workflow"]
                jobDocument["task"] = job["task"]
                jobDocument["owner"] = job["owner"]

                jobDocument["inputfiles"] = []
                for inputFile in job["input_files"]:
                    docInputFile = inputFile.json()

                    docInputFile["parents"] = []
                    for parent in inputFile["parents"]:
                        docInputFile["parents"].append({"lfn": parent["lfn"]})

                    jobDocument["inputfiles"].append(docInputFile)

                jobDocument["states"] = {"0": {"oldstate": oldstate,
                                               "newstate": newstate,
                                               "location": jobLocation,
                                               "timestamp": timestamp}}

                jobDocument["jobgroup"] = job["jobgroup"]
                jobDocument["mask"] = {"FirstEvent": job["mask"]["FirstEvent"],
                                       "LastEvent": job["mask"]["LastEvent"],
                                       "FirstLumi": job["mask"]["FirstLumi"],
                                       "LastLumi": job["mask"]["LastLumi"],
                                       "FirstRun": job["mask"]["FirstRun"],
                                       "LastRun": job["mask"]["LastRun"]}

                if job['mask']['runAndLumis'] != {}:
                    # JSON object keys must be strings, so stringify the run
                    # numbers before saving the runAndLumis mask
                    jobDocument['mask']['runAndLumis'] = {}
                    for key in job['mask']['runAndLumis']:
                        jobDocument['mask']['runAndLumis'][str(key)] = job['mask']['runAndLumis'][key]

                jobDocument["name"] = job["name"]
                jobDocument["type"] = "job"
                jobDocument["user"] = job.get("user", None)
                jobDocument["group"] = job.get("group", None)
                jobDocument["taskType"] = job.get("taskType", "Unknown")
                jobDocument["jobType"] = job.get("jobType", "Unknown")

                couchRecordsToUpdate.append({"jobid": job["id"],
                                             "couchid": jobDocument["_id"]})
                self.jobsdatabase.queue(jobDocument, callback = discardConflictingDocument)
            else:
                # We send a PUT request to the stateTransition update handler.
                # Couch expects the parameters to be passed as arguments in the
                # URI, while the Requests class will only encode arguments
                # this way for GET requests.  Changing the Requests class to
                # encode PUT arguments as couch expects broke a bunch of code so
                # we'll just do our own encoding here.
                updateUri = "/" + self.jobsdatabase.name + "/_design/JobDump/_update/stateTransition/" + couchDocID
                updateUri += "?oldstate=%s&newstate=%s&location=%s&timestamp=%s" % (oldstate,
                                                                                    newstate,
                                                                                    jobLocation,
                                                                                    timestamp)
                self.jobsdatabase.makeRequest(uri = updateUri, type = "PUT", decode = False)
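                # For illustration only (hypothetical values): with couchDocID "12345",
                # oldstate "created", newstate "executing", location "T2_CH_CERN" and
                # timestamp 1700000000, the PUT above goes to something like:
                #   /<jobs-db>/_design/JobDump/_update/stateTransition/12345?oldstate=created&newstate=executing&location=T2_CH_CERN&timestamp=1700000000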

            # Update the status of the summary doc only when explicitly requested;
            # the doc is already in couch.
            if updatesummary:
                jobSummaryId = job["name"]
                updateUri = "/" + self.jsumdatabase.name + "/_design/WMStats/_update/jobSummaryState/" + jobSummaryId
                # map retrydone state to jobfailed state for monitoring
                if newstate == "retrydone":
                    monitorState = "jobfailed"
                else:
                    monitorState = newstate
                updateUri += "?newstate=%s&timestamp=%s" % (monitorState, timestamp)
                self.jsumdatabase.makeRequest(uri = updateUri, type = "PUT", decode = False)
                logging.debug("Updated job summary status for job %s" % jobSummaryId)
                
                updateUri = "/" + self.jsumdatabase.name + "/_design/WMStats/_update/jobStateTransition/" + jobSummaryId
                updateUri += "?oldstate=%s&newstate=%s&location=%s&timestamp=%s" % (oldstate,
                                                                                    monitorState,
                                                                                    job["location"],
                                                                                    timestamp)
                self.jsumdatabase.makeRequest(uri = updateUri, type = "PUT", decode = False)
                logging.debug("Updated job summary state history for job %s" % jobSummaryId)

            if job.get("fwjr", None):

                # If there are too many input files, strip them out of the FWJR;
                # they should already be in the database. This is not critical.
                try:
                    if len(job['fwjr'].getAllInputFiles()) > self.maxUploadedInputFiles:
                        job['fwjr'].stripInputFiles()
                except Exception as ex:
                    logging.error("Error while trying to strip input files from FWJR. Ignoring: %s", str(ex))

                # complete fwjr document
                job["fwjr"].setTaskName(job["task"])
                fwjrDocument = {"_id": "%s-%s" % (job["id"], job["retry_count"]),
                                "jobid": job["id"],
                                "retrycount": job["retry_count"],
                                "fwjr": job["fwjr"].__to_json__(None),
                                "type": "fwjr"}
                self.fwjrdatabase.queue(fwjrDocument, timestamp = True, callback = discardConflictingDocument)
                updateSummaryDB(self.statsumdatabase, job)

                # TODO: could add a config switch to turn this on and off
                # if self.config.JobStateMachine.propagateSuccessJobs or (job["retry_count"] > 0) or (newstate != 'success'):
                if (job["retry_count"] > 0) or (newstate != 'success'):
                    jobSummaryId = job["name"]
                    # build a summary of the FWJR
                    logging.debug("Pushing job summary for job %s", jobSummaryId)
                    errmsgs = {}
                    inputs = []
                    if "steps" in fwjrDocument["fwjr"]:
                        for step in fwjrDocument["fwjr"]["steps"]:
                            if "errors" in fwjrDocument["fwjr"]["steps"][step]:
                                errmsgs[step] = [error for error in fwjrDocument["fwjr"]["steps"][step]["errors"]]
                            if "input" in fwjrDocument["fwjr"]["steps"][step] and "source" in fwjrDocument["fwjr"]["steps"][step]["input"]:
                                inputs.extend( [source["runs"] for source in fwjrDocument["fwjr"]['steps'][step]["input"]["source"] if "runs" in source] )
    
                    outputs = []
                    outputDataset = None
                    for singlestep in job["fwjr"].listSteps():
                        for singlefile in job["fwjr"].getAllFilesFromStep(step=singlestep):
                            if singlefile:
                                # guard against an empty 'locations' set: pop() on
                                # an empty set would raise KeyError
                                if len(singlefile.get('locations', set())) > 1:
                                    locations = list(singlefile.get('locations'))
                                elif singlefile.get('locations'):
                                    locations = singlefile['locations'].pop()
                                else:
                                    locations = set()
                                outputs.append({'type': 'output' if CMSSTEP.match(singlestep) else singlefile.get('module_label', None),
                                                'lfn': singlefile.get('lfn', None),
                                                'location': locations,
                                                'checksums': singlefile.get('checksums', {}),
                                                'size': singlefile.get('size', None)})
                                # all files should share a single output dataset
                                outputDataset = singlefile.get('dataset', None) if not outputDataset else outputDataset
                    inputFiles = []
                    for inputFileStruct in job["fwjr"].getAllInputFiles():
                        # check if inputFileSummary needs to be extended
                        inputFileSummary = {}
                        inputFileSummary["lfn"] = inputFileStruct["lfn"]
                        inputFileSummary["input_type"] = inputFileStruct["input_type"]
                        inputFiles.append(inputFileSummary)
                    
                    # Don't record the intermediate jobfailed status in the job summary;
                    # change it to jobcooloff, which will be overwritten by the error handler anyway.
                    if (job["retry_count"] > 0) and (newstate == 'jobfailed'):
                        summarystate = 'jobcooloff'
                    else:
                        summarystate = newstate
                           
                    jobSummary = {"_id": jobSummaryId,
                                  "wmbsid": job["id"],
                                  "type": "jobsummary",
                                  "retrycount": job["retry_count"],
                                  "workflow": job["workflow"],
                                  "task": job["task"],
                                  "jobtype": job["jobType"],
                                  "state": summarystate,
                                  "site": job.get("location", None),
                                  "cms_location": job["fwjr"].getSiteName(),
                                  "exitcode": job["fwjr"].getExitCode(),
                                  "errors": errmsgs,
                                  "lumis": inputs,
                                  "outputdataset": outputDataset,
                                  "inputfiles": inputFiles,
                                  "acdc_url": "%s/%s" % (sanitizeURL(self.config.ACDC.couchurl)['url'], self.config.ACDC.database),
                                  "agent_name": self.config.Agent.hostName,
                                  "output": outputs }
                    if couchDocID is not None:
                        try:
                            currentJobDoc = self.jsumdatabase.document(id = jobSummaryId)
                            jobSummary['_rev'] = currentJobDoc['_rev']
                            jobSummary['state_history'] = currentJobDoc.get('state_history', [])
                            # record final status transition
                            if newstate == 'success':
                                finalStateDict = {'oldstate': oldstate,
                                                  'newstate': newstate,
                                                  'location': job["location"],
                                                  'timestamp': timestamp}
                                jobSummary['state_history'].append(finalStateDict)
                                
                            noEmptyList = ["inputfiles", "lumis"]
                            for prop in noEmptyList:
                                jobSummary[prop] = jobSummary[prop] if jobSummary[prop] else currentJobDoc.get(prop, [])
                        except CouchNotFoundError:
                            pass
                    self.jsumdatabase.queue(jobSummary, timestamp = True)

        if couchRecordsToUpdate:
            self.setCouchDAO.execute(bulkList = couchRecordsToUpdate,
                                     conn = self.getDBConn(),
                                     transaction = self.existingTransaction())

        self.jobsdatabase.commit(callback = discardConflictingDocument)
        self.fwjrdatabase.commit(callback = discardConflictingDocument)
        self.jsumdatabase.commit()
        return
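
A minimal usage sketch of the method above, under stated assumptions: `changer` stands for whatever object defines recordInCouch (in WMCore this is the JobStateMachine ChangeState class), already connected to its couch databases, and each job dict carries the WMBS fields the method reads ("id", "name", "workflow", "task", "owner", "mask", "input_files", ...):

# First transition: no couch_record exists yet, so a new job document is queued.
changer.recordInCouch(jobs, newstate="created", oldstate="new")

# Later transitions reuse the couch_record set above; updatesummary=True also
# refreshes the WMStats job summary documents.
changer.recordInCouch(jobs, newstate="executing", oldstate="created",
                      updatesummary=True)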
Example #2
    def recordInCouch(self, jobs, newstate, oldstate, updatesummary=False):
        """
        _recordInCouch_

        Record relevant job information in couch. If the job does not yet exist
        in couch, it will be saved as a separate document.  If the job has a FWJR
        attached, that will be saved as a separate document.
        """
        if not self._connectDatabases():
            logging.error('Databases not connected properly')
            return

        timestamp = int(time.time())
        couchRecordsToUpdate = []

        for job in jobs:
            couchDocID = job.get("couch_record", None)

            if newstate == "new":
                oldstate = "none"

            if job.get("site_cms_name", None):
                if newstate == "executing":
                    jobLocation = job["site_cms_name"]
                else:
                    jobLocation = "Agent"
            else:
                jobLocation = "Agent"

            if couchDocID is None:
                jobDocument = {}
                jobDocument["_id"] = str(job["id"])
                job["couch_record"] = jobDocument["_id"]
                jobDocument["jobid"] = job["id"]
                jobDocument["workflow"] = job["workflow"]
                jobDocument["task"] = job["task"]
                jobDocument["owner"] = job["owner"]

                jobDocument["inputfiles"] = []
                for inputFile in job["input_files"]:
                    docInputFile = inputFile.json()

                    docInputFile["parents"] = []
                    for parent in inputFile["parents"]:
                        docInputFile["parents"].append({"lfn": parent["lfn"]})

                    jobDocument["inputfiles"].append(docInputFile)

                jobDocument["states"] = {"0": {"oldstate": oldstate,
                                               "newstate": newstate,
                                               "location": jobLocation,
                                               "timestamp": timestamp}}

                jobDocument["jobgroup"] = job["jobgroup"]
                jobDocument["mask"] = {"FirstEvent": job["mask"]["FirstEvent"],
                                       "LastEvent": job["mask"]["LastEvent"],
                                       "FirstLumi": job["mask"]["FirstLumi"],
                                       "LastLumi": job["mask"]["LastLumi"],
                                       "FirstRun": job["mask"]["FirstRun"],
                                       "LastRun": job["mask"]["LastRun"]}

                if job['mask']['runAndLumis'] != {}:
                    # JSON object keys must be strings, so stringify the run
                    # numbers before saving the runAndLumis mask
                    jobDocument['mask']['runAndLumis'] = {}
                    for key in job['mask']['runAndLumis']:
                        jobDocument['mask']['runAndLumis'][str(key)] = job['mask']['runAndLumis'][key]

                jobDocument["name"] = job["name"]
                jobDocument["type"] = "job"
                jobDocument["user"] = job.get("user", None)
                jobDocument["group"] = job.get("group", None)
                jobDocument["taskType"] = job.get("taskType", "Unknown")
                jobDocument["jobType"] = job.get("jobType", "Unknown")

                couchRecordsToUpdate.append({"jobid": job["id"],
                                             "couchid": jobDocument["_id"]})
                self.jobsdatabase.queue(jobDocument, callback=discardConflictingDocument)
            else:
                # We send a PUT request to the stateTransition update handler.
                # Couch expects the parameters to be passed as arguments in the
                # URI, while the Requests class will only encode arguments
                # this way for GET requests.  Changing the Requests class to
                # encode PUT arguments as couch expects broke a bunch of code so
                # we'll just do our own encoding here.
                updateUri = "/" + self.jobsdatabase.name + "/_design/JobDump/_update/stateTransition/" + couchDocID
                updateUri += "?oldstate=%s&newstate=%s&location=%s&timestamp=%s" % (oldstate,
                                                                                    newstate,
                                                                                    jobLocation,
                                                                                    timestamp)
                self.jobsdatabase.makeRequest(uri=updateUri, type="PUT", decode=False)

            # Update the status of the summary doc only when explicitly requested;
            # the doc is already in couch.
            if updatesummary:
                jobSummaryId = job["name"]
                updateUri = "/" + self.jsumdatabase.name + "/_design/WMStatsAgent/_update/jobSummaryState/" + jobSummaryId
                # map retrydone state to jobfailed state for monitoring
                if newstate == "retrydone":
                    monitorState = "jobfailed"
                else:
                    monitorState = newstate
                updateUri += "?newstate=%s&timestamp=%s" % (monitorState, timestamp)
                self.jsumdatabase.makeRequest(uri=updateUri, type="PUT", decode=False)
                logging.debug("Updated job summary status for job %s", jobSummaryId)

                updateUri = "/" + self.jsumdatabase.name + "/_design/WMStatsAgent/_update/jobStateTransition/" + jobSummaryId
                updateUri += "?oldstate=%s&newstate=%s&location=%s&timestamp=%s" % (oldstate,
                                                                                    monitorState,
                                                                                    job["location"],
                                                                                    timestamp)
                self.jsumdatabase.makeRequest(uri=updateUri, type="PUT", decode=False)
                logging.debug("Updated job summary state history for job %s", jobSummaryId)

            if job.get("fwjr", None):

                # cache workload data per workflow so each spec file is parsed only once
                cachedByWorkflow = self.workloadCache.setdefault(
                    job['workflow'],
                    getDataFromSpecFile(self.getWorkflowSpecDAO.execute(job['task'])[job['task']]['spec']))
                job['fwjr'].setCampaign(cachedByWorkflow.get('Campaign', ''))
                job['fwjr'].setPrepID(cachedByWorkflow.get(job['task'], ''))
                # If there are too many input files, strip them out of the FWJR;
                # they should already be in the database. This is not critical.
                try:
                    if len(job['fwjr'].getAllInputFiles()) > self.maxUploadedInputFiles:
                        job['fwjr'].stripInputFiles()
                except Exception as ex:
                    logging.error("Error while trying to strip input files from FWJR.  Ignoring. : %s", str(ex))

                if newstate == "retrydone":
                    jobState = "jobfailed"
                else:
                    jobState = newstate

                # There is a race condition between updating the couch record location
                # and job completion; a fast-failing job could miss the location update.
                job["location"] = job["fwjr"].getSiteName() or job.get("location", "Unknown")
                # complete fwjr document
                job["fwjr"].setTaskName(job["task"])
                jsonFWJR = job["fwjr"].__to_json__(None)

                # Don't archive cleanup job report
                if job["jobType"] == "Cleanup":
                    archStatus = "skip"
                else:
                    archStatus = "ready"

                fwjrDocument = {"_id": "%s-%s" % (job["id"], job["retry_count"]),
                                "jobid": job["id"],
                                "jobtype": job["jobType"],
                                "jobstate": jobState,
                                "retrycount": job["retry_count"],
                                "archivestatus": archStatus,
                                "fwjr": jsonFWJR,
                                "type": "fwjr"}
                self.fwjrdatabase.queue(fwjrDocument, timestamp=True, callback=discardConflictingDocument)

                updateSummaryDB(self.statsumdatabase, job)

                # TODO: could add a config switch to turn this on and off
                # if self.config.JobStateMachine.propagateSuccessJobs or (job["retry_count"] > 0) or (newstate != 'success'):
                if (job["retry_count"] > 0) or (newstate != 'success'):
                    jobSummaryId = job["name"]
                    # build a summary of the FWJR
                    logging.debug("Pushing job summary for job %s", jobSummaryId)
                    errmsgs = {}
                    inputs = []
                    if "steps" in fwjrDocument["fwjr"]:
                        for step in fwjrDocument["fwjr"]["steps"]:
                            if "errors" in fwjrDocument["fwjr"]["steps"][step]:
                                errmsgs[step] = [error for error in fwjrDocument["fwjr"]["steps"][step]["errors"]]
                            if "input" in fwjrDocument["fwjr"]["steps"][step] and "source" in \
                                    fwjrDocument["fwjr"]["steps"][step]["input"]:
                                inputs.extend(
                                    [source["runs"] for source in fwjrDocument["fwjr"]['steps'][step]["input"]["source"]
                                     if "runs" in source])

                    outputs = []
                    outputDataset = None
                    for singlestep in job["fwjr"].listSteps():
                        for singlefile in job["fwjr"].getAllFilesFromStep(step=singlestep):
                            if singlefile:
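                                # guard against an empty 'locations' set: pop() on
                                # an empty set would raise KeyError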
                                if len(singlefile.get('locations', set())) > 1:
                                    locations = list(singlefile.get('locations'))
                                elif singlefile.get('locations'):
                                    locations = singlefile['locations'].pop()
                                else:
                                    locations = set()
                                if CMSSTEP.match(singlestep):
                                    outType = 'output'
                                else:
                                    outType = singlefile.get('module_label', None)
                                outputs.append({'type': outType,
                                                'lfn': singlefile.get('lfn', None),
                                                'location': locations,
                                                'checksums': singlefile.get('checksums', {}),
                                                'size': singlefile.get('size', None)})
                                # all files should share a single output dataset
                                outputDataset = singlefile.get('dataset', None) if not outputDataset else outputDataset
                    inputFiles = []
                    for inputFileStruct in job["fwjr"].getAllInputFiles():
                        # check if inputFileSummary needs to be extended
                        inputFileSummary = {}
                        inputFileSummary["lfn"] = inputFileStruct["lfn"]
                        inputFileSummary["input_type"] = inputFileStruct["input_type"]
                        inputFiles.append(inputFileSummary)

                    # Don't record the intermediate jobfailed status in the job summary;
                    # change it to jobcooloff, which will be overwritten by the error handler anyway.
                    if (job["retry_count"] > 0) and (newstate == 'jobfailed'):
                        summarystate = 'jobcooloff'
                    else:
                        summarystate = newstate

                    jobSummary = {"_id": jobSummaryId,
                                  "wmbsid": job["id"],
                                  "type": "jobsummary",
                                  "retrycount": job["retry_count"],
                                  "workflow": job["workflow"],
                                  "task": job["task"],
                                  "jobtype": job["jobType"],
                                  "state": summarystate,
                                  "site": job.get("location", None),
                                  "cms_location": job["fwjr"].getSiteName(),
                                  "exitcode": job["fwjr"].getExitCode(),
                                  "eos_log_url": job["fwjr"].getLogURL(),
                                  "worker_node_info": job["fwjr"].getWorkerNodeInfo(),
                                  "errors": errmsgs,
                                  "lumis": inputs,
                                  "outputdataset": outputDataset,
                                  "inputfiles": inputFiles,
                                  "acdc_url": "%s/%s" % (
                                  sanitizeURL(self.config.ACDC.couchurl)['url'], self.config.ACDC.database),
                                  "agent_name": self.config.Agent.hostName,
                                  "output": outputs}
                    if couchDocID is not None:
                        try:
                            currentJobDoc = self.jsumdatabase.document(id=jobSummaryId)
                            jobSummary['_rev'] = currentJobDoc['_rev']
                            jobSummary['state_history'] = currentJobDoc.get('state_history', [])
                            # record final status transition
                            if newstate == 'success':
                                finalStateDict = {'oldstate': oldstate,
                                                  'newstate': newstate,
                                                  'location': job["location"],
                                                  'timestamp': timestamp}
                                jobSummary['state_history'].append(finalStateDict)

                            noEmptyList = ["inputfiles", "lumis"]
                            for prop in noEmptyList:
                                jobSummary[prop] = jobSummary[prop] if jobSummary[prop] else currentJobDoc.get(prop, [])
                        except CouchNotFoundError:
                            pass
                    self.jsumdatabase.queue(jobSummary, timestamp=True)

        if couchRecordsToUpdate:
            self.setCouchDAO.execute(bulkList=couchRecordsToUpdate,
                                     conn=self.getDBConn(),
                                     transaction=self.existingTransaction())

        self.jobsdatabase.commit(callback=discardConflictingDocument)
        self.fwjrdatabase.commit(callback=discardConflictingDocument)
        self.jsumdatabase.commit()
        return
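
Both versions pass discardConflictingDocument as the commit callback so that CouchDB 409 update conflicts (a document that was already recorded) do not abort the whole bulk commit. A rough sketch of what such a callback can look like; the signature and response field names here are assumptions, not the real WMCore API, which ships alongside this method:

import logging

def discardConflictingDocument(couchDbInstance, data, result):
    # CouchDB bulk responses flag per-document failures with an "error" field;
    # swallow update conflicts (the document already exists) and log the rest.
    # Hypothetical sketch: field names and signature are assumptions.
    for row in result:
        if row.get("error") == "conflict":
            logging.debug("Discarding conflicting document: %s", row.get("id"))
        elif row.get("error"):
            logging.error("Bulk commit failed for %s: %s", row.get("id"), row["error"])
    return result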