params = {} params['pid'] = job.jobId params['line'] = 0 # this is mandatory part of API, has to be present params['type'] = 'FAXrecovery' params['message'] = '"WithFAX":' + str(job.filesWithFAX) +\ ',"WithoutFAX":' + str(job.filesWithoutFAX) +\ ',"bytesWithFAX":' + str(job.bytesWithFAX) +\ ',"bytesWithoutFAX":' + str(job.bytesWithoutFAX) +\ ',"timeToCopy":' + job.timeStageIn toPandaLogger(params) # make the actual update, repeatedly if necessary (for the final update) #ret = makeHTTPUpdate(job.result[0], node, port, url=self.__pshttpurl, path=self.__pilot_initdir) if job.workdir.endswith("/"): job.workdir = job.workdir[:-1] ret = makeHTTPUpdate(job.result[0], node, port, url=self.__pshttpurl, path=os.path.dirname(job.workdir)) if not ret[2]: # data is None for a failed update attempt tolog("makeHTTPUpdate returned: %s" % str(ret)) return 1, None tolog("ret = %s" % str(ret)) data = ret[1] tolog("data = %s" % str(data)) if data.has_key("command"): job.action = data['command'] try: awk = data['StatusCode'] except: tolog("!!WARNING!!1300!! Having problem updating job status, set the awk to 1 for now, and continue...")
def updatePandaServer(self, job, site, workerNode, port, xmlstr=None, spaceReport=False, log=None, ra=0, jr=False, useCoPilot=False, stdout_tail="", additionalMetadata=None):
    """ Update the job status with the jobdispatcher web server.

    State is a tuple of (jobId, ["jobstatus", transExitCode, pilotErrorCode], timestamp)
    log = log extracts
    xmlstr is set in postJobTask for finished jobs (all files). Failed jobs will only send xml for log (created in this function)
    jr = job recovery mode

    Arguments (all project types; Python 2 code, so no annotations):
      job, site, workerNode -- pilot job/site/node state objects
      port                  -- job dispatcher port, forwarded to makeHTTPUpdate()
      xmlstr                -- pre-built metadata xml for finished jobs (see above)
      spaceReport, log      -- forwarded to getNodeStructure()
      ra                    -- recovery attempt number, forwarded to updateJobState()
      jr                    -- True when running in job recovery mode
      useCoPilot            -- CERNVM Co-Pilot flag; only then is the holding state set
      stdout_tail           -- tail of payload stdout, sent only when job.debug == 'true'
      additionalMetadata    -- experiment-specific metadata appended to node['metaData']

    Returns a 2-tuple (ecode, node):
      (0, node)    -- update ok (or fake/Nordugrid update that was skipped on purpose)
      (1, None)    -- makeHTTPUpdate() failed
      (ecode, node) otherwise, where ecode is the server's StatusCode as an int
    """

    # NOTE(review): %d assumes job.getState() returns an integer PandaId as its
    # first element (jobId is repr()'ed elsewhere in this method) -- confirm
    tolog("Updating job status in updatePandaServer(): PandaId=%d, result=%s, time=%s" % (job.getState()))

    # set any holding job to failed for sites that do not use job recovery (e.g. sites with LSF, that immediately
    # removes any work directory after the LSF job finishes which of course makes job recovery impossible)
    if not self.__jobrec:
        if job.result[0] == 'holding' and site.sitename != "CERNVM":
            job.result[0] = 'failed'
            tolog("This site does not support job recovery: HOLDING state reset to FAILED")

    # note: any changed job state above will be lost for fake server updates, does it matter?

    # get the node structure expected by the server
    node = self.getNodeStructure(job, site, workerNode, spaceReport=spaceReport, log=log)

    # skip the server update (e.g. on NG)
    if not self.__updateServer:
        tolog("(fake server update)")
        return 0, node

    # get the xml
    node['xml'] = self.getXML(job, site.sitename, site.workdir, xmlstr=xmlstr, jr=jr)

    # stdout tail in case job.debug == 'true'
    if job.debug.lower() == "true" and stdout_tail != "":
        # protection for potentially large tails
        stdout_tail = stdout_tail[-2048:]
        node['stdout'] = stdout_tail
        tolog("Will send stdout tail:\n%s (length = %d)" % (stdout_tail, len(stdout_tail)))
    else:
        if job.debug.lower() != "true":
            tolog("Stdout tail will not be sent (debug=False)")
        elif stdout_tail == "":
            tolog("Stdout tail will not be sent (no stdout tail)")
        else:
            tolog("Stdout tail will not be sent (debug=%s, stdout_tail=\'%s\')" % (str(job.debug), stdout_tail))

    # PN fake lostheartbeat
    #    if job.result[0] == "finished":
    #        node['state'] = "holding"
    #        node['xml'] = ""

    # read back node['xml'] from jobState file for CERNVM
    # sendXML stays True unless the CERNVM job state file shows no metadata yet
    sendXML = True
    if site.sitename == "CERNVM":
        _node = self.getNodeStructureFromFile(site.workdir, repr(job.jobId))
        if _node:
            if _node.has_key('xml'):
                if _node['xml'] != "":
                    node['xml'] = _node['xml']
                    tolog("Read back metadata xml from job state file (length: %d)" % len(node['xml']))
                else:
                    tolog("No metadata xml present in current job state file (1 - pilot should not send xml at this time)")
                    sendXML = False
            else:
                tolog("No xml key in node structure")
                sendXML = False
        else:
            tolog("No metadata xml present in current job state file (2 - pilot should not send xml at this time)")
            sendXML = False

        # change the state to holding for initial CERNVM job
        # NOTE(review): indentation reconstructed -- this holding-state reset is
        # taken to apply only within the CERNVM branch; confirm against upstream
        if not sendXML and (job.result[0] == "finished" or job.result[0] == "failed"):
            # only set the holding state if the Co-Pilot is used
            if useCoPilot:
                job.result[0] = "holding"
                node['state'] = "holding"

    # update job state file
    _retjs = updateJobState(job, site, node, recoveryAttempt=ra)

    # is it the final update?
    if job.result[0] == 'finished' or job.result[0] == 'failed' or job.result[0] == 'holding':
        final = True
    else:
        final = False

    # send the original xml if it exists (end of production job)
    filenameAthenaXML = "%s/metadata-%s.xml.ATHENA" % (site.workdir, repr(job.jobId))
    athenaXMLProblem = False
    if os.path.exists(filenameAthenaXML) and final:
        # get the metadata
        AthenaXML = getMetadata(site.workdir, job.jobId, athena=True)

        # add the metadata to the node
        if AthenaXML != "" and AthenaXML != None:
            tolog("Adding Athena metadata of size %d to node dictionary:\n%s" % (len(AthenaXML), AthenaXML))
            node['metaData'] = AthenaXML
        else:
            pilotErrorDiag = "Empty Athena metadata in file: %s" % (filenameAthenaXML)
            athenaXMLProblem = True
    else:
        # athena XML should exist at the end of the job
        if job.result[0] == 'finished' and 'Install' not in site.sitename and 'ANALY' not in site.sitename and 'DDM' not in site.sitename and 'test' not in site.sitename:
            pilotErrorDiag = "Metadata does not exist: %s" % (filenameAthenaXML)
            athenaXMLProblem = True

    # fail the job if there was a problem with the athena metadata
    # remove the comments below if a certain trf and release should be excluded from sending metadata
    # trf_exclusions = ['merge_trf.py']
    # release_exclusions = ['14.5.2.4']
    # jobAtlasRelease = getAtlasRelease(job.atlasRelease)
    # if athenaXMLProblem and job.trf.split(",")[-1] not in trf_exclusions and jobAtlasRelease[-1] not in release_exclusions:
    if athenaXMLProblem:
        tolog("!!FAILED!!1300!! %s" % (pilotErrorDiag))
        job.result[0] = "failed"
        job.result[2] = self.__error.ERR_NOATHENAMETADATA
        if node.has_key('pilotLog'):
            node['pilotLog'] += "!!FAILED!!1300!! %s" % (pilotErrorDiag)
        else:
            node['pilotLog'] = "!!FAILED!!1300!! %s" % (pilotErrorDiag)
        node['pilotErrorCode'] = job.result[2]
        node['state'] = job.result[0]

    # for backward compatibility
    # (older job objects may not carry an experiment attribute)
    try:
        experiment = job.experiment
    except:
        experiment = "unknown"

    # do not make the update if Nordugrid (leave for ARC to do)
    if readpar('region') == 'Nordugrid':
        if final:
            # update xml with SURLs stored in special SURL dictionary file
            if self.updateOutputFilesXMLWithSURLs4NG(experiment, site.workdir, job.jobId, job.outputFilesXML):
                tolog("Successfully added SURLs to %s" % (job.outputFilesXML))
            # update xml with SURLs stored in special SURL dictionary file
            if node.has_key('xml'):
                tolog("Updating node structure XML with SURLs")
                node['xml'] = updateXMLWithSURLs(experiment, node['xml'], site.workdir, job.jobId, self.__jobrec) # do not use format 'NG' here
            else:
                tolog("WARNING: Found no xml entry in the node structure")

            # store final node structure in pilot_initdir (will be sent to server by ARC control tower)
            self.copyNodeStruct4NG(node)
            tolog("Leaving the final update for the control tower")
        # NOTE(review): indentation reconstructed -- the return is taken to cover
        # every Nordugrid update (the comment above says ARC does the update);
        # confirm against upstream
        return 0, node

    # do not send xml if there was a put error during the log transfer
    # _xml keeps the original metadata so it can be restored before returning
    _xml = None
    if final and node.has_key('xml'):
        # update xml with SURLs stored in special SURL dictionary file
        tolog("Updating node structure XML with SURLs")
        node['xml'] = updateXMLWithSURLs(experiment, node['xml'], site.workdir, job.jobId, self.__jobrec)
        _xml = node['xml']
        if not isLogfileCopied(site.workdir):
            tolog("Pilot will not send xml about output files since log was not transferred")
            node['xml'] = ""

    # should XML be sent at this time? (sendXML is only reset in the CERNVM branch above)
    if not sendXML:
        tolog("Metadata xml will not be sent")
        if node.has_key('xml'):
            if node['xml'] != "":
                _xml = node['xml']
                node['xml'] = ""

    # add experiment specific metadata
    if final and additionalMetadata != None:
        tolog("Adding additionalMetadata to node")
        if 'metaData' in node:
            node['metaData'] += additionalMetadata
        else:
            node['metaData'] = additionalMetadata

    # make the actual update, repeatedly if necessary (for the final update)
    ret = makeHTTPUpdate(job.result[0], node, port, url=self.__pshttpurl, path=self.__pilot_initdir)
    if not ret[2]: # data is None for a failed update attempt
        tolog("makeHTTPUpdate returned: %s" % str(ret))
        return 1, None

    tolog("ret = %s" % str(ret))
    data = ret[1]
    tolog("data = %s" % str(data))
    if data.has_key("command"):
        job.action = data['command']
    try:
        awk = data['StatusCode']
    except:
        tolog("!!WARNING!!1300!! Having problem updating job status, set the awk to 1 for now, and continue...")
        awk = "1"
    else:
        tolog("jobDispatcher acknowledged with %s" % (awk))

    # need to have a return code so subprocess knows if update goes ok or not
    # NOTE(review): assumes StatusCode is always a numeric string -- int() would
    # raise otherwise; confirm against the dispatcher API
    ecode = int(awk) # use the awk code from jobdispatcher as the exit code

    # PN fake lostheartbeat
    #    if job.result[0] == "finished":
    #        ecode = 1

    # reset xml in case it was overwritten above for failed log transfers
    if final and node.has_key('xml'):
        node['xml'] = _xml

    return ecode, node # ecode=0 : update OK, otherwise something wrong