Exemplo n.º 1
0
    def updateOutputFilesXMLWithSURLs4NG(self, experiment, siteWorkdir, jobId, outputFilesXML):
        """ Update the OutputFiles.xml file with SURLs """

        status = False

        # open and read back the OutputFiles.xml file
        _filename = os.path.join(siteWorkdir, outputFilesXML)
        if os.path.exists(_filename):
            try:
                f = open(_filename, "r")
            except Exception, e:
                tolog("!!WARNING!!1990!! Could not open file %s: %s" % (_filename, e))
            else:
                # get the metadata
                xmlIN = f.read()
                f.close()

                # update the XML
                xmlOUT = updateXMLWithSURLs(experiment, xmlIN, siteWorkdir, jobId, self.__jobrec, format='NG')

                # write the XML
                try:
                    f = open(_filename, "w")
                except OSError, e:
                    tolog("!!WARNING!!1990!! Could not open file %s: %s" % (_filename, e))
                else:
Exemplo n.º 2
0
    def updateOutputFilesXMLWithSURLs4NG(self, experiment, siteWorkdir, jobId, outputFilesXML):
        """ Update the OutputFiles.xml file with SURLs """

        status = False

        # open and read back the OutputFiles.xml file
        _filename = os.path.join(siteWorkdir, outputFilesXML)
        if os.path.exists(_filename):
            try:
                f = open(_filename, "r")
            except Exception, e:
                tolog("!!WARNING!!1990!! Could not open file %s: %s" % (_filename, e))
            else:
                # get the metadata
                xmlIN = f.read()
                f.close()

                # update the XML
                xmlOUT = updateXMLWithSURLs(experiment, xmlIN, siteWorkdir, jobId, self.__jobrec, format='NG')

                # write the XML
                try:
                    f = open(_filename, "w")
                except OSError, e:
                    tolog("!!WARNING!!1990!! Could not open file %s: %s" % (_filename, e))
                else:
Exemplo n.º 3
0
        try:
            experiment = job.experiment
        except:
            experiment = "unknown"

        # do not make the update if Nordugrid (leave for ARC to do)
        if os.environ.has_key('Nordugrid_pilot'):
            if final:
                # update xml with SURLs stored in special SURL dictionary file
                if self.updateOutputFilesXMLWithSURLs4NG(experiment, site.workdir, job.jobId, job.outputFilesXML):
                    tolog("Successfully added SURLs to %s" % (job.outputFilesXML))

                # update xml with SURLs stored in special SURL dictionary file
                if node.has_key('xml'):
                    tolog("Updating node structure XML with SURLs")
                    node['xml'] = updateXMLWithSURLs(experiment, node['xml'], site.workdir, job.jobId, self.__jobrec) # do not use format 'NG' here
                else:
                    tolog("WARNING: Found no xml entry in the node structure")

                # store final node structure in pilot_initdir (will be sent to server by ARC control tower)
                self.copyNodeStruct4NG(node)
                tolog("Leaving the final update for the control tower")
            return 0, node

        # do not send xml if there was a put error during the log transfer
        _xml = None
        if final and node.has_key('xml'):
            # update xml with SURLs stored in special SURL dictionary file
            tolog("Updating node structure XML with SURLs")
            node['xml'] = updateXMLWithSURLs(experiment, node['xml'], site.workdir, job.jobId, self.__jobrec)
Exemplo n.º 4
0
        try:
            experiment = job.experiment
        except:
            experiment = "unknown"

        # do not make the update if Nordugrid (leave for ARC to do)
        if os.environ.has_key('Nordugrid_pilot'):
            if final:
                # update xml with SURLs stored in special SURL dictionary file
                if self.updateOutputFilesXMLWithSURLs4NG(experiment, site.workdir, job.jobId, job.outputFilesXML):
                    tolog("Successfully added SURLs to %s" % (job.outputFilesXML))

                # update xml with SURLs stored in special SURL dictionary file
                if node.has_key('xml'):
                    tolog("Updating node structure XML with SURLs")
                    node['xml'] = updateXMLWithSURLs(experiment, node['xml'], site.workdir, job.jobId, self.__jobrec) # do not use format 'NG' here
                else:
                    tolog("WARNING: Found no xml entry in the node structure")

                # store final node structure in pilot_initdir (will be sent to server by ARC control tower)
                self.copyNodeStruct4NG(node)
                tolog("Leaving the final update for the control tower")
            return 0, node

        # do not send xml if there was a put error during the log transfer
        _xml = None
        if final and node.has_key('xml'):
            # update xml with SURLs stored in special SURL dictionary file
            tolog("Updating node structure XML with SURLs")
            node['xml'] = updateXMLWithSURLs(experiment, node['xml'], site.workdir, job.jobId, self.__jobrec)
Exemplo n.º 5
0
    def updatePandaServer(self, job, site, workerNode, port, xmlstr=None, spaceReport=False, log=None, ra=0, jr=False, useCoPilot=False, stdout_tail="", additionalMetadata=None):
        """
        Update the job status with the jobdispatcher web server.
        State is a tuple of (jobId, ["jobstatus", transExitCode, pilotErrorCode], timestamp)
        log = log extracts
        xmlstr is set in postJobTask for finished jobs (all files). Failed jobs will only send xml for log (created in this function)
        jr = job recovery mode
        """
    
        tolog("Updating job status in updatePandaServer(): PandaId=%d, result=%s, time=%s" % (job.getState()))

        # set any holding job to failed for sites that do not use job recovery (e.g. sites with LSF, that immediately
        # removes any work directory after the LSF job finishes which of course makes job recovery impossible)
        if not self.__jobrec:
            if job.result[0] == 'holding' and site.sitename != "CERNVM":
                job.result[0] = 'failed'
                tolog("This site does not support job recovery: HOLDING state reset to FAILED")

        # note: any changed job state above will be lost for fake server updates, does it matter?

        # get the node structure expected by the server
        node = self.getNodeStructure(job, site, workerNode, spaceReport=spaceReport, log=log)

        # skip the server update (e.g. on NG)
        if not self.__updateServer:
            tolog("(fake server update)")
            return 0, node

        # get the xml
        node['xml'] = self.getXML(job, site.sitename, site.workdir, xmlstr=xmlstr, jr=jr)

        # stdout tail in case job.debug == 'true'
        if job.debug.lower() == "true" and stdout_tail != "":
            # protection for potentially large tails
            stdout_tail = stdout_tail[-2048:]
            node['stdout'] = stdout_tail
            tolog("Will send stdout tail:\n%s (length = %d)" % (stdout_tail, len(stdout_tail)))
        else:
            if job.debug.lower() != "true":
                tolog("Stdout tail will not be sent (debug=False)")
            elif stdout_tail == "":
                tolog("Stdout tail will not be sent (no stdout tail)")
            else:
                tolog("Stdout tail will not be sent (debug=%s, stdout_tail=\'%s\')" % (str(job.debug), stdout_tail))

        # PN fake lostheartbeat
        #    if job.result[0] == "finished":
        #        node['state'] = "holding"
        #        node['xml'] = ""

        # read back node['xml'] from jobState file for CERNVM
        sendXML = True
        if site.sitename == "CERNVM":
            _node = self.getNodeStructureFromFile(site.workdir, repr(job.jobId))
            if _node:
                if _node.has_key('xml'):
                    if _node['xml'] != "":
                        node['xml'] = _node['xml']
                        tolog("Read back metadata xml from job state file (length: %d)" % len(node['xml']))
                    else:
                        tolog("No metadata xml present in current job state file (1 - pilot should not send xml at this time)")
                        sendXML = False
                else:
                    tolog("No xml key in node structure")
                    sendXML = False
            else:
                tolog("No metadata xml present in current job state file (2 - pilot should not send xml at this time)")
                sendXML = False

            # change the state to holding for initial CERNVM job
            if not sendXML and (job.result[0] == "finished" or job.result[0] == "failed"):
                # only set the holding state if the Co-Pilot is used
                if useCoPilot:
                    job.result[0] = "holding"
                    node['state'] = "holding"

        # update job state file
        _retjs = updateJobState(job, site, node, recoveryAttempt=ra)

        # is it the final update?
        if job.result[0] == 'finished' or job.result[0] == 'failed' or job.result[0] == 'holding':
            final = True
        else:
            final = False

        # send the original xml if it exists (end of production job)
        filenameAthenaXML = "%s/metadata-%s.xml.ATHENA" % (site.workdir, repr(job.jobId))
        athenaXMLProblem = False
        if os.path.exists(filenameAthenaXML) and final:

            # get the metadata
            AthenaXML = getMetadata(site.workdir, job.jobId, athena=True)

            # add the metadata to the node
            if AthenaXML != "" and AthenaXML != None:
                tolog("Adding Athena metadata of size %d to node dictionary:\n%s" % (len(AthenaXML), AthenaXML))
                node['metaData'] = AthenaXML
            else:
                pilotErrorDiag = "Empty Athena metadata in file: %s" % (filenameAthenaXML)
                athenaXMLProblem = True
        else:
            # athena XML should exist at the end of the job
            if job.result[0] == 'finished' and 'Install' not in site.sitename and 'ANALY' not in site.sitename and 'DDM' not in site.sitename and 'test' not in site.sitename:
                pilotErrorDiag = "Metadata does not exist: %s" % (filenameAthenaXML)
                athenaXMLProblem = True

        # fail the job if there was a problem with the athena metadata
        # remove the comments below if a certain trf and release should be excluded from sending metadata
        # trf_exclusions = ['merge_trf.py']
        # release_exclusions = ['14.5.2.4']
        # jobAtlasRelease = getAtlasRelease(job.atlasRelease)
        # if athenaXMLProblem and job.trf.split(",")[-1] not in trf_exclusions and jobAtlasRelease[-1] not in release_exclusions:
        if athenaXMLProblem:
            tolog("!!FAILED!!1300!! %s" % (pilotErrorDiag))
            job.result[0] = "failed"
            job.result[2] = self.__error.ERR_NOATHENAMETADATA
            if node.has_key('pilotLog'):
                node['pilotLog'] += "!!FAILED!!1300!! %s" % (pilotErrorDiag)
            else:
                node['pilotLog'] = "!!FAILED!!1300!! %s" % (pilotErrorDiag)
            node['pilotErrorCode'] = job.result[2]
            node['state'] = job.result[0]

        # for backward compatibility
        try:
            experiment = job.experiment
        except:
            experiment = "unknown"

        # do not make the update if Nordugrid (leave for ARC to do)
        if readpar('region') == 'Nordugrid':
            if final:
                # update xml with SURLs stored in special SURL dictionary file
                if self.updateOutputFilesXMLWithSURLs4NG(experiment, site.workdir, job.jobId, job.outputFilesXML):
                    tolog("Successfully added SURLs to %s" % (job.outputFilesXML))

                # update xml with SURLs stored in special SURL dictionary file
                if node.has_key('xml'):
                    tolog("Updating node structure XML with SURLs")
                    node['xml'] = updateXMLWithSURLs(experiment, node['xml'], site.workdir, job.jobId, self.__jobrec) # do not use format 'NG' here
                else:
                    tolog("WARNING: Found no xml entry in the node structure")

                # store final node structure in pilot_initdir (will be sent to server by ARC control tower)
                self.copyNodeStruct4NG(node)
                tolog("Leaving the final update for the control tower")
            return 0, node

        # do not send xml if there was a put error during the log transfer
        _xml = None
        if final and node.has_key('xml'):
            # update xml with SURLs stored in special SURL dictionary file
            tolog("Updating node structure XML with SURLs")
            node['xml'] = updateXMLWithSURLs(experiment, node['xml'], site.workdir, job.jobId, self.__jobrec)

            _xml = node['xml']
            if not isLogfileCopied(site.workdir):
                tolog("Pilot will not send xml about output files since log was not transferred")
                node['xml'] = ""

        # should XML be sent at this time?
        if not sendXML:
            tolog("Metadata xml will not be sent")
            if node.has_key('xml'):
                if node['xml'] != "":
                    _xml = node['xml']
                    node['xml'] = ""

        # add experiment specific metadata
        if final and additionalMetadata != None:
            tolog("Adding additionalMetadata to node")
            if 'metaData' in node:
                node['metaData'] += additionalMetadata
            else:
                node['metaData'] = additionalMetadata

        # make the actual update, repeatedly if necessary (for the final update)
        ret = makeHTTPUpdate(job.result[0], node, port, url=self.__pshttpurl, path=self.__pilot_initdir)
        if not ret[2]: # data is None for a failed update attempt
            tolog("makeHTTPUpdate returned: %s" % str(ret))
            return 1, None

        tolog("ret = %s" % str(ret))
        data = ret[1]
        tolog("data = %s" % str(data))

        if data.has_key("command"):
            job.action = data['command']

        try:
            awk = data['StatusCode']
        except:
            tolog("!!WARNING!!1300!! Having problem updating job status, set the awk to 1 for now, and continue...")
            awk = "1"
        else:
            tolog("jobDispatcher acknowledged with %s" % (awk))

        # need to have a return code so subprocess knows if update goes ok or not
        ecode = int(awk) # use the awk code from jobdispatcher as the exit code

        # PN fake lostheartbeat
        #    if job.result[0] == "finished":
        #        ecode = 1

        # reset xml in case it was overwritten above for failed log transfers
        if final and node.has_key('xml'):
            node['xml'] = _xml

        return ecode, node # ecode=0 : update OK, otherwise something wrong