def _monitorJobCallback(returnedValue):
    """Log the outcome of a finished _monitorJob call.

    :param returnedValue: tuple (ftsJob, standard dirac return struct)
        as produced by the _monitorJob method
    """
    job, result = returnedValue
    sublog = gLogger.getLocalSubLogger("_monitorJobCallback/%s" % job.jobID)
    if result["OK"]:
        sublog.debug("Successfully updated job status")
    else:
        sublog.error("Error updating job status", result)
def _treatOperationCallback(returnedValue):
    """Log the outcome of a finished _treatOperation call.

    :param returnedValue: tuple (ftsOperation, standard dirac return struct)
        as produced by the _treatOperation method
    """
    op, result = returnedValue
    sublog = gLogger.getLocalSubLogger("_treatOperationCallback/%s" % op.operationID)
    if result["OK"]:
        sublog.debug("Successfully treated operation")
    else:
        sublog.error("Error treating operation", result)
def submit(self, context=None, ftsServer=None, ucert=None, pinTime=36000, protocols=None):
    """submit the job to the FTS server

    Some attributes are expected to be defined for the submission to work:
      * type (set by FTS3Operation)
      * sourceSE (only for Transfer jobs)
      * targetSE
      * activity (optional)
      * priority (optional)
      * username
      * userGroup
      * filesToSubmit
      * operationID (optional, used as metadata for the job)

    We also expect the FTSFiles have an ID defined, as it is given as transfer metadata

    :param pinTime: Time the file should be pinned on disk (used for transfers and staging)
                    Used only if the source SE is a tape storage
    :param context: fts3 context. If not given, it is created (see ftsServer & ucert param)
    :param ftsServer: the address of the fts server to submit to. Used only if context is
                      not given. if not given either, use the ftsServer object attribute
    :param ucert: path to the user certificate/proxy. Might be inferred by the fts cli (see its doc)
    :param protocols: list of protocols from which we should choose the protocol to use

    :returns: S_OK([FTSFiles ids of files submitted])
    """
    log = gLogger.getLocalSubLogger("submit/%s/%s_%s" % (self.operationID, self.sourceSE, self.targetSE))

    # Build an fts3 context lazily if the caller did not hand one over
    if not context:
        if not ftsServer:
            ftsServer = self.ftsServer
        context = fts3.Context(endpoint=ftsServer, ucert=ucert, request_class=ftsSSLRequest, verify=False)

    # Construct the target SURL
    res = self.__fetchSpaceToken(self.targetSE, self.vo)
    if not res["OK"]:
        return res
    target_spacetoken = res["Value"]

    allLFNs = [ftsFile.lfn for ftsFile in self.filesToSubmit]

    if self.type == "Transfer":
        res = self._constructTransferJob(pinTime, allLFNs, target_spacetoken, protocols=protocols)
    elif self.type == "Staging":
        res = self._constructStagingJob(pinTime, allLFNs, target_spacetoken)
    # elif self.type == 'Removal':
    #     res = self._constructRemovalJob(context, allLFNs, failedLFNs, target_spacetoken)
    else:
        # BUGFIX: an unsupported job type previously fell through with `res` still
        # holding the __fetchSpaceToken result, so the unpacking below raised a
        # confusing ValueError instead of returning a clean error to the caller.
        return S_ERROR("Unknown FTS3Job type %s" % self.type)

    if not res["OK"]:
        return res

    job, fileIDsInTheJob = res["Value"]

    try:
        self.ftsGUID = fts3.submit(context, job)
        log.info("Got GUID %s" % self.ftsGUID)

        # Only increase the amount of attempt
        # if we succeeded in submitting -> no ! Why did I do that ??
        for ftsFile in self.filesToSubmit:
            ftsFile.attempt += 1

            # This should never happen because a file should be "released"
            # first by the previous job.
            # But we just print a warning
            if ftsFile.ftsGUID is not None:
                log.warn(
                    "FTSFile has a non NULL ftsGUID at job submission time",
                    "FileID: %s existing ftsGUID: %s" % (ftsFile.fileID, ftsFile.ftsGUID),
                )

            # `assign` the file to this job
            ftsFile.ftsGUID = self.ftsGUID
            if ftsFile.fileID in fileIDsInTheJob:
                ftsFile.status = "Submitted"

        now = datetime.datetime.utcnow().replace(microsecond=0)
        self.submitTime = now
        self.lastUpdate = now
        self.lastMonitor = now

    except FTS3ClientException as e:
        log.exception("Error at submission", repr(e))
        return S_ERROR("Error at submission: %s" % e)

    return S_OK(fileIDsInTheJob)
def _treatOperation(self, operation):
    """Treat one operation:
    * does the callback if the operation is finished
    * generate new jobs and submits them

    :param operation: the operation to treat

    :return: operation, S_OK()/S_ERROR()
    """
    # Everything runs inside one try/except so an unexpected error is
    # reported instead of killing the worker process/thread.
    try:
        # threadID is used to key the per-thread FTS3 context cache
        threadID = current_process().name
        log = gLogger.getLocalSubLogger("treatOperation/%s" % operation.operationID)

        # If the operation is totally processed
        # we perform the callback
        if operation.isTotallyProcessed():
            log.debug("FTS3Operation %s is totally processed" % operation.operationID)
            res = operation.callback()

            if not res["OK"]:
                log.error("Error performing the callback", res)
                # Persist so the callback can be retried on a later cycle
                log.info("Putting back the operation")
                dbRes = self.fts3db.persistOperation(operation)

                if not dbRes["OK"]:
                    log.error("Could not persist operation", dbRes)

                return operation, res

        else:
            log.debug("FTS3Operation %s is not totally processed yet" % operation.operationID)

            # This flag is set to False if we want to stop the ongoing processing
            # of an operation, typically when the matching RMS Request has been
            # canceled (see below)
            continueOperationProcessing = True

            # Check the status of the associated RMS Request.
            # If it is canceled or does not exist anymore then we will not create new FTS3Jobs, and mark
            # this as FTS3Operation canceled.
            if operation.rmsReqID:
                res = ReqClient().getRequestStatus(operation.rmsReqID)
                if not res["OK"]:
                    # If the Request does not exist anymore
                    if cmpError(res, errno.ENOENT):
                        log.info(
                            "The RMS Request does not exist anymore, canceling the FTS3Operation",
                            "rmsReqID: %s, FTS3OperationID: %s" % (operation.rmsReqID, operation.operationID),
                        )
                        operation.status = "Canceled"
                        continueOperationProcessing = False
                    else:
                        # Any other error: give up for this cycle and retry later
                        log.error("Could not get request status", res)
                        return operation, res
                else:
                    rmsReqStatus = res["Value"]
                    if rmsReqStatus == "Canceled":
                        log.info(
                            "The RMS Request is canceled, canceling the FTS3Operation",
                            "rmsReqID: %s, FTS3OperationID: %s" % (operation.rmsReqID, operation.operationID),
                        )
                        operation.status = "Canceled"
                        continueOperationProcessing = False

            if continueOperationProcessing:
                res = operation.prepareNewJobs(
                    maxFilesPerJob=self.maxFilesPerJob, maxAttemptsPerFile=self.maxAttemptsPerFile
                )

                if not res["OK"]:
                    log.error("Cannot prepare new Jobs", "FTS3Operation %s : %s" % (operation.operationID, res))
                    return operation, res

                newJobs = res["Value"]

                log.debug("FTS3Operation %s: %s new jobs to be submitted" % (operation.operationID, len(newJobs)))

                # Submit each prepared job; a failure for one job only skips
                # that job (continue), the others are still attempted.
                for ftsJob in newJobs:
                    res = self._serverPolicy.chooseFTS3Server()
                    if not res["OK"]:
                        log.error(res)
                        continue

                    ftsServer = res["Value"]
                    log.debug("Use %s server" % ftsServer)

                    ftsJob.ftsServer = ftsServer

                    res = self.getFTS3Context(ftsJob.username, ftsJob.userGroup, ftsServer, threadID=threadID)

                    if not res["OK"]:
                        log.error("Could not get context", res)
                        continue

                    context = res["Value"]

                    # selectTPCProtocols raises ValueError when no third-party-copy
                    # protocol can be agreed on for this job
                    try:
                        tpcProtocols = operation.fts3Plugin.selectTPCProtocols(ftsJob=ftsJob)
                    except ValueError as e:
                        log.error("Could not select TPC list", repr(e))
                        continue

                    res = ftsJob.submit(context=context, protocols=tpcProtocols)

                    if not res["OK"]:
                        log.error(
                            "Could not submit FTS3Job", "FTS3Operation %s : %s" % (operation.operationID, res)
                        )
                        continue

                    # Only successfully submitted jobs are attached to the operation
                    operation.ftsJobs.append(ftsJob)

                    submittedFileIds = res["Value"]
                    log.info(
                        "FTS3Operation %s: Submitted job for %s transfers"
                        % (operation.operationID, len(submittedFileIds))
                    )

        # new jobs are put in the DB at the same time
        res = self.fts3db.persistOperation(operation)

        if not res["OK"]:
            log.error("Could not persist operation", res)

        return operation, res

    except Exception as e:
        # NOTE(review): if the exception fires before `log` is assigned this
        # would raise NameError — presumably considered unreachable in practice
        log.exception("Exception in the thread", repr(e))
        return operation, S_ERROR("Exception %s" % repr(e))
def _monitorJob(self, ftsJob): """* query the FTS servers * update the FTSFile status * update the FTSJob status :param ftsJob: FTS job :return: ftsJob, S_OK()/S_ERROR() """ # General try catch to avoid that the tread dies try: threadID = current_process().name log = gLogger.getLocalSubLogger("_monitorJob/%s" % ftsJob.jobID) res = self.getFTS3Context(ftsJob.username, ftsJob.userGroup, ftsJob.ftsServer, threadID=threadID) if not res["OK"]: log.error("Error getting context", res) return ftsJob, res context = res["Value"] res = ftsJob.monitor(context=context) if not res["OK"]: log.error("Error monitoring job", res) # If the job was not found on the server, update the DB if cmpError(res, errno.ESRCH): res = self.fts3db.cancelNonExistingJob(ftsJob.operationID, ftsJob.ftsGUID) return ftsJob, res # { fileID : { Status, Error } } filesStatus = res["Value"] # Specify the job ftsGUID to make sure we do not overwrite # status of files already taken by newer jobs res = self.fts3db.updateFileStatus(filesStatus, ftsGUID=ftsJob.ftsGUID) if not res["OK"]: log.error("Error updating file fts status", "%s, %s" % (ftsJob.ftsGUID, res)) return ftsJob, res upDict = { ftsJob.jobID: { "status": ftsJob.status, "error": ftsJob.error, "completeness": ftsJob.completeness, "operationID": ftsJob.operationID, "lastMonitor": True, } } res = self.fts3db.updateJobStatus(upDict) if ftsJob.status in ftsJob.FINAL_STATES: self.__sendAccounting(ftsJob) return ftsJob, res except Exception as e: log.exception("Exception while monitoring job", repr(e)) return ftsJob, S_ERROR(0, "Exception %s" % repr(e))