def getFTS3Context(self, username, group, ftsServer, threadID):
    """ Return an fts3 context for a given (user, group, fts server) tuple.

        Contexts are pooled per thread: each thread owns a DictCache keyed by
        (user, group, server). The user proxy is dumped to a file shared by all
        threads and used to build the context. The proxy must live at least 2h,
        is cached for 1.5h, and the context itself is reused for 45mn.

        :param username: name of the user
        :param group: group of the user
        :param ftsServer: address of the server
        :param threadID: identifier of the calling thread

        :returns: S_OK with the context object
    """
    log = gLogger.getSubLogger("getFTS3Context", child=True)

    threadCache = self._globalContextCache.setdefault(threadID, DictCache())
    cacheKey = (username, group, ftsServer)
    log.debug("Getting context for %s" % (cacheKey, ))

    # Fresh enough context already cached (45mn validity window): reuse it.
    if threadCache.exists(cacheKey, 2700):
        return S_OK(threadCache.get(cacheKey))

    res = getDNForUsername(username)
    if not res['OK']:
        return res
    # Only the first DN returned is used
    userDN = res['Value'][0]
    log.debug("UserDN %s" % userDN)

    # Dump the proxy to a file: lifetime of at least 2 hours,
    # cached for 1.5 hours.
    res = gProxyManager.downloadVOMSProxyToFile(
        userDN, group, requiredTimeLeft=7200, cacheTime=5400)
    if not res['OK']:
        return res
    proxyFile = res['Value']
    log.debug("Proxy file %s" % proxyFile)

    # Build the context from the dumped proxy
    res = FTS3Job.generateContext(ftsServer, proxyFile)
    if not res['OK']:
        return res

    # Keep it in this thread's cache for 1h
    threadCache.add(cacheKey, 3600, res['Value'])
    return S_OK(threadCache.get(cacheKey))
def _createNewJob(self, jobType, ftsFiles, targetSE, sourceSE=None):
    """ Create a new FTS3Job object

        :param jobType: type of job to create (Transfer, Staging, Removal)
        :param ftsFiles: list of FTS3File objects the job has to work on
        :param targetSE: SE on which to operate
        :param sourceSE: source SE, only useful for Transfer jobs

        :returns: FTS3Job object
    """
    newJob = FTS3Job()
    newJob.type = jobType
    newJob.sourceSE = sourceSE
    newJob.targetSE = targetSE
    # Inherit ownership/scheduling attributes from the operation
    newJob.activity = self.activity
    newJob.priority = self.priority
    newJob.username = self.username
    newJob.userGroup = self.userGroup
    newJob.vo = self.vo
    newJob.filesToSubmit = ftsFiles
    # getattr(self, 'operationID') with a literal name and no default is
    # exactly equivalent to plain attribute access (same AttributeError if
    # missing) — use the direct form.
    newJob.operationID = self.operationID

    return newJob
def test_raceCondition(fts3db): """This tests a race condition that was exhibited when running multiple agent in parallel. What was happening was that we were getting some nonFinishedOperations for further processing while some jobs associated to that operation were being monitored. This test reproduces all the possible combination of job/operation being assigned/non assigned | ---- | ---------- | ----- | ----------- |:---------------------------------------------------------------------------------------- | | OpID | OpAssigned | JobID | JobAssigned | Comment | | ---- | ---------- | ----- | ----------- |:---------------------------------------------------------------------------------------- | | 1 | | | | No job | | ---- | ---------- | ----- | ----------- |:---------------------------------------------------------------------------------------- | | 2 | Yes | | | No Job | | ---- | ---------- | ----- | ----------- |:---------------------------------------------------------------------------------------- | | 3 | | 1 | | Nothing is Assigned | | ---- | ---------- | ----- | ----------- |:---------------------------------------------------------------------------------------- | | 4 | | 2 | yes | Job is assigned, so can't use the operation | | ---- | ---------- | ----- | ----------- |:---------------------------------------------------------------------------------------- | | 5 | yes | 3 | | Op is assigned, so can't use it | | ---- | ---------- | ----- | ----------- |:---------------------------------------------------------------------------------------- | | 6 | yes | 4 | yes | That would be a problematic situation !! 
| | ---- | ---------- | ----- | ----------- |:---------------------------------------------------------------------------------------- | | 7 | | 5 | yes | Job 5 is assigned, so Op 7 cannot be used, even if Job6 is unassigned (this was the bug) | | | | 6 | | | | ---- | ---------- | ----- | ----------- |:---------------------------------------------------------------------------------------- | | 8 | yes | 7 | yes | Op8 is assigned, so can't be used (and is problematic like op6) | | | yes | 8 | | | | ---- | ---------- | ----- | ----------- |:---------------------------------------------------------------------------------------- | Under these circumstances, we want: * getNonFinishedOperation to return operations 1 and 3 * getActiveJobs to return jobs 1 and 6 """ # Utility to create a FT3File. # All operations must have at least one file associated # for the queries to make sense def _makeFile(): f = FTS3File() f.targetSE = "targetSE" return f # op1: Non assigned operation without any job op1 = FTS3TransferOperation() op1.operationID = 1 op1.ftsFiles.append(_makeFile()) # op2: assigned operation without any job op2 = FTS3TransferOperation() op2.operationID = 2 op2.ftsFiles.append(_makeFile()) # op3: Non assigned operation with one non assigned job op3 = FTS3TransferOperation() op3.operationID = 3 op3.ftsFiles.append(_makeFile()) j1 = FTS3Job() j1.jobID = 1 op3.ftsJobs.append(j1) # op4: Non assigned operation with one assigned job op4 = FTS3TransferOperation() op4.operationID = 4 op4.ftsFiles.append(_makeFile()) j2 = FTS3Job() j2.jobID = 2 op4.ftsJobs.append(j2) # op5: assigned operation with one non assigned job op5 = FTS3TransferOperation() op5.operationID = 5 op5.ftsFiles.append(_makeFile()) j3 = FTS3Job() j3.jobID = 3 op5.ftsJobs.append(j3) # op6: assigned operation with one assigned job # This is a very problematic case that we want # to avoid op6 = FTS3TransferOperation() op6.operationID = 6 op6.ftsFiles.append(_makeFile()) j4 = FTS3Job() j4.jobID = 4 
op6.ftsJobs.append(j4) # op7: Non assigned operation with one assigned job and one non assigned job op7 = FTS3TransferOperation() op7.operationID = 7 op7.ftsFiles.append(_makeFile()) j5 = FTS3Job() j5.jobID = 5 op7.ftsJobs.append(j5) j6 = FTS3Job() op7.ftsFiles.append(_makeFile()) j6.jobID = 6 op7.ftsJobs.append(j6) # op8: assigned operation with one assigned job and one non assigned job # That is problematic, like op6 op8 = FTS3TransferOperation() op8.operationID = 8 j7 = FTS3Job() op8.ftsFiles.append(_makeFile()) j7.jobID = 7 op8.ftsJobs.append(j7) j8 = FTS3Job() j8.jobID = 8 op8.ftsJobs.append(j8) allOps = [op1, op2, op3, op4, op5, op6, op7, op8] for op in allOps: res = fts3db.persistOperation(op) assert res["OK"] with fts3db.engine.begin() as conn: conn.execute( update(FTS3DB.fts3JobTable).values(assignment="Yes").where( FTS3DB.fts3JobTable.c.jobID.in_([2, 4, 5, 7]))) with fts3db.engine.begin() as conn: conn.execute( update(FTS3DB.fts3OperationTable).values(assignment="Yes").where( FTS3DB.fts3OperationTable.c.operationID.in_([2, 5, 6, 8]))) res = fts3db.getNonFinishedOperations(operationAssignmentTag=None) assert res["OK"] nonFinishedOps = res["Value"] nonFinishedOpsIDs = [op.operationID for op in nonFinishedOps] assert nonFinishedOpsIDs == [1, 3] res = fts3db.getActiveJobs(jobAssignmentTag=None) assert res["OK"] activeJobs = res["Value"] activeJobIDs = [op.jobID for op in activeJobs] assert activeJobIDs == [1, 6]
def getFTS3Context(self, username, group, ftsServer, threadID):
    """ Returns an fts3 context for a given user, group and fts server

        The context pool is per thread, and there is one context
        per tuple (user, group, server).
        We dump the proxy of a user to a file (shared by all the threads),
        and use it to make the context.
        The proxy needs a lifetime of self.proxyLifetime, is cached for
        cacheTime = (2*lifeTime/3) - 10mn, and the lifetime of the context is 45mn.
        The reason for cacheTime to be what it is is because the FTS3 server
        will ask for a new proxy after 2/3rd of the existing proxy has expired,
        so we renew it just before.

        :param str username: name of the user
        :param str group: group of the user
        :param str ftsServer: address of the server
        :param str threadID: thread ID

        :returns: S_OK with the context object
    """
    log = gLogger.getSubLogger("getFTS3Context", child=True)

    contextes = self._globalContextCache.setdefault(threadID, DictCache())

    idTuple = (username, group, ftsServer)
    log.debug("Getting context for %s" % (idTuple, ))

    # We keep a context in the cache for 45 minutes
    # (so it needs to be valid at least 15 since we add it for one hour)
    if not contextes.exists(idTuple, 15 * 60):
        res = getDNForUsername(username)
        if not res['OK']:
            return res
        # We take the first DN returned
        userDN = res['Value'][0]
        log.debug("UserDN %s" % userDN)

        # We dump the proxy to a file.
        # It has to have a lifetime of self.proxyLifetime.
        # Because the FTS3 servers cache it for 2/3rd of the lifetime
        # we should make our cache a bit less than 2/3rd of the lifetime
        cacheTime = int(2 * self.proxyLifetime / 3) - 600
        res = gProxyManager.downloadVOMSProxyToFile(
            userDN, group, requiredTimeLeft=self.proxyLifetime, cacheTime=cacheTime)
        if not res['OK']:
            return res

        proxyFile = res['Value']
        log.debug("Proxy file %s" % proxyFile)

        # We generate the context.
        # In practice, the lifetime will be less than proxyLifetime
        # because we reuse a cached proxy. However, the cached proxy will
        # never force a redelegation, because it is recent enough for FTS3 servers.
        # The delegation is forced when 2/3rd of the lifetime are left, and we get a fresh
        # one just before. So no problem
        res = FTS3Job.generateContext(ftsServer, proxyFile, lifetime=self.proxyLifetime)

        if not res['OK']:
            return res
        context = res['Value']

        # we add it to the cache for this thread for 1h
        contextes.add(idTuple, 3600, context)

    return S_OK(contextes.get(idTuple))
def test_05_cancelNotFoundJob(self):
    """When a job disappears from the server, we need to cancel it and its files.

        The scenario is as follow. Operation has 4 files.
        Job1 is submitted for File1 and File2.
        Job2 is submitted for File3 and File4.
        File1 is finished, and then the job disappears.
        We need to cancel Job1 and File2.
        Job2, File3 and File4 are here to make sure we do not cancel wrongly other files
    """
    op = self.generateOperation("Transfer", 4, ["Target1"])

    job1 = FTS3Job()
    job1GUID = "05-cancelall-job1"
    job1.ftsGUID = job1GUID
    job1.ftsServer = "fts3"
    job1.username = op.username
    job1.userGroup = op.userGroup

    # assign the GUID to the files
    op.ftsFiles[0].ftsGUID = job1GUID
    op.ftsFiles[1].ftsGUID = job1GUID

    # Pretend the job was submitted by attaching it to the operation
    op.ftsJobs.append(job1)

    job2 = FTS3Job()
    job2GUID = "05-cancelall-job2"
    job2.ftsGUID = job2GUID
    job2.ftsServer = "fts3"
    job2.username = op.username
    job2.userGroup = op.userGroup

    # assign the GUID to the files
    op.ftsFiles[2].ftsGUID = job2GUID
    op.ftsFiles[3].ftsGUID = job2GUID

    op.ftsJobs.append(job2)

    res = self.db.persistOperation(op)
    opID = res["Value"]

    # Get back the operation to update all the IDs
    res = self.db.getOperation(opID)
    op = res["Value"]

    fileIds = []
    for ftsFile in op.ftsFiles:
        fileIds.append(ftsFile.fileID)

    # Now we monitor Job1, and find that the first file has finished, the second is still ongoing
    # And since File1 is in an FTS final status, we set its ftsGUID to None
    file1ID = op.ftsFiles[0].fileID
    file2ID = op.ftsFiles[1].fileID
    fileStatusDict = {
        file1ID: {
            "status": "Finished",
            "ftsGUID": None
        },
        file2ID: {
            "status": "Staging"
        }
    }

    # And when updating, take care of specifying that you are updating for a given GUID
    res = self.db.updateFileStatus(fileStatusDict, ftsGUID=job1GUID)
    self.assertTrue(res["OK"])

    # Now we monitor again, job one, and find out that job1 has disappeared
    # So we cancel the job and the files
    res = self.db.cancelNonExistingJob(opID, job1GUID)
    self.assertTrue(res["OK"])

    # And hopefully now File2 is Canceled, while the others are as they were
    res = self.client.getOperation(opID)
    op = res["Value"]
    self.assertTrue(op.ftsFiles[0].status == "Finished")
    self.assertTrue(op.ftsFiles[1].status == "Canceled")
    self.assertTrue(op.ftsFiles[1].ftsGUID is None)
    self.assertTrue(op.ftsFiles[2].status == "New")
    self.assertTrue(op.ftsFiles[3].status == "New")
def test_04_job_monitoring_solve_racecondition(self):
    """We used to have a race condition resulting in duplicated transfers for a file.
        This test reproduces the race condition to make sure it is fixed.
        This test makes sure that the update only happens on files concerned by the job.

        The scenario is as follow. Operation has two files File1 and File2.
        Job1 is submitted for File1 and File2. File1 fails, File2 is still ongoing.
        We submit Job2 for File1. Job1 is monitored again, and we update again File1
        to failed (because it is so in Job1). A Job3 would be created for File1,
        despite Job2 still running on it.
    """

    op = self.generateOperation("Transfer", 2, ["Target1"])
    job1 = FTS3Job()
    job1GUID = "04-racecondition-job1"
    job1.ftsGUID = job1GUID
    job1.ftsServer = "fts3"
    job1.username = op.username
    job1.userGroup = op.userGroup
    op.ftsJobs.append(job1)

    # Now, when submitting the job, we specify the ftsGUID to which files are
    # assigned
    for ftsFile in op.ftsFiles:
        ftsFile.ftsGUID = job1GUID

    res = self.client.persistOperation(op)
    opID = res["Value"]

    # Get back the operation to update all the IDs
    res = self.client.getOperation(opID)
    op = res["Value"]

    fileIds = []
    for ftsFile in op.ftsFiles:
        fileIds.append(ftsFile.fileID)

    # Arbitrarily decide that File1 has the smallest fileID
    file1ID = min(fileIds)
    file2ID = max(fileIds)

    # Now we monitor Job1, and find that the first file has failed, the second is still ongoing
    # And since File1 is in an FTS final status, we set its ftsGUID to None
    fileStatusDict = {
        file1ID: {
            "status": "Failed",
            "error": "Someone made a boo-boo",
            "ftsGUID": None
        },
        file2ID: {
            "status": "Staging"
        },
    }

    # And when updating, take care of specifying that you are updating for a given GUID
    res = self.db.updateFileStatus(fileStatusDict, ftsGUID=job1GUID)
    self.assertTrue(res["OK"])

    # We would then submit a second job
    job2 = FTS3Job()
    job2GUID = "04-racecondition-job2"
    job2.ftsGUID = job2GUID
    job2.ftsServer = "fts3"
    job2.username = op.username
    job2.userGroup = op.userGroup
    op.ftsJobs.append(job2)

    # And do not forget to add the new FTSGUID to File1
    # assigned
    for ftsFile in op.ftsFiles:
        if ftsFile.fileID == file1ID:
            ftsFile.ftsGUID = job2GUID

    res = self.client.persistOperation(op)

    # Now we monitor Job2 & Job1 (in this order)
    fileStatusDictJob2 = {
        file1ID: {
            "status": "Staging"
        },
    }

    # Again specify the GUID
    res = self.db.updateFileStatus(fileStatusDictJob2, ftsGUID=job2GUID)
    self.assertTrue(res["OK"])

    # And in Job1, File1 is (and will remain) failed, while File2 is still ongoing
    fileStatusDictJob1 = {
        file1ID: {
            "status": "Failed",
            "error": "Someone made a boo-boo"
        },
        file2ID: {
            "status": "Staging"
        },
    }

    # And thanks to specifying the job GUID, File1 should not be touched !
    res = self.db.updateFileStatus(fileStatusDictJob1, ftsGUID=job1GUID)
    self.assertTrue(res["OK"])

    # And hopefully now there shouldn't be any file to submit
    res = self.client.getOperation(opID)
    op = res["Value"]

    # isTotallyProcessed does not return S_OK struct
    filesToSubmit = op._getFilesToSubmit()
    self.assertEqual(filesToSubmit, [])
def test_03_job_monitoring_racecondition(self):
    """We used to have a race condition resulting in duplicated transfers for a file.
        This test reproduces the race condition.

        The scenario is as follow. Operation has two files File1 and File2.
        Job1 is submitted for File1 and File2. File1 fails, File2 is still ongoing.
        We submit Job2 for File1. Job1 is monitored again, and we update again File1
        to failed (because it is so in Job1). A Job3 would be created for File1,
        despite Job2 still running on it.
    """

    op = self.generateOperation("Transfer", 2, ["Target1"])
    job1 = FTS3Job()
    job1.ftsGUID = "03-racecondition-job1"
    job1.ftsServer = "fts3"
    job1.username = op.username
    job1.userGroup = op.userGroup
    op.ftsJobs.append(job1)

    res = self.client.persistOperation(op)
    opID = res["Value"]

    # Get back the operation to update all the IDs
    res = self.client.getOperation(opID)
    op = res["Value"]

    fileIds = []
    for ftsFile in op.ftsFiles:
        fileIds.append(ftsFile.fileID)

    file1ID = min(fileIds)
    file2ID = max(fileIds)

    # Now we monitor Job1, and find that the first file has failed, the second is still ongoing
    fileStatusDict = {
        file1ID: {
            "status": "Failed",
            "error": "Someone made a boo-boo"
        },
        file2ID: {
            "status": "Staging"
        },
    }

    res = self.db.updateFileStatus(fileStatusDict)
    self.assertTrue(res["OK"])

    # We would then submit a second job
    job2 = FTS3Job()
    job2.ftsGUID = "03-racecondition-job2"
    job2.ftsServer = "fts3"
    job2.username = op.username
    job2.userGroup = op.userGroup
    op.ftsJobs.append(job2)

    res = self.client.persistOperation(op)

    # Now we monitor Job2 & Job1 (in this order)
    fileStatusDictJob2 = {
        file1ID: {
            "status": "Staging"
        },
    }

    res = self.db.updateFileStatus(fileStatusDictJob2)
    self.assertTrue(res["OK"])

    # And in Job1, File1 is (and will remain) failed, while File2 is still ongoing
    fileStatusDictJob1 = {
        file1ID: {
            "status": "Failed",
            "error": "Someone made a boo-boo"
        },
        file2ID: {
            "status": "Staging"
        },
    }

    res = self.db.updateFileStatus(fileStatusDictJob1)
    self.assertTrue(res["OK"])

    # And now this is the problem, because if we check whether this operation still has
    # files to submit, it will tell me yes, while all the files are being taken care of
    res = self.client.getOperation(opID)
    op = res["Value"]

    # isTotallyProcessed does not return S_OK struct
    filesToSubmit = op._getFilesToSubmit()
    # NOTE: this assertion documents the buggy behavior being reproduced —
    # File1 is reported as still needing submission even though Job2 runs on it
    self.assertEqual(filesToSubmit, [op.ftsFiles[0]])