def down(job, inputFile, fileStart, fileEnd, N):
    """Sort the range (fileStart..fileEnd) of inputFile, splitting it if needed.

    When the range is longer than N it is divided at the point returned by
    ``getMidPoint`` (plus one); each part is handed to a child ``down`` job and
    an ``up`` follow-on job is given the ``rv()`` of both children.
    Otherwise the range is copied into a local temporary file, sorted there and
    written to the job's file store.

    :param job: the running job; its ``fileStore`` and child/follow-on
        scheduling methods are used.
    :param inputFile: path of the file containing the range to sort.
    :param fileStart: start offset of the range.
    :param fileEnd: end offset of the range.
    :param N: maximum range length that is sorted directly.
    :return: ``rv()`` of the ``up`` follow-on job when the range was split,
        else the value returned by ``writeGlobalFile`` for the sorted file.
    :raises RuntimeError: at random, whenever ``random.random()`` exceeds
        ``success_ratio`` (a deliberate, injected test failure).
    """
    # This error is a test error, it does not mean the tests have failed.
    if random.random() > success_ratio:
        raise RuntimeError()

    if fileEnd - fileStart <= N:
        # Small enough: we can sort this bit of the file directly.
        job.fileStore.logToMaster("Sorting range (%i..%i) of file: %s"
                                  % (fileStart, fileEnd, inputFile))
        localCopy = job.fileStore.getLocalTempFile()
        with open(localCopy, 'w') as outHandle:
            copySubRangeOfFile(inputFile, fileStart, fileEnd, outHandle)
        sort(localCopy)
        return job.fileStore.writeGlobalFile(localCopy)

    # Too long: subdivide the range and let a follow-on merge the two parts.
    job.fileStore.logToMaster("Splitting range (%i..%i) of file: %s"
                              % (fileStart, fileEnd, inputFile))
    splitAt = getMidPoint(inputFile, fileStart, fileEnd) + 1
    firstPart = job.addChildJobFn(down, inputFile, fileStart, splitAt, N,
                                  memory=sortMemory).rv()
    secondPart = job.addChildJobFn(down, inputFile, splitAt, fileEnd, N,
                                   memory=sortMemory).rv()
    return job.addFollowOnJobFn(up, firstPart, secondPart).rv()
def down(job, inputFileStoreID, N, downCheckpoints, memory=sortMemory):
    """Sort the file stored under inputFileStoreID, splitting it if needed.

    The file is read from the job's file store. If its size exceeds N it is
    cut in two at ``getMidPoint(...) + 1``; each part is written back to the
    file store and handed to a child ``down`` job, and an ``up`` follow-on job
    is given the ``rv()`` of both children. Otherwise the local copy is sorted
    in place and written to the file store.

    :param job: the running job; its ``fileStore`` and child/follow-on
        scheduling methods are used.
    :param inputFileStoreID: file store ID of the file to sort.
    :param N: maximum file size (as reported by ``os.path.getsize``) that is
        sorted directly.
    :param downCheckpoints: forwarded to every child ``down`` job, both as its
        argument and as the ``checkpoint=`` option of ``addChildJobFn``.
    :param memory: not read in the body; presumably consumed by the job
        framework as a resource requirement -- confirm.
    :return: ``rv()`` of the ``up`` follow-on job when the file was split,
        else the value returned by ``writeGlobalFile`` for the sorted file.
    """
    # Read the file. NOTE(review): cache=False is presumably requested because
    # the local copy is sorted in place below -- confirm.
    localPath = job.fileStore.readGlobalFile(inputFileStoreID, cache=False)
    size = os.path.getsize(localPath)

    if size <= N:
        # Small enough: sort the copy and write it back to the file store.
        job.fileStore.logToMaster("Sorting file: %s of size: %s"
                                  % (inputFileStoreID, size),
                                  level=logging.CRITICAL)
        sort(localPath)
        return job.fileStore.writeGlobalFile(localPath)

    # Too big: split the file into two copies.
    job.fileStore.logToMaster("Splitting file: %s of size: %s"
                              % (inputFileStoreID, size),
                              level=logging.CRITICAL)
    splitAt = getMidPoint(localPath, 0, size) + 1

    firstPath = job.fileStore.getLocalTempFile()
    with open(firstPath, 'w') as outHandle:
        copySubRangeOfFile(localPath, 0, splitAt, outHandle)

    secondPath = job.fileStore.getLocalTempFile()
    with open(secondPath, 'w') as outHandle:
        copySubRangeOfFile(localPath, splitAt, size, outHandle)

    # Call down recursively on each part, in the same order as before:
    # upload first part, schedule its child, then the same for the second.
    firstID = job.fileStore.writeGlobalFile(firstPath)
    firstSorted = job.addChildJobFn(down, firstID, N, downCheckpoints,
                                    checkpoint=downCheckpoints,
                                    memory=sortMemory).rv()
    secondID = job.fileStore.writeGlobalFile(secondPath)
    secondSorted = job.addChildJobFn(down, secondID, N, downCheckpoints,
                                     checkpoint=downCheckpoints,
                                     memory=sortMemory).rv()
    return job.addFollowOnJobFn(up, firstSorted, secondSorted).rv()
def down(job, inputFileStoreID, N, memory=sortMemory):
    """Sort the file stored under inputFileStoreID, splitting it if needed.

    The file is read from the job's file store. If its size exceeds N it is
    cut in two at ``getMidPoint(...) + 1``; each part is written back to the
    file store and handed to a child ``down`` job, and an ``up`` follow-on job
    is given the ``rv()`` of both children. Otherwise the local copy is sorted
    in place and written to the file store.

    :param job: the running job; its ``fileStore`` and child/follow-on
        scheduling methods are used.
    :param inputFileStoreID: file store ID of the file to sort.
    :param N: maximum file size (as reported by ``os.path.getsize``) that is
        sorted directly.
    :param memory: not read in the body; presumably consumed by the job
        framework as a resource requirement -- confirm.
    :return: ``rv()`` of the ``up`` follow-on job when the file was split,
        else the value returned by ``writeGlobalFile`` for the sorted file.
    """
    store = job.fileStore

    # Read the file. NOTE(review): cache=False is presumably requested because
    # the local copy is sorted in place below -- confirm.
    sourcePath = store.readGlobalFile(inputFileStoreID, cache=False)
    byteCount = os.path.getsize(sourcePath)

    if not byteCount > N:
        # We can sort this file as it is: sort the copy and write it back.
        store.logToMaster(
            "Sorting file: %s of size: %s" % (inputFileStoreID, byteCount),
            level=logging.CRITICAL)
        sort(sourcePath)
        return store.writeGlobalFile(sourcePath)

    # We will subdivide the file.
    store.logToMaster(
        "Splitting file: %s of size: %s" % (inputFileStoreID, byteCount),
        level=logging.CRITICAL)
    boundary = getMidPoint(sourcePath, 0, byteCount) + 1

    # Split the file into two local copies, one per range.
    lowerPath = store.getLocalTempFile()
    with open(lowerPath, 'w') as sink:
        copySubRangeOfFile(sourcePath, 0, boundary, sink)
    upperPath = store.getLocalTempFile()
    with open(upperPath, 'w') as sink:
        copySubRangeOfFile(sourcePath, boundary, byteCount, sink)

    def sortedPart(path):
        # Upload one part and schedule a child down job for it.
        return job.addChildJobFn(down, store.writeGlobalFile(path), N,
                                 memory=sortMemory).rv()

    # Call down recursively; the follow-on receives both child results.
    return job.addFollowOnJobFn(up, sortedPart(lowerPath),
                                sortedPart(upperPath)).rv()
def testCopySubRangeOfFile(self):
    """Check copySubRangeOfFile against plain string slicing.

    Repeated ``self.testNo`` times: a fresh file is generated with
    ``makeFileToSort``, a random range ``fileStart <= fileEnd < fileSize`` is
    picked, the range is copied out with ``copySubRangeOfFile`` and the result
    is compared with ``contents[fileStart:fileEnd]`` of the generated file.

    Every file is opened in a ``with`` block so that no handle is left open,
    including when the copy or the comparison fails.
    """
    # The same two paths are reused (and overwritten) on every iteration.
    tempFile = os.path.join(self.tempDir, "fileToSort1.txt")
    outputFile = os.path.join(self.tempDir, "outputFileToSort1.txt")

    for _ in range(self.testNo):
        makeFileToSort(tempFile)
        fileSize = os.path.getsize(tempFile)
        # An empty file would leave no valid start offset to choose from.
        assert fileSize > 0

        # Same ranges as random.choice(xrange(...)): fileEnd may equal
        # fileStart (empty range) but never reaches fileSize.
        fileStart = random.randrange(0, fileSize)
        fileEnd = random.randrange(fileStart, fileSize)

        with open(outputFile, 'w') as fileHandle:
            copySubRangeOfFile(tempFile, fileStart, fileEnd, fileHandle)

        with open(outputFile, 'r') as fileHandle:
            copied = fileHandle.read()
        with open(tempFile, 'r') as fileHandle:
            expected = fileHandle.read()[fileStart:fileEnd]

        checkEqual(copied, expected)
def testCopySubRangeOfFile(self):
    """Check copySubRangeOfFile against plain string slicing.

    Repeated ``self.testNo`` times: a 10-line file is generated with
    ``makeFileToSort``, a random range ``fileStart <= fileEnd < fileSize`` is
    picked, the range is copied out with ``copySubRangeOfFile`` and the result
    is compared with ``contents[fileStart:fileEnd]`` of the generated file.

    Every file is opened in a ``with`` block so that no handle is left open,
    including when the copy or an assertion fails. ``assertEqual`` replaces
    the deprecated ``assertEquals`` alias.
    """
    # The same two paths are reused (and overwritten) on every iteration.
    tempFile = os.path.join(self.tempDir, "fileToSort1.txt")
    outputFile = os.path.join(self.tempDir, "outputFileToSort1.txt")

    for _ in range(self.testNo):
        makeFileToSort(tempFile, lines=10, lineLen=defaultLineLen)
        fileSize = os.path.getsize(tempFile)
        # An empty file would leave no valid start offset to choose from.
        # A unittest assertion is used so the check survives ``python -O``.
        self.assertGreater(fileSize, 0)

        # Same ranges as random.choice(xrange(...)): fileEnd may equal
        # fileStart (empty range) but never reaches fileSize.
        fileStart = random.randrange(0, fileSize)
        fileEnd = random.randrange(fileStart, fileSize)

        with open(outputFile, 'w') as fileHandle:
            copySubRangeOfFile(tempFile, fileStart, fileEnd, fileHandle)

        with open(outputFile, 'r') as fileHandle:
            copied = fileHandle.read()
        with open(tempFile, 'r') as fileHandle:
            expected = fileHandle.read()[fileStart:fileEnd]

        self.assertEqual(copied, expected)
def down(job, inputFile, fileStart, fileEnd, N, outputFileStoreID):
    """Sort the range (fileStart..fileEnd) of inputFile into outputFileStoreID.

    When the range is longer than N it is divided at ``getMidPoint(...) + 1``;
    each part is handed to a child ``down`` job with its own freshly created
    empty file store ID, and an ``up`` follow-on job is given ``rv(0)`` of both
    children together with outputFileStoreID. Otherwise the range is copied
    into the file stored under outputFileStoreID, a local copy is sorted and
    the stored file is updated from it.

    :param job: the running job; its ``fileStore`` and child/follow-on
        scheduling methods are used.
    :param inputFile: path of the file containing the range to sort.
    :param fileStart: start offset of the range.
    :param fileEnd: end offset of the range; must not be below fileStart.
    :param N: maximum range length that is sorted directly.
    :param outputFileStoreID: file store ID that receives the sorted output.
    :return: outputFileStoreID.
    :raises RuntimeError: at random, whenever ``random.random()`` exceeds
        ``success_ratio`` (a deliberate, injected test failure).
    """
    # This error is a test error, it does not mean the tests have failed.
    if random.random() > success_ratio:
        raise RuntimeError()

    length = fileEnd - fileStart
    assert length >= 0

    if length <= N:
        # We can sort this bit of the file.
        job.fileStore.logToMaster("Sorting range (%i..%i) of file: %s"
                                  % (fileStart, fileEnd, inputFile))
        with job.fileStore.updateGlobalFileStream(outputFileStoreID) as sink:
            copySubRangeOfFile(inputFile, fileStart, fileEnd, sink)
        # Make a local copy, sort it and push it back to the file store.
        localCopy = job.fileStore.readGlobalFile(outputFileStoreID)
        sort(localCopy)
        job.fileStore.updateGlobalFile(outputFileStoreID, localCopy)
        return outputFileStoreID

    job.fileStore.logToMaster("Splitting range (%i..%i) of file: %s"
                              % (fileStart, fileEnd, inputFile))
    midPoint = getMidPoint(inputFile, fileStart, fileEnd)
    assert midPoint >= fileStart
    # Original note on the "+ 1": add one to avoid the newline.
    splitAt = midPoint + 1
    assert splitAt < fileEnd

    # We will subdivide the file: one empty output file per part.
    firstOutputID = job.fileStore.getEmptyFileStoreID()
    secondOutputID = job.fileStore.getEmptyFileStoreID()

    # The use of rv here is for testing purposes: rv(0) of the first child is
    # firstOutputID and rv(0) of the second child is secondOutputID.
    firstResult = job.addChildJobFn(down, inputFile, fileStart, splitAt, N,
                                    firstOutputID).rv(0)
    secondResult = job.addChildJobFn(down, inputFile, splitAt, fileEnd, N,
                                     secondOutputID).rv(0)
    job.addFollowOnJobFn(up, firstResult, secondResult, outputFileStoreID)
    return outputFileStoreID
def down(job, inputFile, fileStart, fileEnd, N, outputFileStoreID):
    """Sort the range (fileStart..fileEnd) of inputFile into outputFileStoreID.

    A range no longer than N is copied into the file stored under
    outputFileStoreID, sorted via a local copy, and the stored file is updated.
    A longer range is divided at ``getMidPoint(...) + 1``: two child ``down``
    jobs sort the parts into newly created empty file store IDs and an ``up``
    follow-on job receives ``rv(0)`` of both children plus outputFileStoreID.

    :param job: the running job; its ``fileStore`` and child/follow-on
        scheduling methods are used.
    :param inputFile: path of the file containing the range to sort.
    :param fileStart: start offset of the range.
    :param fileEnd: end offset of the range; must not be below fileStart.
    :param N: maximum range length that is sorted directly.
    :param outputFileStoreID: file store ID that receives the sorted output.
    :return: outputFileStoreID.
    :raises RuntimeError: at random, whenever ``random.random()`` exceeds
        ``success_ratio`` (a deliberate, injected test failure).
    """
    if random.random() > success_ratio:
        # This error is a test error, it does not mean the tests have failed.
        raise RuntimeError()

    rangeLength = fileEnd - fileStart
    assert rangeLength >= 0
    fitsInOneJob = not rangeLength > N

    if fitsInOneJob:
        # We can sort this bit of the file.
        job.fileStore.logToMaster(
            "Sorting range (%i..%i) of file: %s"
            % (fileStart, fileEnd, inputFile))
        with job.fileStore.updateGlobalFileStream(outputFileStoreID) as out:
            copySubRangeOfFile(inputFile, fileStart, fileEnd, out)
        # Make a local copy and sort the file, then store the sorted copy.
        sortedCopy = job.fileStore.readGlobalFile(outputFileStoreID)
        sort(sortedCopy)
        job.fileStore.updateGlobalFile(outputFileStoreID, sortedCopy)
    else:
        job.fileStore.logToMaster(
            "Splitting range (%i..%i) of file: %s"
            % (fileStart, fileEnd, inputFile))
        midPoint = getMidPoint(inputFile, fileStart, fileEnd)
        assert midPoint >= fileStart
        assert midPoint + 1 < fileEnd

        # We will subdivide the file; each half gets its own empty output.
        lowerID = job.fileStore.getEmptyFileStoreID()
        upperID = job.fileStore.getEmptyFileStoreID()

        # The use of rv here is for testing purposes: rv(0) of the first child
        # job is lowerID, similarly rv(0) of the second child is upperID.
        lowerChild = job.addChildJobFn(
            down, inputFile, fileStart, midPoint + 1, N, lowerID)
        lowerResult = lowerChild.rv(0)
        # Original note on the "+ 1": add one to avoid the newline.
        upperChild = job.addChildJobFn(
            down, inputFile, midPoint + 1, fileEnd, N, upperID)
        upperResult = upperChild.rv(0)
        job.addFollowOnJobFn(up, lowerResult, upperResult, outputFileStoreID)

    return outputFileStoreID