示例#1
0
文件: sort.py 项目: adamnovak/toil
def down(job, inputFile, fileStart, fileEnd, N):
    """Sort the range fileStart..fileEnd of inputFile, recursing when it is too long.

    When the range length is at most N, the range is copied into a local temporary
    file, sorted there, and written to the file store; the value returned by
    writeGlobalFile is the result.

    When the range is longer than N, it is cut at the point given by getMidPoint and
    each half is handed to a child ``down`` job. An ``up`` follow-on job receives the
    return values of both children, and the return value of that follow-on is the result.

    Raises RuntimeError at random, whenever random.random() exceeds success_ratio.
    """
    # Injected failure: this is a test error, it does not mean the tests have failed.
    if random.random() > success_ratio:
        raise RuntimeError()

    rangeLength = fileEnd - fileStart
    if rangeLength <= N:
        # Small enough: sort this piece of the file directly.
        job.fileStore.logToMaster(
            "Sorting range (%i..%i) of file: %s" % (fileStart, fileEnd, inputFile))
        localCopy = job.fileStore.getLocalTempFile()
        with open(localCopy, 'w') as outHandle:
            copySubRangeOfFile(inputFile, fileStart, fileEnd, outHandle)
        sort(localCopy)
        return job.fileStore.writeGlobalFile(localCopy)

    # Too large: subdivide the range and let the follow-on combine the halves.
    job.fileStore.logToMaster(
        "Splitting range (%i..%i) of file: %s" % (fileStart, fileEnd, inputFile))
    midPoint = getMidPoint(inputFile, fileStart, fileEnd)
    lowerHalf = job.addChildJobFn(
        down, inputFile, fileStart, midPoint + 1, N, memory=sortMemory).rv()
    upperHalf = job.addChildJobFn(
        down, inputFile, midPoint + 1, fileEnd, N, memory=sortMemory).rv()
    return job.addFollowOnJobFn(up, lowerHalf, upperHalf).rv()
示例#2
0
def down(job, inputFileStoreID, N, downCheckpoints, memory=sortMemory):
    """Sort the file stored under inputFileStoreID, recursing when it is too big.

    The file is first read from the file store into a local copy (cache=False).

    If its size is at most N, the local copy is sorted and written back to the file
    store; the value returned by writeGlobalFile is the result.

    Otherwise the file is cut at the point given by getMidPoint into two local
    temporary files. Each is written to the file store and handed to a child ``down``
    job, created with checkpoint=downCheckpoints. An ``up`` follow-on job receives the
    return values of both children, and the return value of that follow-on is the result.

    The ``memory`` parameter is not read by this body.
    NOTE(review): presumably consumed by the job framework as a resource hint - confirm.
    """
    localInput = job.fileStore.readGlobalFile(inputFileStoreID, cache=False)
    size = os.path.getsize(localInput)

    if size <= N:
        # Small enough: sort the local copy and write it back to the file store.
        job.fileStore.logToMaster(
            "Sorting file: %s of size: %s" % (inputFileStoreID, size),
            level=logging.CRITICAL)
        sort(localInput)
        return job.fileStore.writeGlobalFile(localInput)

    # Too large: subdivide the file.
    job.fileStore.logToMaster(
        "Splitting file: %s of size: %s" % (inputFileStoreID, size),
        level=logging.CRITICAL)
    cut = getMidPoint(localInput, 0, size) + 1

    # Copy each half into its own local temporary file.
    pieces = []
    for begin, end in ((0, cut), (cut, size)):
        piece = job.fileStore.getLocalTempFile()
        with open(piece, 'w') as pieceHandle:
            copySubRangeOfFile(localInput, begin, end, pieceHandle)
        pieces.append(piece)

    # Recurse on each half, collecting the children's return values in order.
    sortedPieces = []
    for piece in pieces:
        pieceStoreID = job.fileStore.writeGlobalFile(piece)
        child = job.addChildJobFn(down, pieceStoreID, N, downCheckpoints,
                                  checkpoint=downCheckpoints, memory=sortMemory)
        sortedPieces.append(child.rv())

    return job.addFollowOnJobFn(up, sortedPieces[0], sortedPieces[1]).rv()
示例#3
0
文件: sort.py 项目: arkal/toil
def down(job, inputFileStoreID, N, memory=sortMemory):
    """Sort the file stored under inputFileStoreID, splitting it first if it exceeds N.

    The file is read from the file store into a local copy (cache=False). A file of
    size at most N is sorted locally and written back to the file store, and the value
    returned by writeGlobalFile is the result.

    A larger file is divided at the point given by getMidPoint into two local
    temporary files. Each is written to the file store and passed to a child ``down``
    job. The return values of the two children are passed to an ``up`` follow-on job,
    and the return value of that follow-on is the result.

    The ``memory`` parameter is not read by this body.
    NOTE(review): presumably consumed by the job framework as a resource hint - confirm.
    """
    store = job.fileStore
    localInput = store.readGlobalFile(inputFileStoreID, cache=False)
    size = os.path.getsize(localInput)

    if size <= N:
        # Small enough: sort the local copy and write it back to the file store.
        store.logToMaster("Sorting file: %s of size: %s" % (inputFileStoreID, size),
                          level=logging.CRITICAL)
        sort(localInput)
        return store.writeGlobalFile(localInput)

    # Too large: split the file into two copies.
    store.logToMaster("Splitting file: %s of size: %s" % (inputFileStoreID, size),
                      level=logging.CRITICAL)
    cut = getMidPoint(localInput, 0, size) + 1

    headPath = store.getLocalTempFile()
    with open(headPath, 'w') as headHandle:
        copySubRangeOfFile(localInput, 0, cut, headHandle)

    tailPath = store.getLocalTempFile()
    with open(tailPath, 'w') as tailHandle:
        copySubRangeOfFile(localInput, cut, size, tailHandle)

    # Recurse on both halves; the follow-on receives their return values.
    headStoreID = store.writeGlobalFile(headPath)
    headResult = job.addChildJobFn(down, headStoreID, N, memory=sortMemory).rv()
    tailStoreID = store.writeGlobalFile(tailPath)
    tailResult = job.addChildJobFn(down, tailStoreID, N, memory=sortMemory).rv()
    return job.addFollowOnJobFn(up, headResult, tailResult).rv()
示例#4
0
 def testCopySubRangeOfFile(self):
     """Check that copySubRangeOfFile copies exactly the requested slice of a file.

     Runs self.testNo times. Each round generates a file with makeFileToSort, picks a
     random start offset and a random end offset at or after it, copies that range to
     an output file, and compares the output with the same slice of the input.
     """
     for _ in xrange(self.testNo):
         tempFile = os.path.join(self.tempDir, "fileToSort1.txt")
         outputFile = os.path.join(self.tempDir, "outputFileToSort1.txt")
         makeFileToSort(tempFile)
         fileSize = os.path.getsize(tempFile)
         assert fileSize > 0
         # Both offsets are strictly below fileSize and fileEnd >= fileStart.
         fileStart = random.randrange(0, fileSize)
         fileEnd = random.randrange(fileStart, fileSize)
         # Context managers close every handle, even if the copy or a read raises;
         # the output must be closed (flushed) before it is read back.
         with open(outputFile, 'w') as fileHandle:
             copySubRangeOfFile(tempFile, fileStart, fileEnd, fileHandle)
         with open(outputFile, 'r') as copiedHandle:
             l = copiedHandle.read()
         with open(tempFile, 'r') as sourceHandle:
             l2 = sourceHandle.read()[fileStart:fileEnd]
         checkEqual(l, l2)
示例#5
0
 def testCopySubRangeOfFile(self):
     """Check that copySubRangeOfFile copies exactly the requested slice of a file.

     Runs self.testNo times. Each round generates a 10-line file with makeFileToSort,
     picks a random start offset and a random end offset at or after it, copies that
     range to an output file, and compares the output with the same slice of the input.
     """
     for _ in xrange(self.testNo):
         tempFile = os.path.join(self.tempDir, "fileToSort1.txt")
         outputFile = os.path.join(self.tempDir, "outputFileToSort1.txt")
         makeFileToSort(tempFile, lines=10, lineLen=defaultLineLen)
         fileSize = os.path.getsize(tempFile)
         assert fileSize > 0
         # Both offsets are strictly below fileSize and fileEnd >= fileStart.
         fileStart = random.randrange(0, fileSize)
         fileEnd = random.randrange(fileStart, fileSize)
         # Context managers close every handle, even if the copy or a read raises;
         # the output must be closed (flushed) before it is read back.
         with open(outputFile, 'w') as fileHandle:
             copySubRangeOfFile(tempFile, fileStart, fileEnd, fileHandle)
         with open(outputFile, 'r') as copiedHandle:
             l = copiedHandle.read()
         with open(tempFile, 'r') as sourceHandle:
             l2 = sourceHandle.read()[fileStart:fileEnd]
         # assertEqual replaces the deprecated assertEquals alias.
         self.assertEqual(l, l2)
示例#6
0
def down(job, inputFile, fileStart, fileEnd, N, outputFileStoreID):
    """Sort the range fileStart..fileEnd of inputFile into outputFileStoreID.

    When the range length is at most N, the range is streamed into the global file
    outputFileStoreID, a local copy of that file is sorted, and the global file is
    updated from the sorted copy.

    When the range is longer than N, it is cut at the point given by getMidPoint. Two
    empty file store IDs are allocated and each half is handed to a child ``down`` job
    writing into one of them. An ``up`` follow-on job receives rv(0) of each child
    together with outputFileStoreID.

    Always returns outputFileStoreID. Raises RuntimeError at random, whenever
    random.random() exceeds success_ratio.
    """
    # Injected failure: this is a test error, it does not mean the tests have failed.
    if random.random() > success_ratio:
        raise RuntimeError()

    length = fileEnd - fileStart
    assert length >= 0

    if length <= N:
        # Small enough: sort this piece of the file directly.
        job.fileStore.logToMaster(
            "Sorting range (%i..%i) of file: %s" % (fileStart, fileEnd, inputFile))
        with job.fileStore.updateGlobalFileStream(outputFileStoreID) as outHandle:
            copySubRangeOfFile(inputFile, fileStart, fileEnd, outHandle)
        # Sort a local copy, then push it back over the global file.
        localCopy = job.fileStore.readGlobalFile(outputFileStoreID)
        sort(localCopy)
        job.fileStore.updateGlobalFile(outputFileStoreID, localCopy)
        return outputFileStoreID

    job.fileStore.logToMaster(
        "Splitting range (%i..%i) of file: %s" % (fileStart, fileEnd, inputFile))
    midPoint = getMidPoint(inputFile, fileStart, fileEnd)
    assert midPoint >= fileStart
    assert midPoint + 1 < fileEnd

    # Subdivide the range: one fresh output location per half.
    lowerStoreID = job.fileStore.getEmptyFileStoreID()
    upperStoreID = job.fileStore.getEmptyFileStoreID()

    # The use of rv here is for testing purposes: rv(0) of each child is the
    # file store ID that child was given as its output.
    # The second half starts at midPoint + 1 (add one to avoid the newline).
    lowerResult = job.addChildJobFn(
        down, inputFile, fileStart, midPoint + 1, N, lowerStoreID).rv(0)
    upperResult = job.addChildJobFn(
        down, inputFile, midPoint + 1, fileEnd, N, upperStoreID).rv(0)
    job.addFollowOnJobFn(up, lowerResult, upperResult, outputFileStoreID)
    return outputFileStoreID
示例#7
0
def down(job, inputFile, fileStart, fileEnd, N, outputFileStoreID):
    """Sort the range fileStart..fileEnd of inputFile into outputFileStoreID.

    A range no longer than N is streamed into the global file outputFileStoreID; a
    local copy of that file is then sorted and used to update the global file.

    A longer range is cut at the point given by getMidPoint. An empty file store ID
    is allocated for each half, a child ``down`` job is created per half, and an ``up``
    follow-on job receives rv(0) of both children together with outputFileStoreID.

    Always returns outputFileStoreID. Raises RuntimeError at random, whenever
    random.random() exceeds success_ratio.
    """
    # Injected failure: this is a test error, it does not mean the tests have failed.
    if random.random() > success_ratio:
        raise RuntimeError()

    store = job.fileStore
    length = fileEnd - fileStart
    assert length >= 0

    if length <= N:
        # Small enough: sort this piece of the file directly.
        store.logToMaster("Sorting range (%i..%i) of file: %s"
                          % (fileStart, fileEnd, inputFile))
        with store.updateGlobalFileStream(outputFileStoreID) as sink:
            copySubRangeOfFile(inputFile, fileStart, fileEnd, sink)
        # Sort a local copy, then push it back over the global file.
        sortedCopy = store.readGlobalFile(outputFileStoreID)
        sort(sortedCopy)
        store.updateGlobalFile(outputFileStoreID, sortedCopy)
        return outputFileStoreID

    store.logToMaster("Splitting range (%i..%i) of file: %s"
                      % (fileStart, fileEnd, inputFile))
    midPoint = getMidPoint(inputFile, fileStart, fileEnd)
    assert midPoint >= fileStart
    assert midPoint+1 < fileEnd

    # Subdivide the range; the second half starts at midPoint+1 (add one to
    # avoid the newline).
    halves = ((fileStart, midPoint+1), (midPoint+1, fileEnd))
    # Allocate both output locations before creating any child job.
    halfStoreIDs = [store.getEmptyFileStoreID() for _ in halves]

    # The use of rv here is for testing purposes: rv(0) of each child is the
    # file store ID that child was given as its output.
    childResults = []
    for (start, end), halfStoreID in zip(halves, halfStoreIDs):
        child = job.addChildJobFn(down, inputFile, start, end, N, halfStoreID)
        childResults.append(child.rv(0))

    job.addFollowOnJobFn(up, childResults[0], childResults[1], outputFileStoreID)
    return outputFileStoreID