示例#1
0
def expectationMaximisation(target, sequences, alignments, outputModel, options):
    """Prepare the inputs for EM training and schedule the first iteration.

    Builds the starting HMM (loaded from options.inputModel when that is set,
    otherwise created from options.modelType with random or equalised
    parameters), writes it to outputModel, splits the input cigars into
    per-job files, picks a random sample of those files and registers
    expectationMaximisation2 as the follow-on target.

    Args:
        target: Object providing logToMaster, getGlobalTempDir and
            setFollowOnTargetFn.
        sequences: Passed through unchanged to expectationMaximisation2.
        alignments: Cigar source handed directly to cigarRead.
        outputModel: Destination given to hmm.write for the starting model.
        options: Settings object; reads inputModel, modelType, randomStart,
            setJukesCantorStartingEmissions, maxAlignmentLengthPerJob and
            maxAlignmentLengthToSample.
    """
    #Load the starting model from file, or build a fresh one
    if options.inputModel is not None:
        target.logToMaster("Loading the model from the input file %s" % options.inputModel)
        hmm = Hmm.loadHmm(options.inputModel)
        target.logToMaster("Loaded the model, has type %s" % hmm.modelType)
        hmm.normalise()
    else:
        target.logToMaster("Making model of type %s" % options.modelType)
        hmm = Hmm(options.modelType)
        if options.randomStart: #Make random parameters
            target.logToMaster("Using random starting parameters")
            hmm.randomise()
        else:
            hmm.equalise()
    if options.setJukesCantorStartingEmissions is not None:
        hmm.setEmissionsToJukesCantor(float(options.setJukesCantorStartingEmissions))

    #Write out the first version of the output model
    hmm.write(outputModel)

    #Make a set of split alignment files, stored as (path, alignmentLength) pairs.
    #A file is finished once its accumulated length exceeds
    #options.maxAlignmentLengthPerJob.
    splitAlignmentFiles = []
    alignmentsLength = 0
    currentFile = None
    fH = None
    try:
        for cigar in cigarRead(alignments):
            if fH is None:
                #Files are numbered by how many have been completed so far
                currentFile = os.path.join(target.getGlobalTempDir(), "alignments_%s.cigar" % len(splitAlignmentFiles))
                fH = open(currentFile, 'w')
            #Length of an alignment is the mean of its extents on the two sequences
            alignmentsLength += (abs(cigar.start1 - cigar.end1) + abs(cigar.start2 - cigar.end2))/2.0
            cigarWrite(fH, cigar)
            if alignmentsLength > options.maxAlignmentLengthPerJob:
                fH.close()
                fH = None
                splitAlignmentFiles.append((currentFile, alignmentsLength))
                alignmentsLength = 0
        if fH is not None:
            #The last file never crossed the per-job limit; record it as it stands
            splitAlignmentFiles.append((currentFile, alignmentsLength))
    finally:
        #Make sure the open split file is closed even if reading/writing fails
        if fH is not None:
            fH.close()

    #Sample the alignment files so that we do EM on no more than options.maxAlignmentLengthToSample bases
    random.shuffle(splitAlignmentFiles) #This ensures we don't just take the first N alignments in the provided alignments file
    sampledSplitAlignmentFiles = []
    totalSampledAlignmentLength = 0.0
    for alignmentsFile, fileLength in splitAlignmentFiles:
        totalSampledAlignmentLength += fileLength
        sampledSplitAlignmentFiles.append(alignmentsFile)
        if totalSampledAlignmentLength >= options.maxAlignmentLengthToSample:
            break
    possibleLength = sum(fileLength for _, fileLength in splitAlignmentFiles)
    target.logToMaster("We sampled: %s bases of alignment length and %s alignment files, of a possible %s base and %s files" % \
                       (totalSampledAlignmentLength, len(sampledSplitAlignmentFiles), possibleLength, len(splitAlignmentFiles)))
    splitAlignmentFiles = sampledSplitAlignmentFiles

    #Files to store expectations in, one per sampled alignment file.
    #A real list (not map) so that len() works whatever map returns.
    expectationsFiles = [os.path.join(target.getGlobalTempDir(), "expectation_%i.txt" % i)
                         for i in xrange(len(splitAlignmentFiles))]
    assert len(splitAlignmentFiles) == len(expectationsFiles)

    #NOTE(review): the 0 and [] look like an initial iteration counter and an
    #empty running-results list for expectationMaximisation2 - confirm there.
    target.setFollowOnTargetFn(expectationMaximisation2, args=(sequences, splitAlignmentFiles, outputModel, expectationsFiles, 0, [], options))
示例#2
0
def upconvertCoords(cigarPath, fastaPath, contigNum, outputFile):
    """Convert the coordinates of the given alignment, so that the
    alignment refers to a set of trimmed sequences originating from a
    contig rather than to the contig itself.

    Each alignment whose contig appears in the ranges read from fastaPath has
    its coordinates shifted down by the start of the enclosing range and its
    contig renamed to "<contig>|<rangeStart>". Alignments on contigs that are
    not in the ranges are written out unchanged.

    Args:
        cigarPath: Path of the input cigar file; it is sorted through
            sortCigarByContigAndPos before being read.
        fastaPath: Path of the file handed to getSequenceRanges.
        contigNum: 1 to rewrite the contig2/start2/end2 fields, anything
            else to rewrite contig1/start1/end1.
        outputFile: Open handle the converted alignments are written to.

    Raises:
        RuntimeError: If no range contains the alignment start, or the
            alignment extends past the end of its range.
    """
    with open(fastaPath) as f:
        seqRanges = getSequenceRanges(f)
    validateRanges(seqRanges)
    sortedCigarPath = sortCigarByContigAndPos(cigarPath, contigNum)

    currentContig = None
    currentRangeIdx = None
    currentRange = None
    try:
        with open(sortedCigarPath) as sortedCigarFile:
            for alignment in cigarRead(sortedCigarFile):
                # contig1 and contig2 are reversed in python api!!
                if contigNum == 1:
                    contig = alignment.contig2
                    start, end = alignment.start2, alignment.end2
                else:
                    contig = alignment.contig1
                    start, end = alignment.start1, alignment.end1
                minPos = min(start, end)
                maxPos = max(start, end)
                if contig in seqRanges:
                    ranges = seqRanges[contig]
                    if contig != currentContig:
                        # New contig: restart the scan at its first range.
                        currentContig = contig
                        currentRangeIdx = 0
                        currentRange = ranges[0]
                    # Forward-only scan for the half-open range holding
                    # minPos; presumably relies on the alignments and ranges
                    # both being position-sorted - TODO confirm.
                    while (minPos >= currentRange[1] or minPos < currentRange[0]
                           ) and currentRangeIdx < len(ranges) - 1:
                        currentRangeIdx += 1
                        currentRange = ranges[currentRangeIdx]
                    if not (currentRange[0] <= minPos < currentRange[1]):
                        raise RuntimeError("No trimmed sequence containing alignment "
                                           "on %s:%d-%d" % (contig, minPos, maxPos))
                    # NOTE(review): this tolerates maxPos up to one past the
                    # range end; check against the coordinate convention.
                    if maxPos - 1 > currentRange[1]:
                        raise RuntimeError("alignment on %s:%d-%d crosses "
                                           "trimmed sequence boundary" %\
                                           (contig,
                                            minPos,
                                            maxPos))
                    offset = currentRange[0]
                    trimmedName = contig + ("|%d" % offset)
                    if contigNum == 1:
                        alignment.start2 -= offset
                        alignment.end2 -= offset
                        alignment.contig2 = trimmedName
                    else:
                        alignment.start1 -= offset
                        alignment.end1 -= offset
                        alignment.contig1 = trimmedName
                cigarWrite(outputFile, alignment, False)
    finally:
        # The sorted copy is a scratch file; remove it even when conversion fails.
        os.remove(sortedCigarPath)
示例#3
0
def loadResults(resultsFile):
    """Puts the results in a set.

    Reads every pairwise alignment from the cigar file at resultsFile and
    records each aligned base pair as a (contig, position, contig, position)
    tuple, with the contig that compares smaller placed first. A position
    aligned to itself on the same contig is skipped.

    Args:
        resultsFile: Path of the cigar file to read.

    Returns:
        A tuple (pairsSet, totalHits): the set of aligned pairs and the
        number of alignments read.
    """
    pairsSet = set()
    totalHits = 0
    # 'with' guarantees the handle is closed even if an assertion below fails.
    with open(resultsFile, 'r') as fileHandle:
        for pairwiseAlignment in cigarRead(fileHandle):
            totalHits += 1
            contig1 = pairwiseAlignment.contig1
            contig2 = pairwiseAlignment.contig2
            # These comparisons are constant for the whole alignment.
            contigsInOrder = contig1 <= contig2
            sameContig = contig1 == contig2

            # On the reverse strand walk downwards, starting one below start.
            i = pairwiseAlignment.start1
            s1 = 1
            if not pairwiseAlignment.strand1:
                i -= 1
                s1 = -1

            j = pairwiseAlignment.start2
            s2 = 1
            if not pairwiseAlignment.strand2:
                j -= 1
                s2 = -1

            for operation in pairwiseAlignment.operationList:
                if operation.type == PairwiseAlignment.PAIRWISE_INDEL_X:
                    i += operation.length * s1
                elif operation.type == PairwiseAlignment.PAIRWISE_INDEL_Y:
                    j += operation.length * s2
                else:
                    assert operation.type == PairwiseAlignment.PAIRWISE_MATCH
                    for _ in xrange(operation.length):
                        if contigsInOrder:
                            if not sameContig or i != j:  #Avoid self alignments
                                pairsSet.add((contig1, i, contig2, j))
                        else:
                            pairsSet.add((contig2, j, contig1, i))
                        i += s1
                        j += s2

            # Sanity check: the walk must finish exactly at the recorded ends.
            if pairwiseAlignment.strand1:
                assert i == pairwiseAlignment.end1
            else:
                assert i == pairwiseAlignment.end1 - 1

            if pairwiseAlignment.strand2:
                assert j == pairwiseAlignment.end2
            else:
                assert j == pairwiseAlignment.end2 - 1
    return (pairsSet, totalHits)
示例#4
0
def loadResults(resultsFile):
    """Puts the results in a set.

    Reads every pairwise alignment from the cigar file at resultsFile and
    records each aligned base pair as a (contig, position, contig, position)
    tuple, with the contig that compares smaller placed first. A position
    aligned to itself on the same contig is skipped.

    Args:
        resultsFile: Path of the cigar file to read.

    Returns:
        A tuple (pairsSet, totalHits): the set of aligned pairs and the
        number of alignments read.
    """
    pairsSet = set()
    totalHits = 0
    # 'with' guarantees the handle is closed even if an assertion below fails.
    with open(resultsFile, 'r') as fileHandle:
        for pairwiseAlignment in cigarRead(fileHandle):
            totalHits += 1
            contig1 = pairwiseAlignment.contig1
            contig2 = pairwiseAlignment.contig2
            # These comparisons are constant for the whole alignment.
            contigsInOrder = contig1 <= contig2
            sameContig = contig1 == contig2

            # On the reverse strand walk downwards, starting one below start.
            i = pairwiseAlignment.start1
            s1 = 1
            if not pairwiseAlignment.strand1:
                i -= 1
                s1 = -1

            j = pairwiseAlignment.start2
            s2 = 1
            if not pairwiseAlignment.strand2:
                j -= 1
                s2 = -1

            for operation in pairwiseAlignment.operationList:
                if operation.type == PairwiseAlignment.PAIRWISE_INDEL_X:
                    i += operation.length * s1
                elif operation.type == PairwiseAlignment.PAIRWISE_INDEL_Y:
                    j += operation.length * s2
                else:
                    assert operation.type == PairwiseAlignment.PAIRWISE_MATCH
                    for _ in xrange(operation.length):
                        if contigsInOrder:
                            if not sameContig or i != j: #Avoid self alignments
                                pairsSet.add((contig1, i, contig2, j))
                        else:
                            pairsSet.add((contig2, j, contig1, i))
                        i += s1
                        j += s2

            # Sanity check: the walk must finish exactly at the recorded ends.
            if pairwiseAlignment.strand1:
                assert i == pairwiseAlignment.end1
            else:
                assert i == pairwiseAlignment.end1-1

            if pairwiseAlignment.strand2:
                assert j == pairwiseAlignment.end2
            else:
                assert j == pairwiseAlignment.end2-1
    return (pairsSet, totalHits)
def upconvertCoords(cigarPath, fastaPath, contigNum, outputFile):
    """Convert the coordinates of the given alignment, so that the
    alignment refers to a set of trimmed sequences originating from a
    contig rather than to the contig itself.

    Each alignment whose contig appears in the ranges read from fastaPath has
    its coordinates shifted down by the start of the enclosing range and its
    contig renamed to "<contig>|<rangeStart>". Alignments on contigs that are
    not in the ranges are written out unchanged.

    Args:
        cigarPath: Path of the input cigar file; it is sorted through
            sortCigarByContigAndPos before being read.
        fastaPath: Path of the file handed to getSequenceRanges.
        contigNum: 1 to rewrite the contig2/start2/end2 fields, anything
            else to rewrite contig1/start1/end1.
        outputFile: Open handle the converted alignments are written to.

    Raises:
        RuntimeError: If no range contains the alignment start, or the
            alignment extends past the end of its range.
    """
    with open(fastaPath) as f:
        seqRanges = getSequenceRanges(f)
    validateRanges(seqRanges)
    sortedCigarPath = sortCigarByContigAndPos(cigarPath, contigNum)

    currentContig = None
    currentRangeIdx = None
    currentRange = None
    try:
        with open(sortedCigarPath) as sortedCigarFile:
            for alignment in cigarRead(sortedCigarFile):
                # contig1 and contig2 are reversed in python api!!
                if contigNum == 1:
                    contig = alignment.contig2
                    start, end = alignment.start2, alignment.end2
                else:
                    contig = alignment.contig1
                    start, end = alignment.start1, alignment.end1
                minPos = min(start, end)
                maxPos = max(start, end)
                if contig in seqRanges:
                    ranges = seqRanges[contig]
                    if contig != currentContig:
                        # New contig: restart the scan at its first range.
                        currentContig = contig
                        currentRangeIdx = 0
                        currentRange = ranges[0]
                    # Forward-only scan for the half-open range holding
                    # minPos; presumably relies on the alignments and ranges
                    # both being position-sorted - TODO confirm.
                    while (minPos >= currentRange[1] or minPos < currentRange[0]) and currentRangeIdx < len(ranges) - 1:
                        currentRangeIdx += 1
                        currentRange = ranges[currentRangeIdx]
                    if not (currentRange[0] <= minPos < currentRange[1]):
                        raise RuntimeError("No trimmed sequence containing alignment "
                                           "on %s:%d-%d" % (contig,
                                                            minPos,
                                                            maxPos))
                    # NOTE(review): this tolerates maxPos up to one past the
                    # range end; check against the coordinate convention.
                    if maxPos - 1 > currentRange[1]:
                        raise RuntimeError("alignment on %s:%d-%d crosses "
                                           "trimmed sequence boundary" %\
                                           (contig,
                                            minPos,
                                            maxPos))
                    offset = currentRange[0]
                    trimmedName = contig + ("|%d" % offset)
                    if contigNum == 1:
                        alignment.start2 -= offset
                        alignment.end2 -= offset
                        alignment.contig2 = trimmedName
                    else:
                        alignment.start1 -= offset
                        alignment.end1 -= offset
                        alignment.contig1 = trimmedName
                cigarWrite(outputFile, alignment, False)
    finally:
        # The sorted copy is a scratch file; remove it even when conversion fails.
        os.remove(sortedCigarPath)
示例#6
0
 def testCigarReadWrite(self):
     """Round-trip random pairwise alignments through cigarWrite and cigarRead.

     For each of self.testNo rounds, writes up to nine random alignments to a
     temp file, reads them back and asserts they compare equal, in order, to
     what was written. Each expected/actual pair is also echoed to stdout.
     """
     tempFile = getTempFile()
     self.tempFiles.append(tempFile)
     for _ in range(0, self.testNo):
         cigarNumber = random.choice(range(10))
         expected = [getRandomPairwiseAlignment() for i in range(cigarNumber)]
         # 'with' closes the handle even when an assertion fails mid-round.
         with open(tempFile, 'w') as fileHandle:
             for cigar in expected:
                 cigarWrite(fileHandle, cigar)
         # Reverse so pop() hands back the alignments in written order.
         expected.reverse()
         with open(tempFile, 'r') as fileHandle:
             for cigar in cigarRead(fileHandle):
                 cigarWrite(sys.stdout, expected[-1])
                 cigarWrite(sys.stdout, cigar)
                 assert cigar == expected.pop()
         # Every written alignment must have been read back.
         assert len(expected) == 0
示例#7
0
    def testCigarReadWrite(self):
        """Tests the C code for reading and writing cigars against the python parser for cigars.

        For each of self.testNo rounds, writes up to nine random alignments
        to a temp file, runs the external sonLib_cigarTest command on that
        file, then reads the file back and asserts the alignments equal the
        ones originally written, in order. In rounds where scores are not
        kept, every operation score is zeroed before writing.
        """
        tempFile = getTempFile()
        self.tempFiles.append(tempFile)
        for _ in range(0, self.testNo):
            pairwiseAlignmentNumber = random.choice(range(10))
            expected = [
                getRandomPairwiseAlignment()
                for i in range(pairwiseAlignmentNumber)
            ]

            # Randomly decide whether scores are written; when they are not,
            # zero them so the in-memory copies match what is read back.
            keepProbs = random.random() > 0.5
            if not keepProbs:
                for pA in expected:
                    for op in pA.operationList:
                        op.score = 0.0

            # 'with' closes the handle even if writing raises.
            with open(tempFile, 'w') as fileHandle:
                for pairwiseAlignment in expected:
                    cigarWrite(fileHandle, pairwiseAlignment, keepProbs)

            #Now call sonLib_cigarsTest and read and write chains
            command = "sonLib_cigarTest %s %s" % (tempFile, keepProbs)
            system(command)

            #Now check the chain is okay
            # Reverse so pop() hands back the alignments in written order.
            expected.reverse()
            with open(tempFile, 'r') as fileHandle:
                for pairwiseAlignment in cigarRead(fileHandle):
                    pairwiseAlignment2 = expected.pop()
                    cigarWrite(sys.stdout, pairwiseAlignment, keepProbs)
                    cigarWrite(sys.stdout, pairwiseAlignment2, keepProbs)
                    assert pairwiseAlignment == pairwiseAlignment2
            # Every written alignment must have been read back.
            assert len(expected) == 0
示例#8
0
def expectationMaximisation(target, sequences, alignments, outputModel,
                            options):
    """Prepare the inputs for EM training and schedule the first iteration.

    Builds the starting HMM (loaded from options.inputModel when that is set,
    otherwise created from options.modelType with random or equalised
    parameters), writes it to outputModel, splits the input cigars into
    per-job files, picks a random sample of those files and registers
    expectationMaximisation2 as the follow-on target.

    Args:
        target: Object providing logToMaster, getGlobalTempDir and
            setFollowOnTargetFn.
        sequences: Passed through unchanged to expectationMaximisation2.
        alignments: Cigar source handed directly to cigarRead.
        outputModel: Destination given to hmm.write for the starting model.
        options: Settings object; reads inputModel, modelType, randomStart,
            setJukesCantorStartingEmissions, maxAlignmentLengthPerJob and
            maxAlignmentLengthToSample.
    """
    #Load the starting model from file, or build a fresh one
    if options.inputModel is not None:
        target.logToMaster("Loading the model from the input file %s" %
                           options.inputModel)
        hmm = Hmm.loadHmm(options.inputModel)
        target.logToMaster("Loaded the model, has type %s" % hmm.modelType)
        hmm.normalise()
    else:
        target.logToMaster("Making model of type %s" % options.modelType)
        hmm = Hmm(options.modelType)
        if options.randomStart:  #Make random parameters
            target.logToMaster("Using random starting parameters")
            hmm.randomise()
        else:
            hmm.equalise()
    if options.setJukesCantorStartingEmissions is not None:
        hmm.setEmissionsToJukesCantor(
            float(options.setJukesCantorStartingEmissions))

    #Write out the first version of the output model
    hmm.write(outputModel)

    #Make a set of split alignment files, stored as (path, alignmentLength) pairs.
    #A file is finished once its accumulated length exceeds
    #options.maxAlignmentLengthPerJob.
    splitAlignmentFiles = []
    alignmentsLength = 0
    currentFile = None
    fH = None
    try:
        for cigar in cigarRead(alignments):
            if fH is None:
                #Files are numbered by how many have been completed so far
                currentFile = os.path.join(
                    target.getGlobalTempDir(),
                    "alignments_%s.cigar" % len(splitAlignmentFiles))
                fH = open(currentFile, 'w')
            #Length of an alignment is the mean of its extents on the two sequences
            alignmentsLength += (abs(cigar.start1 - cigar.end1) +
                                 abs(cigar.start2 - cigar.end2)) / 2.0
            cigarWrite(fH, cigar)
            if alignmentsLength > options.maxAlignmentLengthPerJob:
                fH.close()
                fH = None
                splitAlignmentFiles.append((currentFile, alignmentsLength))
                alignmentsLength = 0
        if fH is not None:
            #The last file never crossed the per-job limit; record it as it stands
            splitAlignmentFiles.append((currentFile, alignmentsLength))
    finally:
        #Make sure the open split file is closed even if reading/writing fails
        if fH is not None:
            fH.close()

    #Sample the alignment files so that we do EM on no more than options.maxAlignmentLengthToSample bases
    #Shuffling ensures we don't just take the first N alignments in the provided alignments file
    random.shuffle(splitAlignmentFiles)
    sampledSplitAlignmentFiles = []
    totalSampledAlignmentLength = 0.0
    for alignmentsFile, fileLength in splitAlignmentFiles:
        totalSampledAlignmentLength += fileLength
        sampledSplitAlignmentFiles.append(alignmentsFile)
        if totalSampledAlignmentLength >= options.maxAlignmentLengthToSample:
            break
    possibleLength = sum(fileLength for _, fileLength in splitAlignmentFiles)
    target.logToMaster("We sampled: %s bases of alignment length and %s alignment files, of a possible %s base and %s files" % \
                       (totalSampledAlignmentLength, len(sampledSplitAlignmentFiles), possibleLength, len(splitAlignmentFiles)))
    splitAlignmentFiles = sampledSplitAlignmentFiles

    #Files to store expectations in, one per sampled alignment file.
    #A real list (not map) so that len() works whatever map returns.
    expectationsFiles = [
        os.path.join(target.getGlobalTempDir(), "expectation_%i.txt" % i)
        for i in xrange(len(splitAlignmentFiles))
    ]
    assert len(splitAlignmentFiles) == len(expectationsFiles)

    #NOTE(review): the 0 and [] look like an initial iteration counter and an
    #empty running-results list for expectationMaximisation2 - confirm there.
    target.setFollowOnTargetFn(expectationMaximisation2,
                               args=(sequences, splitAlignmentFiles,
                                     outputModel, expectationsFiles, 0, [],
                                     options))