Exemplo n.º 1
0
def learnModelFromSamFileTargetFn(target, samFile, readFastqFile, 
                                  referenceFastaFile, options):
    """Does expectation maximisation on sam file to learn the hmm for the sam file.

    Writes one exonerate-format cigar line and one fasta record per alignment
    of samFile into "temp.cigar" and "temp.fa" in the target's global temp
    dir, schedules cPecanEm.expectationMaximisationTrials on those files as a
    child target and sets learnModelFromSamFileTargetFn2 as the follow-on.

    target -- target used to obtain the global temp dir and to schedule the
        child and follow-on targets.
    samFile -- path of the SAM file, opened with pysam.Samfile(samFile, "r").
    readFastqFile -- not used by this function.
    referenceFastaFile -- path handed to the child target, joined to the
        reads file path by a single space.
    options -- passed unchanged to the child and the follow-on target.
    """
    #Get cigars and reads fasta file
    cigars = os.path.join(target.getGlobalTempDir(), "temp.cigar")
    reads = os.path.join(target.getGlobalTempDir(), "temp.fa")
    with open(cigars, 'w') as fHCigars, open(reads, 'w') as fHReads:
        sam = pysam.Samfile(samFile, "r")
        try:
            #enumerate streams the alignments; zipping against
            #xrange(sys.maxint) would first build a list of every record
            for counter, aR in enumerate(sam):
                #Append the record index to the read name (presumably so that
                #reads with several alignments get distinct names)
                aR.query_name = aR.query_name + "_%s" % counter
                fHCigars.write(getExonerateCigarFormatString(aR, sam) + "\n")
                fastaWrite(fHReads, aR.query_name, aR.seq)
        finally:
            sam.close()

    unnormalisedOutputModel = os.path.join(target.getGlobalTempDir(),
                                           "unnormalisedOutputModel.hmm")
    target.addChildTargetFn(cPecanEm.expectationMaximisationTrials,
                            args=(" ".join([reads, referenceFastaFile]), cigars,
                                  unnormalisedOutputModel, options))

    #Now set up normalisation
    target.setFollowOnTargetFn(learnModelFromSamFileTargetFn2,
                               args=(unnormalisedOutputModel, options))
Exemplo n.º 2
0
def paralleliseSamProcessingTargetFn(target, samFile, referenceFastaFile,
                                     outputFile, childTargetFn,
                                     followOnTargetFn, options):
    """Parallelise a computation over the alignments in a SAM file.

    The alignments are written, in file order, into consecutive batches. A
    new batch is started whenever the reference sequence name differs from
    the previous alignment's or the summed query length of the current batch
    exceeds options.maxAlignmentLengthPerJob. Each batch consists of an
    exonerate cigar file and a matching query fasta file in the target's
    global temp dir, and gets one child target.

    target -- target used for temp dirs and for scheduling.
    samFile -- path of the SAM file to split.
    referenceFastaFile -- fasta file loaded with getFastaDictionary to look
        up the reference sequence of each batch.
    outputFile -- passed through to followOnTargetFn.
    childTargetFn -- scheduled once per batch with args (cigar file,
        reference name, reference sequence, query fasta file, output file,
        options).
    followOnTargetFn -- set as follow-on with args (samFile,
        referenceFastaFile, outputFile, list of child output files, options).
    options -- must provide maxAlignmentLengthPerJob; passed on unchanged.
    """
    #Load reference sequences
    refSequences = getFastaDictionary(
        referenceFastaFile)  #Hash of names to sequences

    tempOutputFiles = []
    #sys.maxint forces a new batch to be opened for the first alignment
    childCount, totalSeqLength = 0, sys.maxint
    tempExonerateFile, tempQueryFile = None, None
    tempExonerateFileHandle, tempQueryFileHandle = None, None
    refName = None

    def makeChild():
        #Add a child target to do the processing of a subset of the lines.
        #Does nothing until the first batch has been opened.
        if tempExonerateFile is not None:
            tempExonerateFileHandle.close()
            tempQueryFileHandle.close()
            #Temporary cigar file to store the realignment. childCount has
            #already been incremented for this batch, so tempOutput_N pairs
            #with tempExonerateCigar_(N-1).
            tempOutputFiles.append(
                os.path.join(target.getGlobalTempDir(),
                             "tempOutput_%i.txt" % childCount))
            target.addChildTargetFn(childTargetFn,
                                    args=(tempExonerateFile, refName,
                                          refSequences[refName], tempQueryFile,
                                          tempOutputFiles[-1], options))

    #Read through the SAM file
    sam = pysam.Samfile(samFile, "r")
    try:
        #Iterate directly so alignments are streamed instead of being
        #collected into a list by zip()
        for aR in samIterator(sam):
            alignmentRefName = sam.getrname(aR.reference_id)
            if totalSeqLength > options.maxAlignmentLengthPerJob or \
            refName != alignmentRefName:
                #Flush the previous batch (if any) and open a new one
                makeChild()
                tempExonerateFile = os.path.join(
                    target.getGlobalTempDir(),
                    "tempExonerateCigar_%s.cig" % childCount)
                tempExonerateFileHandle = open(tempExonerateFile, 'w')
                tempQueryFile = os.path.join(
                    target.getGlobalTempDir(),
                    "tempQueryCigar_%s.fa" % childCount)
                tempQueryFileHandle = open(tempQueryFile, 'w')
                childCount += 1
                totalSeqLength = 0

            tempExonerateFileHandle.write(
                getExonerateCigarFormatString(aR, sam) + "\n")
            #This is the query sequence, including soft clipped bases, but
            #excluding hard clip bases
            fastaWrite(tempQueryFileHandle, aR.query_name, aR.query_sequence)
            totalSeqLength += len(aR.query_sequence)
            refName = alignmentRefName

        #Flush the last batch
        makeChild()
        target.setFollowOnTargetFn(
            followOnTargetFn,
            args=(samFile, referenceFastaFile, outputFile, tempOutputFiles,
                  options))
    finally:
        #Finish up
        sam.close()
Exemplo n.º 3
0
def realignCigarTargetFn(target, exonerateCigarStringFile, referenceSequenceName, 
                         referenceSequence, querySequenceFile, 
                         outputCigarFile, options):
    """Runs cPecanRealign on every alignment of a batch.

    Lines of exonerateCigarStringFile are paired, in order, with the records
    of querySequenceFile. For each pair the read is written to a local temp
    fasta file and cPecanRealign is run on the cigar line, appending its
    output to outputCigarFile. If the command raises, the failure is logged
    with target.logToMaster and processing continues with the next pair.

    target -- target used for the local temp dir and for logging.
    exonerateCigarStringFile -- path of a file with one cigar string per line.
    referenceSequenceName, referenceSequence -- written once to a local temp
        fasta file used as reference for every alignment.
    querySequenceFile -- fasta file read with fastaRead.
    outputCigarFile -- file the command output is appended to (">>").
    options -- must provide hmmFile, gapGamma and matchGamma.
    """
    #Temporary files
    tempRefFile = os.path.join(target.getLocalTempDir(), "ref.fa")
    tempReadFile = os.path.join(target.getLocalTempDir(), "read.fa")

    #Write the temporary reference file.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence)

    #The hmm argument does not depend on the alignment, so build it once
    loadHmm = nameValue("loadHmm", options.hmmFile)

    #For each cigar string
    with open(exonerateCigarStringFile, "r") as cigarFileHandle:
        for exonerateCigarString, (querySequenceName, querySequence) in \
        zip(cigarFileHandle, fastaRead(querySequenceFile)):
            fastaWrite(tempReadFile, querySequenceName, querySequence)
            cigarString = exonerateCigarString[:-1]  #Drop the trailing newline
            #Call to cPecanRealign. Adjacent literals are used so that no
            #source indentation ends up inside the command string.
            command = ("echo %s | cPecanRealign %s %s --diagonalExpansion=10 "
                       "--splitMatrixBiggerThanThis=3000 %s --gapGamma=%s "
                       "--matchGamma=%s >> %s" %
                       (cigarString, tempRefFile, tempReadFile, loadHmm,
                        options.gapGamma, options.matchGamma, outputCigarFile))
            try:
                system(command)
            except Exception as e:
                #Best effort: report the failing alignment and move on
                target.logToMaster('Caught an exception! qname = "%s"\n' % querySequenceName)
                target.logToMaster('len(exonerateCigarString[:-1]) = %d\n' % len(cigarString))
                target.logToMaster('[bad] Command that caused the exception:\n')
                target.logToMaster(command)
                target.logToMaster('\n')
                target.logToMaster('\n')
                target.logToMaster(str(e) + '\n')
                target.logToMaster('\n')
Exemplo n.º 4
0
def learnModelFromSamFileTargetFn(target, samFile, readFastqFile, 
                                  referenceFastaFile, options):
    """Does expectation maximisation on sam file to learn the hmm for the sam file.

    For every alignment in samFile an exonerate-format cigar line is written
    to "temp.cigar" and a fasta record to "temp.fa", both in the target's
    global temp dir. cPecanEm.expectationMaximisationTrials is then scheduled
    on those files as a child target, with learnModelFromSamFileTargetFn2 as
    the follow-on.

    target -- target providing the global temp dir and the scheduling calls.
    samFile -- path of the SAM file, opened with pysam.Samfile(samFile, "r").
    readFastqFile -- not used by this function.
    referenceFastaFile -- path given to the child target, appended to the
        reads file path after a single space.
    options -- forwarded unchanged to the child and follow-on targets.
    """
    #Get cigars and reads fasta file
    globalTempDir = target.getGlobalTempDir()
    cigars = os.path.join(globalTempDir, "temp.cigar")
    reads = os.path.join(globalTempDir, "temp.fa")
    with open(cigars, 'w') as fHCigars, open(reads, 'w') as fHReads:
        sam = pysam.Samfile(samFile, "r")
        try:
            #Stream the records; zip(sam, xrange(sys.maxint)) would collect
            #them all into a list before the loop starts
            for counter, aR in enumerate(sam):
                #Tag the read name with the record index (presumably to keep
                #names distinct across multiple alignments of one read)
                aR.query_name = aR.query_name + "_%s" % counter
                fHCigars.write(getExonerateCigarFormatString(aR, sam) + "\n")
                fastaWrite(fHReads, aR.query_name, aR.seq)
        finally:
            sam.close()

    unnormalisedOutputModel = os.path.join(target.getGlobalTempDir(),
                                           "unnormalisedOutputModel.hmm")
    target.addChildTargetFn(cPecanEm.expectationMaximisationTrials,
                            args=(" ".join([reads, referenceFastaFile]), cigars,
                                  unnormalisedOutputModel, options))

    #Now set up normalisation
    target.setFollowOnTargetFn(learnModelFromSamFileTargetFn2,
                               args=(unnormalisedOutputModel, options))
Exemplo n.º 5
0
    def run(self):
        """Rescores every alignment of self.samFile and reports the posterior match probabilities.

        Each alignment is passed through cactus_realign with
        --rescoreByPosteriorProbIgnoringGaps --rescoreOriginalAlignment; the
        score of the single resulting cigar and its number of aligned pairs
        are collected. The results are written to "alignmentUncertainty.xml"
        in self.outputDir and, if at least one alignment was processed, a
        histogram is produced with nanopore/analyses/match_hist.R.

        Reads self.referenceFastaFile, self.samFile and self.outputDir.
        """
        AbstractAnalysis.run(self) #Call base method to do some logging
        refSequences = getFastaDictionary(self.referenceFastaFile) #Hash of names to sequences

        #Temporary files, overwritten for every alignment
        localTempDir = self.getLocalTempDir()
        tempCigarFile = os.path.join(localTempDir, "rescoredCigar.cig")
        tempRefFile = os.path.join(localTempDir, "ref.fa")
        tempReadFile = os.path.join(localTempDir, "read.fa")
        tempPosteriorProbsFile = os.path.join(localTempDir, "probs.tsv")

        #Trained hmm file to use.
        hmmFile = os.path.join(pathToBaseNanoporeDir(), "nanopore", "mappers", "blasr_hmm_0.txt")

        #The data we collect
        avgPosteriorMatchProbabilityInCigar = []
        alignedPairsInCigar = []

        sam = pysam.Samfile(self.samFile, "r" )
        try:
            for aR in samIterator(sam): #Iterate on the sam lines
                #Exonerate format Cigar string
                cigarString = getExonerateCigarFormatString(aR, sam)

                #Write the temporary files.
                refName = sam.getrname(aR.rname)
                fastaWrite(tempRefFile, refName, refSequences[refName])
                fastaWrite(tempReadFile, aR.qname, aR.query)

                #Call to cactus_realign
                system("echo %s | cactus_realign %s %s --rescoreByPosteriorProbIgnoringGaps --rescoreOriginalAlignment "
                       "--diagonalExpansion=10 --splitMatrixBiggerThanThis=100 --outputPosteriorProbs=%s --loadHmm=%s > %s" % \
                       (cigarString, tempRefFile, tempReadFile, tempPosteriorProbsFile, hmmFile, tempCigarFile))

                #Load the cigar once and get the posterior prob; exactly one
                #rescored alignment is expected per input alignment
                with open(tempCigarFile) as cigarFileHandle:
                    rescoredAlignments = list(cigarRead(cigarFileHandle))
                assert len(rescoredAlignments) == 1
                pA = rescoredAlignments[0]
                avgPosteriorMatchProbabilityInCigar.append(pA.score)

                #Calculate the number of aligned pairs in the cigar
                alignedPairsInCigar.append(sum([ op.length for op in pA.operationList if op.type == PairwiseAlignment.PAIRWISE_MATCH ]))
                #Cross-check against the aligned pairs reported by pysam
                assert alignedPairsInCigar[-1] == len([ readPos for readPos, refPos in aR.aligned_pairs if readPos is not None and refPos is not None ])
        finally:
            sam.close()

        #Write out the substitution info
        node = ET.Element("alignmentUncertainty", { 
                "averagePosteriorMatchProbabilityPerRead":str(self.formatRatio(sum(avgPosteriorMatchProbabilityInCigar), len(avgPosteriorMatchProbabilityInCigar))),
                #Per-read averages weighted by the number of aligned pairs of each read
                "averagePosteriorMatchProbability":str(self.formatRatio(float(sum([ avgMatchProb*alignedPairs for avgMatchProb, alignedPairs in zip(avgPosteriorMatchProbabilityInCigar, alignedPairsInCigar) ])),sum(alignedPairsInCigar))),
                "averagePosteriorMatchProbabilitesPerRead":",".join([ str(i) for i in avgPosteriorMatchProbabilityInCigar ]), 
                "alignedPairsInCigar":",".join([ str(i) for i in alignedPairsInCigar ]) })
        with open(os.path.join(self.outputDir, "alignmentUncertainty.xml"), "w") as xmlFileHandle:
            xmlFileHandle.write(prettyXml(node))

        if avgPosteriorMatchProbabilityInCigar:
            #Dump the per-read values as one tab separated line for the R script
            uncertaintyFile = os.path.join(self.getLocalTempDir(), "tmp_uncertainty")
            with open(uncertaintyFile, "w") as outf:
                outf.write("\t".join([ str(i) for i in avgPosteriorMatchProbabilityInCigar ]))
                outf.write("\n")
            system("Rscript nanopore/analyses/match_hist.R {} {}".format(uncertaintyFile, os.path.join(self.outputDir, "posterior_prob_hist.pdf")))
        #Indicate everything is all done
        self.finish()
Exemplo n.º 6
0
def posteriorProbabilityCalculationTargetFn(target, exonerateCigarStringFile,
                                            referenceSequenceName,
                                            referenceSequence,
                                            querySequenceFile,
                                            outputPosteriorProbsFile, options):
    """Calculates the posterior probabilities of matches in a set of pairwise
    alignments between a reference sequence and a set of reads. 

    Lines of exonerateCigarStringFile are paired, in order, with the records
    of querySequenceFile. For each pair cPecanRealign is run and its output
    file, parsed as three whitespace separated numbers per line (reference
    position, query position, posterior probability), is accumulated into a
    dictionary mapping (referenceSequenceName, reference position) to a
    dictionary of per-base expectations keyed by the entries of BASES. The
    dictionary is pickled to outputPosteriorProbsFile.

    target -- target used for the local temp dir.
    options -- must provide noMargin and, when noMargin is false,
        alignmentModel (passed as --loadHmm). With noMargin each aligned
        pair contributes 1.0 instead of its posterior probability.
    """
    #Temporary files
    tempRefFile = os.path.join(target.getLocalTempDir(), "ref.fa")
    tempReadFile = os.path.join(target.getLocalTempDir(), "read.fa")
    tempPosteriorProbsFile = os.path.join(target.getLocalTempDir(),
                                          "posteriorProbs.txt")

    #Write the temporary reference file.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence)

    #Hash to store posterior probabilities in
    expectationsOfBasesAtEachPosition = {}

    #For each cigar string
    with open(exonerateCigarStringFile, "r") as cigarFileHandle:
        for exonerateCigarString, (querySequenceName, querySequence) in \
        zip(cigarFileHandle, fastaRead(querySequenceFile)):
            fastaWrite(tempReadFile, querySequenceName, querySequence)
            cigarString = exonerateCigarString[:-1]  #Drop the trailing newline
            #Call to cPecanRealign. Adjacent literals keep source indentation
            #out of the command string.
            if options.noMargin:
                #When we don't marginalise we just run cPecanRealign to get
                #the list of aligned pairs. This runtime should be very fast
                system("echo %s | cPecanRealign %s %s --diagonalExpansion=0 "
                       "--splitMatrixBiggerThanThis=1 --rescoreOriginalAlignment "
                       "--outputPosteriorProbs=%s" %
                       (cigarString, tempRefFile, tempReadFile,
                        tempPosteriorProbsFile))
            else:
                system("echo %s | cPecanRealign %s %s --diagonalExpansion=10 "
                       "--splitMatrixBiggerThanThis=100 "
                       "--outputAllPosteriorProbs=%s --loadHmm=%s" %
                       (cigarString, tempRefFile, tempReadFile,
                        tempPosteriorProbsFile, options.alignmentModel))

            #Now collate the reference position expectations
            with open(tempPosteriorProbsFile, 'r') as probsFileHandle:
                for line in probsFileHandle:
                    refPosition, queryPosition, posteriorProb = \
                        map(float, line.split())
                    #Upper bound is 1.01 rather than 1.0 (presumably to
                    #tolerate rounding in the tool's output)
                    assert posteriorProb <= 1.01
                    assert posteriorProb >= 0.0
                    key = (referenceSequenceName, int(refPosition))
                    baseExpectations = expectationsOfBasesAtEachPosition.get(key)
                    if baseExpectations is None:
                        baseExpectations = dict.fromkeys(BASES, 0.0)
                        expectationsOfBasesAtEachPosition[key] = baseExpectations
                    queryBase = querySequence[int(queryPosition)].upper()
                    #Could be an N or other wildcard character, which we ignore
                    if queryBase in BASES:
                        baseExpectations[queryBase] += \
                            1.0 if options.noMargin else posteriorProb

    #Pickle the posterior probs. HIGHEST_PROTOCOL is a binary format, so the
    #file has to be opened in binary mode.
    with open(outputPosteriorProbsFile, 'wb') as fileHandle:
        cPickle.dump(expectationsOfBasesAtEachPosition, fileHandle,
                     cPickle.HIGHEST_PROTOCOL)
Exemplo n.º 7
0
def paralleliseSamProcessingTargetFn(target, samFile,
                            referenceFastaFile, outputFile,
                            childTargetFn, followOnTargetFn, options):
    """Parallelise a computation over the alignments in a SAM file.

    Alignments are grouped, in file order, into batches. A batch ends when
    the next alignment maps to a different reference sequence or when the
    summed query length of the batch exceeds
    options.maxAlignmentLengthPerJob. For each batch an exonerate cigar file
    and a query fasta file are written to the target's global temp dir and
    one child target is scheduled.

    target -- target used for temp dirs and for scheduling.
    samFile -- path of the SAM file to split.
    referenceFastaFile -- fasta file loaded with getFastaDictionary; provides
        the reference sequence handed to each child.
    outputFile -- forwarded to followOnTargetFn.
    childTargetFn -- scheduled per batch with args (cigar file, reference
        name, reference sequence, query fasta file, output file, options).
    followOnTargetFn -- set as follow-on with args (samFile,
        referenceFastaFile, outputFile, list of child output files, options).
    options -- must provide maxAlignmentLengthPerJob; forwarded unchanged.
    """
    #Load reference sequences
    refSequences = getFastaDictionary(referenceFastaFile) #Hash of names to sequences

    tempOutputFiles = []
    #Starting at sys.maxint makes the first alignment open the first batch
    childCount, totalSeqLength = 0, sys.maxint
    tempExonerateFile, tempQueryFile = None, None
    tempExonerateFileHandle, tempQueryFileHandle = None, None
    refName = None

    def makeChild():
        #Add a child target to do the processing of a subset of the lines.
        #A no-op while no batch has been opened yet.
        if tempExonerateFile is not None:
            tempExonerateFileHandle.close()
            tempQueryFileHandle.close()
            #Temporary cigar file to store the realignment. childCount was
            #incremented when the batch was opened, so the output file index
            #is one higher than that of the batch's input files.
            tempOutputFiles.append(os.path.join(target.getGlobalTempDir(),
                                               "tempOutput_%i.txt" % childCount))
            target.addChildTargetFn(childTargetFn,
                                    args=(tempExonerateFile, refName,
                                          refSequences[refName],
                                          tempQueryFile, tempOutputFiles[-1], options))

    #Read through the SAM file
    sam = pysam.Samfile(samFile, "r" )
    try:
        #Plain iteration streams the alignments; zip() against
        #xrange(sys.maxint) would hold them all in a list
        for aR in samIterator(sam):
            currentRefName = sam.getrname(aR.reference_id)
            if totalSeqLength > options.maxAlignmentLengthPerJob or \
            refName != currentRefName:
                #Close off the previous batch (if any) and start the next
                makeChild()
                tempExonerateFile = os.path.join(target.getGlobalTempDir(),
                                                 "tempExonerateCigar_%s.cig" % childCount)
                tempExonerateFileHandle = open(tempExonerateFile, 'w')
                tempQueryFile = os.path.join(target.getGlobalTempDir(),
                                             "tempQueryCigar_%s.fa" % childCount)
                tempQueryFileHandle = open(tempQueryFile, 'w')
                childCount += 1
                totalSeqLength = 0

            tempExonerateFileHandle.write(getExonerateCigarFormatString(aR, sam) + "\n")
            #This is the query sequence, including soft clipped bases, but excluding hard clip bases
            fastaWrite(tempQueryFileHandle, aR.query_name, aR.query_sequence)
            totalSeqLength += len(aR.query_sequence)
            refName = currentRefName

        #Schedule the final batch
        makeChild()
        target.setFollowOnTargetFn(followOnTargetFn,
                                   args=(samFile, referenceFastaFile,
                                         outputFile, tempOutputFiles, options))
    finally:
        #Finish up
        sam.close()
Exemplo n.º 8
0
def makeFastaSequenceNamesUnique(inputFastaFile, outputFastaFile):
    """Makes a fasta file with unique names

    Copies every record of inputFastaFile to outputFastaFile. When a name has
    already been written, "i" is appended to it until it is unused; every
    clash is logged with logger.critical.

    inputFastaFile -- path of the fasta file to read.
    outputFastaFile -- path of the fasta file to write (overwritten).

    Returns outputFastaFile.
    """
    names = set()
    #Context managers close both files even if reading or writing fails
    with open(inputFastaFile, 'r') as inputHandle:
        with open(outputFastaFile, 'w') as outputHandle:
            for name, seq in fastaRead(inputHandle):
                while name in names:
                    logger.critical("Got a duplicate fasta sequence name: %s" % name)
                    name += "i"
                names.add(name)
                fastaWrite(outputHandle, name, seq)
    return outputFastaFile
Exemplo n.º 9
0
def makeFastaSequenceNamesUnique(inputFastaFile, outputFastaFile):
    """Makes a fasta file with unique names

    Every record of inputFastaFile is written to outputFastaFile. A record
    whose name was already used gets "i" appended repeatedly until the name
    is new; each collision is reported through logger.critical.

    inputFastaFile -- path of the fasta file to read.
    outputFastaFile -- path of the fasta file to create or overwrite.

    Returns outputFastaFile.
    """
    seenNames = set()
    #Both handles are closed on exit, including when an error is raised
    with open(inputFastaFile, 'r') as inputHandle:
        with open(outputFastaFile, 'w') as outputHandle:
            for name, seq in fastaRead(inputHandle):
                while name in seenNames:
                    logger.critical("Got a duplicate fasta sequence name: %s" % name)
                    name += "i"
                seenNames.add(name)
                fastaWrite(outputHandle, name, seq)
    return outputFastaFile
Exemplo n.º 10
0
def realignCigarTargetFn(target, exonerateCigarString, referenceSequenceName, referenceSequence, querySequenceName, querySequence, outputCigarFile, hmmFile, gapGamma, matchGamma):
    """Realigns one pairwise alignment with cactus_realign.

    The reference and the query are written to "ref.fa" and "read.fa" in the
    target's global temp dir, exonerateCigarString is piped into
    cactus_realign and the output is written to outputCigarFile, which is
    then checked to contain exactly one alignment.

    target -- target used for the global temp dir.
    exonerateCigarString -- cigar line echoed into cactus_realign.
    hmmFile -- turned into the hmm argument with nameValue("loadHmm", ...).
    gapGamma, matchGamma -- passed as --gapGamma / --matchGamma.
    """
    #Temporary files
    tempRefFile = os.path.join(target.getGlobalTempDir(), "ref.fa")
    tempReadFile = os.path.join(target.getGlobalTempDir(), "read.fa")

    #Write the temporary files.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence)
    fastaWrite(tempReadFile, querySequenceName, querySequence)

    #Call to cactus_realign
    loadHmm = nameValue("loadHmm", hmmFile)
    system("echo %s | cactus_realign %s %s --diagonalExpansion=10 --splitMatrixBiggerThanThis=3000 %s --gapGamma=%s --matchGamma=%s > %s" % (exonerateCigarString, tempRefFile, tempReadFile, loadHmm, gapGamma, matchGamma, outputCigarFile))

    #Parse the result once, through a handle that is closed afterwards
    with open(outputCigarFile) as cigarFileHandle:
        realignedCount = len(list(cigarRead(cigarFileHandle)))
    assert realignedCount == 1
Exemplo n.º 11
0
def posteriorProbabilityCalculationTargetFn(target, exonerateCigarStringFile, 
                referenceSequenceName, referenceSequence, querySequenceFile, 
                outputPosteriorProbsFile, options):
    """Calculates the posterior probabilities of matches in a set of pairwise
    alignments between a reference sequence and a set of reads. 

    Each line of exonerateCigarStringFile is paired, in order, with a record
    of querySequenceFile and run through cPecanRealign. The tool's output
    file is read as lines of three whitespace separated numbers (reference
    position, query position, posterior probability). These are summed into
    a dictionary keyed by (referenceSequenceName, reference position) whose
    values are dictionaries of expectations keyed by the entries of BASES.
    The result is pickled to outputPosteriorProbsFile.

    target -- target used for the local temp dir.
    options -- must provide noMargin and, when noMargin is false,
        alignmentModel (passed as --loadHmm). When noMargin is set, every
        aligned pair counts 1.0 rather than its posterior probability.
    """
    #Temporary files
    localTempDir = target.getLocalTempDir()
    tempRefFile = os.path.join(localTempDir, "ref.fa")
    tempReadFile = os.path.join(localTempDir, "read.fa")
    tempPosteriorProbsFile = os.path.join(localTempDir, "posteriorProbs.txt")

    #Write the temporary reference file.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence)

    #Hash to store posterior probabilities in
    expectationsOfBasesAtEachPosition = {}

    #For each cigar string
    with open(exonerateCigarStringFile, "r") as cigarFileHandle:
        for exonerateCigarString, (querySequenceName, querySequence) in \
        zip(cigarFileHandle, fastaRead(querySequenceFile)):
            fastaWrite(tempReadFile, querySequenceName, querySequence)
            cigarString = exonerateCigarString[:-1] #Drop the trailing newline
            #Call to cPecanRealign. The command is assembled from adjacent
            #literals so no source indentation leaks into it.
            if options.noMargin:
                #When we don't marginalise we just run cPecanRealign to get
                #the list of aligned pairs. This runtime should be very fast
                system("echo %s | cPecanRealign %s %s --diagonalExpansion=0 "
                       "--splitMatrixBiggerThanThis=1 --rescoreOriginalAlignment "
                       "--outputPosteriorProbs=%s" %
                       (cigarString, tempRefFile, tempReadFile,
                        tempPosteriorProbsFile))
            else:
                system("echo %s | cPecanRealign %s %s --diagonalExpansion=10 "
                       "--splitMatrixBiggerThanThis=100 "
                       "--outputAllPosteriorProbs=%s --loadHmm=%s" %
                       (cigarString, tempRefFile, tempReadFile,
                        tempPosteriorProbsFile, options.alignmentModel))

            #Now collate the reference position expectations
            with open(tempPosteriorProbsFile, 'r') as probsFileHandle:
                for line in probsFileHandle:
                    refPosition, queryPosition, posteriorProb = \
                        map(float, line.split())
                    #The upper bound leaves a 0.01 margin above 1.0
                    #(presumably for rounding in the tool's output)
                    assert posteriorProb <= 1.01
                    assert posteriorProb >= 0.0
                    key = (referenceSequenceName, int(refPosition))
                    baseExpectations = expectationsOfBasesAtEachPosition.get(key)
                    if baseExpectations is None:
                        baseExpectations = dict.fromkeys(BASES, 0.0)
                        expectationsOfBasesAtEachPosition[key] = baseExpectations
                    queryBase = querySequence[int(queryPosition)].upper()
                    #Could be an N or other wildcard character, which we ignore
                    if queryBase in BASES:
                        baseExpectations[queryBase] += \
                            1.0 if options.noMargin else posteriorProb

    #Pickle the posterior probs. Binary mode is required because
    #HIGHEST_PROTOCOL produces a binary pickle.
    with open(outputPosteriorProbsFile, 'wb') as fileHandle:
        cPickle.dump(expectationsOfBasesAtEachPosition, fileHandle, cPickle.HIGHEST_PROTOCOL)
Exemplo n.º 12
0
def realignCigarTargetFn(target, exonerateCigarString, referenceSequenceName,
                         referenceSequence, querySequenceName, querySequence,
                         outputCigarFile, hmmFile, gapGamma, matchGamma):
    """Realigns one pairwise alignment with cactus_realign.

    Writes the reference and the query to "ref.fa" and "read.fa" in the
    target's global temp dir, pipes exonerateCigarString into cactus_realign
    and redirects the output to outputCigarFile. Afterwards asserts that
    outputCigarFile holds exactly one alignment.

    target -- target used for the global temp dir.
    exonerateCigarString -- cigar line echoed into cactus_realign.
    hmmFile -- converted to the hmm argument by nameValue("loadHmm", ...).
    gapGamma, matchGamma -- passed as --gapGamma / --matchGamma.
    """
    #Temporary files
    globalTempDir = target.getGlobalTempDir()
    tempRefFile = os.path.join(globalTempDir, "ref.fa")
    tempReadFile = os.path.join(globalTempDir, "read.fa")

    #Write the temporary files.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence)
    fastaWrite(tempReadFile, querySequenceName, querySequence)

    #Call to cactus_realign
    loadHmm = nameValue("loadHmm", hmmFile)
    system(
        "echo %s | cactus_realign %s %s --diagonalExpansion=10 --splitMatrixBiggerThanThis=3000 %s --gapGamma=%s --matchGamma=%s > %s"
        % (exonerateCigarString, tempRefFile, tempReadFile, loadHmm, gapGamma,
           matchGamma, outputCigarFile))

    #Read the output a single time and release the handle right away
    with open(outputCigarFile) as cigarFileHandle:
        realignedAlignments = list(cigarRead(cigarFileHandle))
    assert len(realignedAlignments) == 1
Exemplo n.º 13
0
def realignCigarTargetFn(target, exonerateCigarStringFile, referenceSequenceName, 
                         referenceSequence, querySequenceFile, 
                         outputCigarFile, options):
    """Runs cPecanRealign on every alignment of a batch.

    Lines of exonerateCigarStringFile are paired, in order, with the records
    of querySequenceFile. For each pair the read is written to a local temp
    fasta file and the cigar line, wrapped in double quotes, is echoed into
    cPecanRealign; the output is appended to outputCigarFile.

    target -- target used for the local temp dir.
    referenceSequenceName, referenceSequence -- written once to a local temp
        fasta file that serves as reference for all alignments.
    options -- must provide hmmFile, gapGamma and matchGamma.
    """
    #Temporary files
    tempRefFile = os.path.join(target.getLocalTempDir(), "ref.fa")
    tempReadFile = os.path.join(target.getLocalTempDir(), "read.fa")

    #Write the temporary reference file.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence)

    #Same for every alignment, so computed once
    loadHmm = nameValue("loadHmm", options.hmmFile)

    #For each cigar string
    with open(exonerateCigarStringFile, "r") as cigarFileHandle:
        for exonerateCigarString, (querySequenceName, querySequence) in \
        zip(cigarFileHandle, fastaRead(querySequenceFile)):
            fastaWrite(tempReadFile, querySequenceName, querySequence)
            #Call to cPecanRealign; [:-1] drops the trailing newline of the
            #cigar line. Adjacent literals keep source indentation out of
            #the command string.
            system("echo \"%s\" | cPecanRealign %s %s --diagonalExpansion=10 "
                   "--splitMatrixBiggerThanThis=3000 %s --gapGamma=%s "
                   "--matchGamma=%s >> %s" %
                   (exonerateCigarString[:-1], tempRefFile, tempReadFile,
                    loadHmm, options.gapGamma, options.matchGamma,
                    outputCigarFile))
Exemplo n.º 14
0
def realignCigarTargetFn(target, exonerateCigarStringFile, referenceSequenceName, 
                         referenceSequence, querySequenceFile, 
                         outputCigarFile, options):
    """Runs cPecanRealign on every alignment of a batch.

    Each line of exonerateCigarStringFile is paired, in order, with a record
    of querySequenceFile. The read is written to a local temp fasta file,
    the cigar line is echoed into cPecanRealign and the tool's output is
    appended to outputCigarFile.

    target -- target used for the local temp dir.
    referenceSequenceName, referenceSequence -- written once to a local temp
        fasta file used as reference for every alignment.
    options -- must provide hmmFile, gapGamma and matchGamma.
    """
    #Temporary files
    localTempDir = target.getLocalTempDir()
    tempRefFile = os.path.join(localTempDir, "ref.fa")
    tempReadFile = os.path.join(localTempDir, "read.fa")

    #Write the temporary reference file.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence)

    #Independent of the alignment, so built outside the loop
    loadHmm = nameValue("loadHmm", options.hmmFile)

    #For each cigar string
    with open(exonerateCigarStringFile, "r") as cigarFileHandle:
        for exonerateCigarString, (querySequenceName, querySequence) in \
        zip(cigarFileHandle, fastaRead(querySequenceFile)):
            fastaWrite(tempReadFile, querySequenceName, querySequence)
            #Call to cPecanRealign; the trailing newline of the cigar line
            #is cut off with [:-1]. The command is built from adjacent
            #literals so that it contains no source indentation.
            system("echo %s | cPecanRealign %s %s --diagonalExpansion=10 "
                   "--splitMatrixBiggerThanThis=3000 %s --gapGamma=%s "
                   "--matchGamma=%s >> %s" %
                   (exonerateCigarString[:-1], tempRefFile, tempReadFile,
                    loadHmm, options.gapGamma, options.matchGamma,
                    outputCigarFile))
Exemplo n.º 15
0
def learnModelFromSamFileTargetFn(target, samFile, readFastqFile, referenceFastaFile, outputModel):
    """Does expectation maximisation on sam file to learn the hmm for the sam file.

    Writes every read of readFastqFile, together with its reverse complement
    under the name "<name>_reverse", to "temp.fa" and one exonerate cigar per
    alignment of samFile to "temp.cigar", both in the target's global temp
    dir. Unless outputModel + "_unnormalised" already exists,
    cactus_expectationMaximisation.expectationMaximisationTrials is scheduled
    as a child target to produce it. learnModelFromSamFileTargetFn2 is set as
    follow-on with (unnormalised model, outputModel).

    target -- target used for the global temp dir and for scheduling.
    samFile -- path of the SAM file; every alignment is asserted to span its
        whole read and its whole reference sequence.
    readFastqFile -- fastq file loaded with getFastqDictionary.
    referenceFastaFile -- fasta file loaded with getFastaDictionary and also
        passed to the child target.
    outputModel -- path prefix; the training XML goes to outputModel + ".xml".
    """
    #Convert the read file to fasta
    refSequences = getFastaDictionary(referenceFastaFile) #Hash of names to sequences
    readSequences = getFastqDictionary(readFastqFile) #Hash of names to sequences

    reads = os.path.join(target.getGlobalTempDir(), "temp.fa")
    with open(reads, 'w') as fH:
        for name, seq in readSequences.items():
            fastaWrite(fH, name, seq)
            fastaWrite(fH, name + "_reverse", reverseComplement(seq))

    #Get cigars file
    cigars = os.path.join(target.getGlobalTempDir(), "temp.cigar")
    with open(cigars, 'w') as fH:
        sam = pysam.Samfile(samFile, "r" )
        try:
            for aR in sam:
                #Looked up before qname is possibly renamed below
                readSeq = readSequences[aR.qname]
                #Because these are global alignments with reverse complement coordinates reversed the following should all be true
                assert aR.pos == 0
                assert aR.qstart == 0
                assert aR.qend == len(readSeq)
                assert aR.aend == len(refSequences[sam.getrname(aR.rname)])
                assert len(aR.query) == len(readSeq)
                if aR.is_reverse: #Deal with reverse complements
                    assert aR.query.upper() == reverseComplement(readSeq).upper()
                    #Point the cigar at the reverse complement record written above
                    aR.qname += "_reverse"
                else:
                    assert aR.query.upper() == readSeq.upper()

                fH.write(getExonerateCigarFormatString(aR, sam) + "\n")
        finally:
            sam.close()

    #Run cactus_expectationMaximisation
    options = cactus_expectationMaximisation.Options()
    options.modelType="fiveStateAsymmetric"
    options.optionsToRealign="--diagonalExpansion=10 --splitMatrixBiggerThanThis=300" 
    options.randomStart = True
    options.trials = 3
    options.outputTrialHmms = True
    options.iterations = 100
    options.maxAlignmentLengthPerJob=700000
    options.maxAlignmentLengthToSample = 50000000
    options.outputXMLModelFile = outputModel + ".xml"
    options.trainEmissions=True

    unnormalisedOutputModel = outputModel + "_unnormalised"
    #Do training if necessary
    if not os.path.exists(unnormalisedOutputModel):
        target.addChildTargetFn(cactus_expectationMaximisation.expectationMaximisationTrials, args=(" ".join([reads, referenceFastaFile ]), cigars, unnormalisedOutputModel, options))

    #Now set up normalisation
    target.setFollowOnTargetFn(learnModelFromSamFileTargetFn2, args=(unnormalisedOutputModel, outputModel))
Exemplo n.º 16
0
    def run(self):
        AbstractAnalysis.run(self) #Call base method to do some logging
        refSequences = getFastaDictionary(self.referenceFastaFile) #Hash of names to sequences
        readSequences = getFastqDictionary(self.readFastqFile) #Hash of names to sequences
        
        node = ET.Element("marginAlignComparison")
        for hmmType in ("cactus", "trained_0",  "trained_20", "trained_40"): 
            for coverage in (1000000, 120, 60, 30, 10): 
                for replicate in xrange(3 if coverage < 1000000 else 1): #Do replicates, unless coverage is all
                    sam = pysam.Samfile(self.samFile, "r" )
                    
                    #Trained hmm file to use.q
                    hmmFile0 = os.path.join(pathToBaseNanoporeDir(), "nanopore", "mappers", "blasr_hmm_0.txt")
                    hmmFile20 = os.path.join(pathToBaseNanoporeDir(), "nanopore", "mappers", "blasr_hmm_20.txt")
                    hmmFile40 = os.path.join(pathToBaseNanoporeDir(), "nanopore", "mappers", "blasr_hmm_40.txt")
              
                    #Get substitution matrices
                    nullSubstitionMatrix = getNullSubstitutionMatrix()
                    flatSubstitutionMatrix = getJukesCantorTypeSubstitutionMatrix()
                    hmmErrorSubstitutionMatrix = loadHmmErrorSubstitutionMatrix(hmmFile20)
                
                    #Load the held out snps
                    snpSet = {}
                    referenceAlignmentFile = self.referenceFastaFile + "_Index.txt"
                    if os.path.exists(referenceAlignmentFile):
                        seqsAndMutatedSeqs = getFastaDictionary(referenceAlignmentFile)
                        count = 0
                        for name in seqsAndMutatedSeqs:
                            if name in refSequences:
                                count += 1
                                trueSeq = seqsAndMutatedSeqs[name]
                                mutatedSeq = seqsAndMutatedSeqs[name + "_mutated"]
                                assert mutatedSeq == refSequences[name]
                                for i in xrange(len(trueSeq)):
                                    if trueSeq[i] != mutatedSeq[i]:
                                        snpSet[(name, i)] = trueSeq[i] 
                            else:
                                assert name.split("_")[-1] == "mutated"
                        assert count == len(refSequences.keys())
                    
                    #The data we collect
                    expectationsOfBasesAtEachPosition = {}
                    frequenciesOfAlignedBasesAtEachPosition = {}
                    
                    totalSampledReads = 0
                    totalAlignedPairs = 0
                    totalReadLength = 0
                    totalReferenceLength = sum(map(len, refSequences.values()))
                    
                    #Get a randomised ordering for the reads
                    reads = [ aR for aR in samIterator(sam) ]
                    random.shuffle(reads)
                    
                    for aR in reads: #Iterate on the sam lines
                        if totalReadLength/totalReferenceLength >= coverage: #Stop when coverage exceeds the quota
                            break
                        totalReadLength += len(readSequences[aR.qname])
                        totalSampledReads += 1
                        
                        #Temporary files
                        tempCigarFile = os.path.join(self.getLocalTempDir(), "rescoredCigar.cig")
                        tempRefFile = os.path.join(self.getLocalTempDir(), "ref.fa")
                        tempReadFile = os.path.join(self.getLocalTempDir(), "read.fa")
                        tempPosteriorProbsFile = os.path.join(self.getLocalTempDir(), "probs.tsv")
                        
                        #Ref name
                        refSeqName = sam.getrname(aR.rname)
                        
                        #Sequences
                        refSeq = refSequences[sam.getrname(aR.rname)]
                        
                        #Walk through the aligned pairs to collate the bases of aligned positions
                        for aP in AlignedPair.iterator(aR, refSeq, readSequences[aR.qname]): 
                            totalAlignedPairs += 1 #Record an aligned pair
                            key = (refSeqName, aP.refPos)
                            if key not in frequenciesOfAlignedBasesAtEachPosition:
                                frequenciesOfAlignedBasesAtEachPosition[key] = dict(zip(bases, [0.0]*len(bases))) 
                            readBase = aP.getReadBase() #readSeq[aP.readPos].upper() #Use the absolute read, ins
                            if readBase in bases:
                                frequenciesOfAlignedBasesAtEachPosition[key][readBase] += 1
                        
                        #Write the temporary files.
                        readSeq = aR.query #This excludes bases that were soft-clipped and is always of positive strand coordinates
                        fastaWrite(tempRefFile, refSeqName, refSeq) 
                        fastaWrite(tempReadFile, aR.qname, readSeq)
                        
                        #Exonerate format Cigar string, which is in readSeq coordinates (positive strand).
                        assert aR.pos == 0
                        assert aR.qstart == 0
                        assert aR.qend == len(readSeq)
                        assert aR.aend == len(refSeq)

                        cigarString = getExonerateCigarFormatString(aR, sam)
                        
                        #Call to cactus_realign
                        if hmmType == "trained_0":
                            system("echo %s | cactus_realign %s %s --diagonalExpansion=10 --splitMatrixBiggerThanThis=100 --outputAllPosteriorProbs=%s --loadHmm=%s > %s" % \
                                   (cigarString, tempRefFile, tempReadFile, tempPosteriorProbsFile, hmmFile0, tempCigarFile))
                        elif hmmType == "trained_20":
                            system("echo %s | cactus_realign %s %s --diagonalExpansion=10 --splitMatrixBiggerThanThis=100 --outputAllPosteriorProbs=%s --loadHmm=%s > %s" % \
                                   (cigarString, tempRefFile, tempReadFile, tempPosteriorProbsFile, hmmFile20, tempCigarFile))
                        elif hmmType == "trained_40":
                            system("echo %s | cactus_realign %s %s --diagonalExpansion=10 --splitMatrixBiggerThanThis=100 --outputAllPosteriorProbs=%s --loadHmm=%s > %s" % \
                                   (cigarString, tempRefFile, tempReadFile, tempPosteriorProbsFile, hmmFile40, tempCigarFile))
                        else:
                            system("echo %s | cactus_realign %s %s --diagonalExpansion=10 --splitMatrixBiggerThanThis=100 --outputAllPosteriorProbs=%s > %s" % \
                                   (cigarString, tempRefFile, tempReadFile, tempPosteriorProbsFile, tempCigarFile))
                        
                        #Now collate the reference position expectations
                        for refPosition, readPosition, posteriorProb in map(lambda x : map(float, x.split()), open(tempPosteriorProbsFile, 'r')):
                            key = (refSeqName, int(refPosition))
                            if key not in expectationsOfBasesAtEachPosition:
                                expectationsOfBasesAtEachPosition[key] = dict(zip(bases, [0.0]*len(bases))) 
                            readBase = readSeq[int(readPosition)].upper()
                            if readBase in bases:
                                expectationsOfBasesAtEachPosition[key][readBase] += posteriorProb
                        
                        #Collate aligned positions from cigars
            
                    sam.close()
                    
                    totalHeldOut = len(snpSet)
                    totalNotHeldOut = totalReferenceLength - totalHeldOut
                    
                    class SnpCalls(object):
                        """Collects the SNP calls made by one calling strategy.

                        truePositives and falsePositives are lists of
                        (posteriorProb, refPosition) tuples. falseNegatives is a list of
                        tuples whose first element is the reference position. notCalled is
                        a counter that the calling code increments.
                        """
                        def __init__(self, heldOutTotal=None):
                            """Creates an empty call set.

                            heldOutTotal is the number of held out SNPs used as the recall
                            denominator. If it is None the totalHeldOut variable of the
                            enclosing scope is read when recall is computed.
                            """
                            self.falsePositives = []
                            self.truePositives = []
                            self.falseNegatives = []
                            self.notCalled = 0
                            self.heldOutTotal = heldOutTotal

                        @staticmethod
                        def bucket(calls):
                            """Returns a list of 101 floats in which entry i is the number of
                            probabilities in calls that round to i percent or more.

                            calls may be any iterable of probabilities; each must lie in
                            [0, 1] so that its percent index falls inside the table.
                            """
                            #No copy or sort is needed, counting does not depend on the order
                            buckets = [0.0]*101
                            for prob in calls: #Discretize
                                buckets[int(round(prob*100))] += 1
                            for i in range(len(buckets)-2, -1, -1): #Make cumulative
                                buckets[i] += buckets[i+1]
                            return buckets

                        def getPrecisionByProbability(self):
                            """Returns a list with, for each percent threshold, the fraction of
                            the calls at or above that threshold that are true positives
                            (0 where there are no such calls).
                            """
                            tPs = self.bucket(call[0] for call in self.truePositives)
                            fPs = self.bucket(call[0] for call in self.falsePositives)
                            return [ float(tP) / (tP + fP) if tP + fP != 0 else 0 for tP, fP in zip(tPs, fPs) ]

                        def getRecallByProbability(self):
                            """Returns a list with, for each percent threshold, the number of
                            true positives at or above that threshold divided by the number of
                            held out SNPs (0 where nothing was held out).
                            """
                            #Only fall back to the enclosing scope when no total was given
                            heldOut = totalHeldOut if self.heldOutTotal is None else self.heldOutTotal
                            tPs = self.bucket(call[0] for call in self.truePositives)
                            return [ tP / float(heldOut) if heldOut != 0 else 0 for tP in tPs ]

                        def getTruePositiveLocations(self):
                            """Returns the reference positions of the true positive calls."""
                            return [ call[1] for call in self.truePositives ]

                        def getFalsePositiveLocations(self):
                            """Returns the reference positions of the false positive calls."""
                            return [ call[1] for call in self.falsePositives ]

                        def getFalseNegativeLocations(self):
                            """Returns the reference positions of the false negatives."""
                            return [ call[0] for call in self.falseNegatives ]
            
                    #The different call sets
                    marginAlignMaxExpectedSnpCalls = SnpCalls()
                    marginAlignMaxLikelihoodSnpCalls = SnpCalls()
                    maxFrequencySnpCalls = SnpCalls()
                    maximumLikelihoodSnpCalls = SnpCalls()
                    
                    #Now calculate the calls
                    for refSeqName in refSequences:
                        refSeq = refSequences[refSeqName]
                        for refPosition in xrange(len(refSeq)):
                            mutatedRefBase = refSeq[refPosition].upper()
                            trueRefBase = (mutatedRefBase if not (refSeqName, refPosition) in snpSet else snpSet[(refSeqName, refPosition)]).upper()
                            key = (refSeqName, refPosition)
                            
                            
                            #Get base calls
                            for errorSubstitutionMatrix, evolutionarySubstitutionMatrix, baseExpectations, snpCalls in \
                            ((flatSubstitutionMatrix, nullSubstitionMatrix, expectationsOfBasesAtEachPosition, marginAlignMaxExpectedSnpCalls),
                             (hmmErrorSubstitutionMatrix, nullSubstitionMatrix, expectationsOfBasesAtEachPosition, marginAlignMaxLikelihoodSnpCalls),
                             (flatSubstitutionMatrix, nullSubstitionMatrix, frequenciesOfAlignedBasesAtEachPosition, maxFrequencySnpCalls),
                             (hmmErrorSubstitutionMatrix, nullSubstitionMatrix, frequenciesOfAlignedBasesAtEachPosition, maximumLikelihoodSnpCalls)):
                                
                                if key in baseExpectations:
                                    #Get posterior likelihoods
                                    expectations = baseExpectations[key]
                                    totalExpectation = sum(expectations.values())
                                    if totalExpectation > 0.0: #expectationCallingThreshold:
                                        posteriorProbs = calcBasePosteriorProbs(dict(zip(bases, map(lambda x : float(expectations[x])/totalExpectation, bases))), mutatedRefBase, 
                                                               evolutionarySubstitutionMatrix, errorSubstitutionMatrix)
                                        probs = [ posteriorProbs[base] for base in "ACGT" ]
                                        #posteriorProbs.pop(mutatedRefBase) #Remove the ref base.
                                        #maxPosteriorProb = max(posteriorProbs.values())
                                        #chosenBase = random.choice([ base for base in posteriorProbs if posteriorProbs[base] == maxPosteriorProb ]).upper() #Very naive way to call the base

                                        for chosenBase in "ACGT":
                                            if chosenBase != mutatedRefBase:
                                                maxPosteriorProb = posteriorProbs[chosenBase]
                                                if trueRefBase != mutatedRefBase and trueRefBase == chosenBase:
                                                    snpCalls.truePositives.append((maxPosteriorProb, refPosition)) #True positive
                                                else:
                                                    snpCalls.falsePositives.append((maxPosteriorProb, refPosition)) #False positive
                                                """
                                                    snpCalls.falseNegatives.append((refPosition, trueRefBase, mutatedRefBase, probs)) #False negative
                                                if trueRefBase != mutatedRefBase:
                                                    if trueRefBase == chosenBase:
                                                        snpCalls.truePositives.append((maxPosteriorProb, refPosition)) #True positive
                                                    else:
                                                        snpCalls.falseNegatives.append((refPosition, trueRefBase, mutatedRefBase, probs)) #False negative
                                                else:
                                                    snpCalls.falsePositives.append((maxPosteriorProb, refPosition)) #False positive
                                                """
                                else:
                                    snpCalls.notCalled += 1
                        
                    #Now find max-fscore point
                    
                    
                    for snpCalls, tagName in ((marginAlignMaxExpectedSnpCalls, "marginAlignMaxExpectedSnpCalls"), 
                                              (marginAlignMaxLikelihoodSnpCalls, "marginAlignMaxLikelihoodSnpCalls"),
                                              (maxFrequencySnpCalls, "maxFrequencySnpCalls"),
                                              (maximumLikelihoodSnpCalls, "maximumLikelihoodSnpCalls")):
                        recall = snpCalls.getRecallByProbability()
                        precision = snpCalls.getPrecisionByProbability()
                        assert len(recall) == len(precision)
                        fScore, pIndex = max(map(lambda i : (2 * recall[i] * precision[i] / (recall[i] + precision[i]) if recall[i] + precision[i] > 0 else 0.0, i), range(len(recall))))
                        truePositives = snpCalls.getRecallByProbability()[pIndex]
                        falsePositives = snpCalls.getPrecisionByProbability()[pIndex]
                        optimumProbThreshold = float(pIndex)/100.0
                        
                        #Write out the substitution info
                        node2 = ET.SubElement(node, tagName + "_" + hmmType, {  
                                "coverage":str(coverage),
                                "actualCoverage":str(float(totalAlignedPairs)/totalReferenceLength),
                                "totalAlignedPairs":str(totalAlignedPairs),
                                "totalReferenceLength":str(totalReferenceLength),
                                "replicate":str(replicate),
                                "totalReads":str(len(reads)),
                                "avgSampledReadLength":str(float(totalReadLength)/totalSampledReads),
                                "totalSampledReads":str(totalSampledReads),
                                
                                "totalHeldOut":str(totalHeldOut),
                                "totalNonHeldOut":str(totalNotHeldOut),
                                
                                "recall":str(recall[pIndex]),
                                "precision":str(precision[pIndex]),
                                "fScore":str(fScore),
                                "optimumProbThreshold":str(optimumProbThreshold),
                                "totalNoCalls":str(snpCalls.notCalled),

                                "recallByProbability":" ".join(map(str, snpCalls.getRecallByProbability())),
                                "precisionByProbability":" ".join(map(str, snpCalls.getPrecisionByProbability())) })
                                
                                #"falsePositiveLocations":" ".join(map(str, snpCalls.getFalsePositiveLocations())),
                                #"falseNegativeLocations":" ".join(map(str, snpCalls.getFalseNegativeLocations())),
                                #"truePositiveLocations":" ".join(map(str, snpCalls.getTruePositiveLocations())) })
                        for refPosition, trueRefBase, mutatedRefBase, posteriorProbs in snpCalls.falseNegatives:
                            ET.SubElement(node2, "falseNegative_%s_%s" % (trueRefBase, mutatedRefBase), { "posteriorProbs":" ".join(map(str, posteriorProbs))})
                        for falseNegativeBase in bases:
                            for mutatedBase in bases:
                                posteriorProbsArray = [ posteriorProbs for refPosition, trueRefBase, mutatedRefBase, posteriorProbs in snpCalls.falseNegatives if (trueRefBase.upper() == falseNegativeBase.upper() and mutatedBase.upper() == mutatedRefBase.upper() ) ]
                                if len(posteriorProbsArray) > 0:
                                    summedProbs = reduce(lambda x, y : map(lambda i : x[i] + y[i], xrange(len(x))), posteriorProbsArray)
                                    summedProbs = map(lambda x : float(x)/sum(summedProbs), summedProbs)
                                    ET.SubElement(node2, "combinedFalseNegative_%s_%s" % (falseNegativeBase, mutatedBase), { "posteriorProbs":" ".join(map(str, summedProbs))})
                        
        open(os.path.join(self.outputDir, "marginaliseConsensus.xml"), "w").write(prettyXml(node))
        
        
        #Indicate everything is all done
        self.finish()
Exemplo n.º 17
0
    def run(self):
        AbstractAnalysis.run(self)  #Call base method to do some logging
        refSequences = getFastaDictionary(
            self.referenceFastaFile)  #Hash of names to sequences
        readSequences = getFastqDictionary(
            self.readFastqFile)  #Hash of names to sequences

        node = ET.Element("marginAlignComparison")
        for hmmType in ("cactus", "trained_0", "trained_20", "trained_40"):
            for coverage in (1000000, 120, 60, 30, 10):
                for replicate in xrange(
                        3 if coverage < 1000000 else 1
                ):  #Do replicates, unless coverage is all
                    sam = pysam.Samfile(self.samFile, "r")

                    #Trained hmm files to use.
                    hmmFile0 = os.path.join(pathToBaseNanoporeDir(),
                                            "nanopore", "mappers",
                                            "blasr_hmm_0.txt")
                    hmmFile20 = os.path.join(pathToBaseNanoporeDir(),
                                             "nanopore", "mappers",
                                             "blasr_hmm_20.txt")
                    hmmFile40 = os.path.join(pathToBaseNanoporeDir(),
                                             "nanopore", "mappers",
                                             "blasr_hmm_40.txt")

                    #Get substitution matrices
                    nullSubstitionMatrix = getNullSubstitutionMatrix()
                    flatSubstitutionMatrix = getJukesCantorTypeSubstitutionMatrix(
                    )
                    hmmErrorSubstitutionMatrix = loadHmmErrorSubstitutionMatrix(
                        hmmFile20)

                    #Load the held out snps
                    snpSet = {}
                    referenceAlignmentFile = self.referenceFastaFile + "_Index.txt"
                    if os.path.exists(referenceAlignmentFile):
                        seqsAndMutatedSeqs = getFastaDictionary(
                            referenceAlignmentFile)
                        count = 0
                        for name in seqsAndMutatedSeqs:
                            if name in refSequences:
                                count += 1
                                trueSeq = seqsAndMutatedSeqs[name]
                                mutatedSeq = seqsAndMutatedSeqs[name +
                                                                "_mutated"]
                                assert mutatedSeq == refSequences[name]
                                for i in xrange(len(trueSeq)):
                                    if trueSeq[i] != mutatedSeq[i]:
                                        snpSet[(name, i)] = trueSeq[i]
                            else:
                                assert name.split("_")[-1] == "mutated"
                        assert count == len(refSequences.keys())

                    #The data we collect
                    expectationsOfBasesAtEachPosition = {}
                    frequenciesOfAlignedBasesAtEachPosition = {}

                    totalSampledReads = 0
                    totalAlignedPairs = 0
                    totalReadLength = 0
                    totalReferenceLength = sum(map(len, refSequences.values()))

                    #Get a randomised ordering for the reads
                    reads = [aR for aR in samIterator(sam)]
                    random.shuffle(reads)

                    for aR in reads:  #Iterate on the sam lines
                        if totalReadLength / totalReferenceLength >= coverage:  #Stop when coverage exceeds the quota
                            break
                        totalReadLength += len(readSequences[aR.qname])
                        totalSampledReads += 1

                        #Temporary files
                        tempCigarFile = os.path.join(self.getLocalTempDir(),
                                                     "rescoredCigar.cig")
                        tempRefFile = os.path.join(self.getLocalTempDir(),
                                                   "ref.fa")
                        tempReadFile = os.path.join(self.getLocalTempDir(),
                                                    "read.fa")
                        tempPosteriorProbsFile = os.path.join(
                            self.getLocalTempDir(), "probs.tsv")

                        #Ref name
                        refSeqName = sam.getrname(aR.rname)

                        #Sequences
                        refSeq = refSequences[sam.getrname(aR.rname)]

                        #Walk through the aligned pairs to collate the bases of aligned positions
                        for aP in AlignedPair.iterator(
                                aR, refSeq, readSequences[aR.qname]):
                            totalAlignedPairs += 1  #Record an aligned pair
                            key = (refSeqName, aP.refPos)
                            if key not in frequenciesOfAlignedBasesAtEachPosition:
                                frequenciesOfAlignedBasesAtEachPosition[
                                    key] = dict(zip(bases, [0.0] * len(bases)))
                            readBase = aP.getReadBase(
                            )  #readSeq[aP.readPos].upper() #Use the absolute read, ins
                            if readBase in bases:
                                frequenciesOfAlignedBasesAtEachPosition[key][
                                    readBase] += 1

                        #Write the temporary files.
                        readSeq = aR.query  #This excludes bases that were soft-clipped and is always of positive strand coordinates
                        fastaWrite(tempRefFile, refSeqName, refSeq)
                        fastaWrite(tempReadFile, aR.qname, readSeq)

                        #Exonerate format Cigar string, which is in readSeq coordinates (positive strand).
                        assert aR.pos == 0
                        assert aR.qstart == 0
                        assert aR.qend == len(readSeq)
                        assert aR.aend == len(refSeq)

                        cigarString = getExonerateCigarFormatString(aR, sam)

                        #Call to cactus_realign
                        if hmmType == "trained_0":
                            system("echo %s | cactus_realign %s %s --diagonalExpansion=10 --splitMatrixBiggerThanThis=100 --outputAllPosteriorProbs=%s --loadHmm=%s > %s" % \
                                   (cigarString, tempRefFile, tempReadFile, tempPosteriorProbsFile, hmmFile0, tempCigarFile))
                        elif hmmType == "trained_20":
                            system("echo %s | cactus_realign %s %s --diagonalExpansion=10 --splitMatrixBiggerThanThis=100 --outputAllPosteriorProbs=%s --loadHmm=%s > %s" % \
                                   (cigarString, tempRefFile, tempReadFile, tempPosteriorProbsFile, hmmFile20, tempCigarFile))
                        elif hmmType == "trained_40":
                            system("echo %s | cactus_realign %s %s --diagonalExpansion=10 --splitMatrixBiggerThanThis=100 --outputAllPosteriorProbs=%s --loadHmm=%s > %s" % \
                                   (cigarString, tempRefFile, tempReadFile, tempPosteriorProbsFile, hmmFile40, tempCigarFile))
                        else:
                            system("echo %s | cactus_realign %s %s --diagonalExpansion=10 --splitMatrixBiggerThanThis=100 --outputAllPosteriorProbs=%s > %s" % \
                                   (cigarString, tempRefFile, tempReadFile, tempPosteriorProbsFile, tempCigarFile))

                        #Now collate the reference position expectations
                        for refPosition, readPosition, posteriorProb in map(
                                lambda x: map(float, x.split()),
                                open(tempPosteriorProbsFile, 'r')):
                            key = (refSeqName, int(refPosition))
                            if key not in expectationsOfBasesAtEachPosition:
                                expectationsOfBasesAtEachPosition[key] = dict(
                                    zip(bases, [0.0] * len(bases)))
                            readBase = readSeq[int(readPosition)].upper()
                            if readBase in bases:
                                expectationsOfBasesAtEachPosition[key][
                                    readBase] += posteriorProb

                        #Collate aligned positions from cigars

                    sam.close()

                    totalHeldOut = len(snpSet)
                    totalNotHeldOut = totalReferenceLength - totalHeldOut

                    class SnpCalls(object):
                        """Collects the SNP calls made by one calling strategy.

                        truePositives and falsePositives are lists of
                        (posteriorProb, refPosition) tuples. falseNegatives is a list
                        of tuples whose first element is the reference position.
                        notCalled is a counter that the calling code increments.
                        """

                        def __init__(self, heldOutTotal=None):
                            """Creates an empty call set.

                            heldOutTotal is the number of held out SNPs used as the
                            recall denominator. If it is None the totalHeldOut variable
                            of the enclosing scope is read when recall is computed.
                            """
                            self.falsePositives = []
                            self.truePositives = []
                            self.falseNegatives = []
                            self.notCalled = 0
                            self.heldOutTotal = heldOutTotal

                        @staticmethod
                        def bucket(calls):
                            """Returns a list of 101 floats in which entry i is the
                            number of probabilities in calls that round to i percent
                            or more.

                            calls may be any iterable of probabilities; each must lie
                            in [0, 1] so that its percent index falls inside the table.
                            """
                            #No copy or sort is needed, counting does not depend on the order
                            buckets = [0.0] * 101
                            for prob in calls:  #Discretize
                                buckets[int(round(prob * 100))] += 1
                            for i in range(len(buckets) - 2, -1, -1):  #Make cumulative
                                buckets[i] += buckets[i + 1]
                            return buckets

                        def getPrecisionByProbability(self):
                            """Returns a list with, for each percent threshold, the
                            fraction of the calls at or above that threshold that are
                            true positives (0 where there are no such calls).
                            """
                            tPs = self.bucket(
                                call[0] for call in self.truePositives)
                            fPs = self.bucket(
                                call[0] for call in self.falsePositives)
                            return [
                                float(tP) / (tP + fP) if tP + fP != 0 else 0
                                for tP, fP in zip(tPs, fPs)
                            ]

                        def getRecallByProbability(self):
                            """Returns a list with, for each percent threshold, the
                            number of true positives at or above that threshold divided
                            by the number of held out SNPs (0 where nothing was held
                            out).
                            """
                            #Only fall back to the enclosing scope when no total was given
                            heldOut = (totalHeldOut if self.heldOutTotal is None
                                       else self.heldOutTotal)
                            tPs = self.bucket(
                                call[0] for call in self.truePositives)
                            return [
                                tP / float(heldOut) if heldOut != 0 else 0
                                for tP in tPs
                            ]

                        def getTruePositiveLocations(self):
                            """Returns the reference positions of the true positive calls."""
                            return [call[1] for call in self.truePositives]

                        def getFalsePositiveLocations(self):
                            """Returns the reference positions of the false positive calls."""
                            return [call[1] for call in self.falsePositives]

                        def getFalseNegativeLocations(self):
                            """Returns the reference positions of the false negatives."""
                            return [call[0] for call in self.falseNegatives]

                    #The different call sets
                    marginAlignMaxExpectedSnpCalls = SnpCalls()
                    marginAlignMaxLikelihoodSnpCalls = SnpCalls()
                    maxFrequencySnpCalls = SnpCalls()
                    maximumLikelihoodSnpCalls = SnpCalls()

                    #Now calculate the calls
                    for refSeqName in refSequences:
                        refSeq = refSequences[refSeqName]
                        for refPosition in xrange(len(refSeq)):
                            mutatedRefBase = refSeq[refPosition].upper()
                            trueRefBase = (
                                mutatedRefBase
                                if not (refSeqName, refPosition) in snpSet else
                                snpSet[(refSeqName, refPosition)]).upper()
                            key = (refSeqName, refPosition)

                            #Get base calls
                            for errorSubstitutionMatrix, evolutionarySubstitutionMatrix, baseExpectations, snpCalls in \
                            ((flatSubstitutionMatrix, nullSubstitionMatrix, expectationsOfBasesAtEachPosition, marginAlignMaxExpectedSnpCalls),
                             (hmmErrorSubstitutionMatrix, nullSubstitionMatrix, expectationsOfBasesAtEachPosition, marginAlignMaxLikelihoodSnpCalls),
                             (flatSubstitutionMatrix, nullSubstitionMatrix, frequenciesOfAlignedBasesAtEachPosition, maxFrequencySnpCalls),
                             (hmmErrorSubstitutionMatrix, nullSubstitionMatrix, frequenciesOfAlignedBasesAtEachPosition, maximumLikelihoodSnpCalls)):

                                if key in baseExpectations:
                                    #Get posterior likelihoods
                                    expectations = baseExpectations[key]
                                    totalExpectation = sum(
                                        expectations.values())
                                    if totalExpectation > 0.0:  #expectationCallingThreshold:
                                        posteriorProbs = calcBasePosteriorProbs(
                                            dict(
                                                zip(
                                                    bases,
                                                    map(
                                                        lambda x: float(
                                                            expectations[x]) /
                                                        totalExpectation,
                                                        bases))),
                                            mutatedRefBase,
                                            evolutionarySubstitutionMatrix,
                                            errorSubstitutionMatrix)
                                        probs = [
                                            posteriorProbs[base]
                                            for base in "ACGT"
                                        ]
                                        #posteriorProbs.pop(mutatedRefBase) #Remove the ref base.
                                        #maxPosteriorProb = max(posteriorProbs.values())
                                        #chosenBase = random.choice([ base for base in posteriorProbs if posteriorProbs[base] == maxPosteriorProb ]).upper() #Very naive way to call the base

                                        for chosenBase in "ACGT":
                                            if chosenBase != mutatedRefBase:
                                                maxPosteriorProb = posteriorProbs[
                                                    chosenBase]
                                                if trueRefBase != mutatedRefBase and trueRefBase == chosenBase:
                                                    snpCalls.truePositives.append(
                                                        (maxPosteriorProb,
                                                         refPosition
                                                         ))  #True positive
                                                else:
                                                    snpCalls.falsePositives.append(
                                                        (maxPosteriorProb,
                                                         refPosition
                                                         ))  #False positive
                                                """
                                                    snpCalls.falseNegatives.append((refPosition, trueRefBase, mutatedRefBase, probs)) #False negative
                                                if trueRefBase != mutatedRefBase:
                                                    if trueRefBase == chosenBase:
                                                        snpCalls.truePositives.append((maxPosteriorProb, refPosition)) #True positive
                                                    else:
                                                        snpCalls.falseNegatives.append((refPosition, trueRefBase, mutatedRefBase, probs)) #False negative
                                                else:
                                                    snpCalls.falsePositives.append((maxPosteriorProb, refPosition)) #False positive
                                                """
                                else:
                                    snpCalls.notCalled += 1

                    #Now find max-fscore point

                    for snpCalls, tagName in (
                        (marginAlignMaxExpectedSnpCalls,
                         "marginAlignMaxExpectedSnpCalls"),
                        (marginAlignMaxLikelihoodSnpCalls,
                         "marginAlignMaxLikelihoodSnpCalls"),
                        (maxFrequencySnpCalls, "maxFrequencySnpCalls"),
                        (maximumLikelihoodSnpCalls,
                         "maximumLikelihoodSnpCalls")):
                        recall = snpCalls.getRecallByProbability()
                        precision = snpCalls.getPrecisionByProbability()
                        assert len(recall) == len(precision)
                        fScore, pIndex = max(
                            map(
                                lambda i:
                                (2 * recall[i] * precision[i] /
                                 (recall[i] + precision[i])
                                 if recall[i] + precision[i] > 0 else 0.0, i),
                                range(len(recall))))
                        truePositives = snpCalls.getRecallByProbability(
                        )[pIndex]
                        falsePositives = snpCalls.getPrecisionByProbability(
                        )[pIndex]
                        optimumProbThreshold = float(pIndex) / 100.0

                        #Write out the substitution info
                        node2 = ET.SubElement(
                            node, tagName + "_" + hmmType, {
                                "coverage":
                                str(coverage),
                                "actualCoverage":
                                str(
                                    float(totalAlignedPairs) /
                                    totalReferenceLength),
                                "totalAlignedPairs":
                                str(totalAlignedPairs),
                                "totalReferenceLength":
                                str(totalReferenceLength),
                                "replicate":
                                str(replicate),
                                "totalReads":
                                str(len(reads)),
                                "avgSampledReadLength":
                                str(
                                    float(totalReadLength) /
                                    totalSampledReads),
                                "totalSampledReads":
                                str(totalSampledReads),
                                "totalHeldOut":
                                str(totalHeldOut),
                                "totalNonHeldOut":
                                str(totalNotHeldOut),
                                "recall":
                                str(recall[pIndex]),
                                "precision":
                                str(precision[pIndex]),
                                "fScore":
                                str(fScore),
                                "optimumProbThreshold":
                                str(optimumProbThreshold),
                                "totalNoCalls":
                                str(snpCalls.notCalled),
                                "recallByProbability":
                                " ".join(
                                    map(str,
                                        snpCalls.getRecallByProbability())),
                                "precisionByProbability":
                                " ".join(
                                    map(str,
                                        snpCalls.getPrecisionByProbability()))
                            })

                        #"falsePositiveLocations":" ".join(map(str, snpCalls.getFalsePositiveLocations())),
                        #"falseNegativeLocations":" ".join(map(str, snpCalls.getFalseNegativeLocations())),
                        #"truePositiveLocations":" ".join(map(str, snpCalls.getTruePositiveLocations())) })
                        for refPosition, trueRefBase, mutatedRefBase, posteriorProbs in snpCalls.falseNegatives:
                            ET.SubElement(
                                node2, "falseNegative_%s_%s" %
                                (trueRefBase, mutatedRefBase), {
                                    "posteriorProbs":
                                    " ".join(map(str, posteriorProbs))
                                })
                        for falseNegativeBase in bases:
                            for mutatedBase in bases:
                                posteriorProbsArray = [
                                    posteriorProbs for refPosition,
                                    trueRefBase, mutatedRefBase, posteriorProbs
                                    in snpCalls.falseNegatives
                                    if (trueRefBase.upper() ==
                                        falseNegativeBase.upper()
                                        and mutatedBase.upper() ==
                                        mutatedRefBase.upper())
                                ]
                                if len(posteriorProbsArray) > 0:
                                    summedProbs = reduce(
                                        lambda x, y: map(
                                            lambda i: x[i] + y[i],
                                            xrange(len(x))),
                                        posteriorProbsArray)
                                    summedProbs = map(
                                        lambda x: float(x) / sum(summedProbs),
                                        summedProbs)
                                    ET.SubElement(
                                        node2, "combinedFalseNegative_%s_%s" %
                                        (falseNegativeBase, mutatedBase), {
                                            "posteriorProbs":
                                            " ".join(map(str, summedProbs))
                                        })

        open(os.path.join(self.outputDir, "marginaliseConsensus.xml"),
             "w").write(prettyXml(node))

        #Indicate everything is all done
        self.finish()
Exemplo n.º 18
0
def learnModelFromSamFileTargetFn(target, samFile, readFastqFile,
                                  referenceFastaFile, outputModel):
    """Does expectation maximisation on sam file to learn the hmm for the sam file.

    Steps performed here:
      1. Writes every read, and its reverse complement under the name
         "<name>_reverse", to "temp.fa" in the target's global temp dir.
      2. Writes one exonerate-format cigar line per alignment in samFile to
         "temp.cigar" in the same dir, after sanity-checking that each
         alignment is global and consistent with the read/reference files.
      3. Unless outputModel + "_unnormalised" already exists, adds
         expectationMaximisationTrials as a child target to train the model.
      4. Sets learnModelFromSamFileTargetFn2 as the follow-on, passing it the
         unnormalised model path and outputModel.

    Arguments:
    target -- job object; used for getGlobalTempDir, addChildTargetFn and
        setFollowOnTargetFn.
    samFile -- path of the sam file holding the alignments.
    readFastqFile -- path of the fastq file holding the reads.
    referenceFastaFile -- path of the fasta file holding the references.
    outputModel -- path prefix for the model; outputModel + ".xml" is used as
        the xml model file and outputModel + "_unnormalised" as the
        intermediate model file.

    Raises AssertionError if an alignment fails one of the sanity checks.
    """
    refSequences = getFastaDictionary(
        referenceFastaFile)  #Hash of names to sequences
    readSequences = getFastqDictionary(
        readFastqFile)  #Hash of names to sequences

    #Convert the read file to fasta, adding the reverse complement of each
    #read so that reverse-strand alignments can refer to it by name
    reads = os.path.join(target.getGlobalTempDir(), "temp.fa")
    with open(reads, 'w') as fH:  #with-block closes the file even on error
        for name, seq in readSequences.items():
            fastaWrite(fH, name, seq)
            fastaWrite(fH, name + "_reverse", reverseComplement(seq))

    #Get cigars file
    cigars = os.path.join(target.getGlobalTempDir(), "temp.cigar")
    with open(cigars, 'w') as fH:
        sam = pysam.Samfile(samFile, "r")
        try:
            for aR in sam:
                #Because these are global alignments with reverse complement
                #coordinates reversed the following should all be true
                assert aR.pos == 0
                assert aR.qstart == 0
                readSeq = readSequences[aR.qname]
                assert aR.qend == len(readSeq)
                assert aR.aend == len(refSequences[sam.getrname(aR.rname)])
                assert len(aR.query) == len(readSeq)
                if aR.is_reverse:  #Deal with reverse complements
                    assert aR.query.upper() == reverseComplement(
                        readSeq).upper()
                    #Point the alignment at the "_reverse" entry in temp.fa
                    aR.qname += "_reverse"
                else:
                    assert aR.query.upper() == readSeq.upper()

                fH.write(getExonerateCigarFormatString(aR, sam) + "\n")
        finally:
            #The sam handle was previously never closed
            sam.close()

    #Run cactus_expectationMaximisation
    options = cactus_expectationMaximisation.Options()
    options.modelType = "fiveStateAsymmetric"  #"threeStateAsymmetric"
    options.optionsToRealign = "--diagonalExpansion=10 --splitMatrixBiggerThanThis=300"
    options.randomStart = True
    options.trials = 3
    options.outputTrialHmms = True
    options.iterations = 100
    options.maxAlignmentLengthPerJob = 700000
    options.maxAlignmentLengthToSample = 50000000
    options.outputXMLModelFile = outputModel + ".xml"
    #options.updateTheBand = True
    #options.useDefaultModelAsStart = True
    #options.setJukesCantorStartingEmissions=0.3
    options.trainEmissions = True
    #options.tieEmissions = True

    unnormalisedOutputModel = outputModel + "_unnormalised"
    #Do training if necessary: an existing unnormalised model is reused
    if not os.path.exists(unnormalisedOutputModel):
        target.addChildTargetFn(
            cactus_expectationMaximisation.expectationMaximisationTrials,
            args=(" ".join([reads, referenceFastaFile]), cigars,
                  unnormalisedOutputModel, options))

    #Now set up normalisation
    target.setFollowOnTargetFn(learnModelFromSamFileTargetFn2,
                               args=(unnormalisedOutputModel, outputModel))