def expectationMaximisation(target, sequences, alignments, outputModel, options):
    """Set up the starting model and the per-job alignment files for EM.

    Iteratively run cPecanRealign to get expectations and load model: this
    function performs the setup and then schedules expectationMaximisation2
    as the follow-on, which receives the iteration counter 0 and an empty
    list.

    Steps:
      1. Load the HMM from options.inputModel (and normalise it), or build
         one of type options.modelType with random or equalised parameters.
      2. Optionally reset the emissions via setEmissionsToJukesCantor.
      3. Write the starting model to outputModel.
      4. Split the input cigars into files of roughly
         options.maxAlignmentLengthPerJob aligned bases each.
      5. Shuffle those files and keep just enough of them to reach
         options.maxAlignmentLengthToSample bases.

    Args:
        target: job object; must provide logToMaster, getGlobalTempDir and
            setFollowOnTargetFn.
        sequences: forwarded untouched to expectationMaximisation2.
        alignments: whatever cigarRead accepts as its input.
        outputModel: destination passed to hmm.write, also forwarded.
        options: object with the attributes inputModel, modelType,
            randomStart, setJukesCantorStartingEmissions,
            maxAlignmentLengthPerJob and maxAlignmentLengthToSample.
    """
    if options.inputModel is not None:  # Read in the model
        target.logToMaster("Loading the model from the input file %s" % options.inputModel)
        hmm = Hmm.loadHmm(options.inputModel)
        target.logToMaster("Loaded the model, has type %s" % hmm.modelType)
        hmm.normalise()
    else:
        target.logToMaster("Making model of type %s" % options.modelType)
        hmm = Hmm(options.modelType)
        if options.randomStart:  # Make random parameters
            target.logToMaster("Using random starting parameters")
            hmm.randomise()
        else:
            hmm.equalise()
    if options.setJukesCantorStartingEmissions is not None:
        hmm.setEmissionsToJukesCantor(float(options.setJukesCantorStartingEmissions))

    # Write out the first version of the output model
    hmm.write(outputModel)

    # Make a set of split alignment files. Each finished file is recorded as
    # a (path, alignment length) pair.
    splitAlignmentFiles = []
    currentPath = None
    currentLength = 0
    fH = None
    try:
        for cigar in cigarRead(alignments):
            if fH is None:
                currentPath = os.path.join(target.getGlobalTempDir(),
                                           "alignments_%s.cigar" % len(splitAlignmentFiles))
                fH = open(currentPath, 'w')
            # Length of an alignment is the mean of the two aligned spans.
            currentLength += (abs(cigar.start1 - cigar.end1) +
                              abs(cigar.start2 - cigar.end2)) / 2.0
            cigarWrite(fH, cigar)
            if currentLength > options.maxAlignmentLengthPerJob:
                fH.close()
                fH = None
                splitAlignmentFiles.append((currentPath, currentLength))
                currentLength = 0
        if fH is not None:
            # Final, partially filled file.
            splitAlignmentFiles.append((currentPath, currentLength))
    finally:
        # Also reached when reading or writing a cigar fails, so the handle
        # of the file being written is never leaked.
        if fH is not None:
            fH.close()

    # Sample the alignment files so that we do EM on no more than
    # options.maxAlignmentLengthToSample bases. The shuffle ensures we don't
    # just take the first N alignments in the provided alignments file.
    random.shuffle(splitAlignmentFiles)
    sampledSplitAlignmentFiles = []
    totalSampledAlignmentLength = 0.0
    for alignmentsFile, alignmentsLength in splitAlignmentFiles:
        totalSampledAlignmentLength += alignmentsLength
        sampledSplitAlignmentFiles.append(alignmentsFile)
        if totalSampledAlignmentLength >= options.maxAlignmentLengthToSample:
            break
    target.logToMaster("We sampled: %s bases of alignment length and %s alignment files, of a possible %s base and %s files" %
                       (totalSampledAlignmentLength, len(sampledSplitAlignmentFiles),
                        sum(length for _, length in splitAlignmentFiles),
                        len(splitAlignmentFiles)))
    splitAlignmentFiles = sampledSplitAlignmentFiles

    # Files to store expectations in, one per sampled alignment file. A real
    # list is built so that len() works regardless of whether map is lazy.
    expectationsFiles = [os.path.join(target.getGlobalTempDir(), "expectation_%i.txt" % i)
                         for i in range(len(splitAlignmentFiles))]
    assert len(splitAlignmentFiles) == len(expectationsFiles)
    target.setFollowOnTargetFn(expectationMaximisation2,
                               args=(sequences, splitAlignmentFiles, outputModel,
                                     expectationsFiles, 0, [], options))
def upconvertCoords(cigarPath, fastaPath, contigNum, outputFile):
    """Convert the coordinates of the given alignment, so that the
    alignment refers to a set of trimmed sequences originating from a
    contig rather than to the contig itself.

    The cigar file is first sorted with sortCigarByContigAndPos; the sorted
    copy is closed and deleted before returning, including when an error is
    raised. For every alignment whose contig has an entry in the ranges read
    from fastaPath, the start/end on the chosen side are shifted by the start
    of the containing range and the contig name gets a "|<range start>"
    suffix. Alignments on contigs without an entry are written unchanged.

    Args:
        cigarPath: path handed to sortCigarByContigAndPos.
        fastaPath: path of the fasta file handed to getSequenceRanges.
        contigNum: 1 rewrites contig2/start2/end2, any other value rewrites
            contig1/start1/end1 (the two are reversed in the python api).
        outputFile: destination passed to cigarWrite.

    Raises:
        RuntimeError: if no range contains the start of an alignment, or if
            the alignment extends past the end of its range.
    """
    with open(fastaPath) as f:
        seqRanges = getSequenceRanges(f)
    validateRanges(seqRanges)

    # contig1 and contig2 are reversed in python api!!
    if contigNum == 1:
        contigAttr, startAttr, endAttr = "contig2", "start2", "end2"
    else:
        contigAttr, startAttr, endAttr = "contig1", "start1", "end1"

    sortedCigarPath = sortCigarByContigAndPos(cigarPath, contigNum)
    try:
        with open(sortedCigarPath) as sortedCigarFile:
            currentContig = None
            currentRangeIdx = None
            currentRange = None
            for alignment in cigarRead(sortedCigarFile):
                contig = getattr(alignment, contigAttr)
                start = getattr(alignment, startAttr)
                end = getattr(alignment, endAttr)
                minPos = min(start, end)
                maxPos = max(start, end)
                if contig in seqRanges:
                    ranges = seqRanges[contig]
                    if contig != currentContig:
                        # New contig: restart the scan at its first range.
                        currentContig = contig
                        currentRangeIdx = 0
                        currentRange = ranges[0]
                    # The range index only ever moves forward within a
                    # contig, so this relies on the alignments arriving in
                    # position order.
                    while ((minPos >= currentRange[1] or minPos < currentRange[0])
                           and currentRangeIdx < len(ranges) - 1):
                        currentRangeIdx += 1
                        currentRange = ranges[currentRangeIdx]
                    if not (currentRange[0] <= minPos < currentRange[1]):
                        raise RuntimeError("No trimmed sequence containing alignment "
                                           "on %s:%d-%d" % (contig, minPos, maxPos))
                    # NOTE(review): this tolerates maxPos == range end + 1;
                    # confirm whether that slack is intentional.
                    if maxPos - 1 > currentRange[1]:
                        raise RuntimeError("alignment on %s:%d-%d crosses "
                                           "trimmed sequence boundary" %
                                           (contig, minPos, maxPos))
                    offset = currentRange[0]
                    setattr(alignment, startAttr, start - offset)
                    setattr(alignment, endAttr, end - offset)
                    setattr(alignment, contigAttr, contig + ("|%d" % offset))
                cigarWrite(outputFile, alignment, False)
    finally:
        # The sorted copy is only a scratch file; never leave it behind.
        os.remove(sortedCigarPath)
def loadResults(resultsFile):
    """Puts the results in a set.

    Reads every pairwise alignment from the cigar file at resultsFile and
    records one tuple per aligned (match) column:
    (contig, position, contig, position), with the smaller contig name
    first. Columns that pair a position with itself on the same contig are
    skipped. For a negative-strand side the walk begins at start - 1 and
    steps downwards.

    Returns:
        A tuple (pairsSet, totalHits): the set of aligned pairs and the
        number of alignments read.

    Raises:
        AssertionError: if an operation has an unknown type or the walked
            coordinates do not finish at the alignment's end coordinates.
    """
    pairsSet = set()
    totalHits = 0
    # The with-block closes the file even when one of the assertions fires.
    with open(resultsFile, 'r') as fileHandle:
        for pairwiseAlignment in cigarRead(fileHandle):
            totalHits += 1
            contig1 = pairwiseAlignment.contig1
            contig2 = pairwiseAlignment.contig2

            if pairwiseAlignment.strand1:
                i, s1 = pairwiseAlignment.start1, 1
            else:
                i, s1 = pairwiseAlignment.start1 - 1, -1
            if pairwiseAlignment.strand2:
                j, s2 = pairwiseAlignment.start2, 1
            else:
                j, s2 = pairwiseAlignment.start2 - 1, -1

            for operation in pairwiseAlignment.operationList:
                if operation.type == PairwiseAlignment.PAIRWISE_INDEL_X:
                    i += operation.length * s1
                elif operation.type == PairwiseAlignment.PAIRWISE_INDEL_Y:
                    j += operation.length * s2
                else:
                    assert operation.type == PairwiseAlignment.PAIRWISE_MATCH
                    for _ in xrange(operation.length):
                        if contig1 <= contig2:
                            if contig1 != contig2 or i != j:  # Avoid self alignments
                                pairsSet.add((contig1, i, contig2, j))
                        else:
                            pairsSet.add((contig2, j, contig1, i))
                        i += s1
                        j += s2

            # The walk must finish exactly on the recorded end coordinates.
            if pairwiseAlignment.strand1:
                assert i == pairwiseAlignment.end1
            else:
                assert i == pairwiseAlignment.end1 - 1
            if pairwiseAlignment.strand2:
                assert j == pairwiseAlignment.end2
            else:
                assert j == pairwiseAlignment.end2 - 1
    return (pairsSet, totalHits)
def loadResults(resultsFile):
    """Puts the results in a set.

    Every pairwise alignment in the cigar file at resultsFile is walked
    operation by operation. Each match column contributes one tuple
    (contig, position, contig, position), ordered so that the smaller contig
    name comes first; a column aligning a position to itself on one contig
    is ignored. On a negative strand the coordinate starts at start - 1 and
    decreases.

    Returns:
        (pairsSet, totalHits), where totalHits counts the alignments read.

    Raises:
        AssertionError: on an unknown operation type, or when the walk does
            not land on the alignment's end coordinates.
    """
    pairsSet = set()
    totalHits = 0
    # Context manager so the handle is released even if an assert fails.
    with open(resultsFile, 'r') as fileHandle:
        for pairwiseAlignment in cigarRead(fileHandle):
            totalHits += 1
            contig1 = pairwiseAlignment.contig1
            contig2 = pairwiseAlignment.contig2

            # (position, step) for each side of the alignment.
            i = pairwiseAlignment.start1
            s1 = 1
            if not pairwiseAlignment.strand1:
                i -= 1
                s1 = -1
            j = pairwiseAlignment.start2
            s2 = 1
            if not pairwiseAlignment.strand2:
                j -= 1
                s2 = -1

            for operation in pairwiseAlignment.operationList:
                if operation.type == PairwiseAlignment.PAIRWISE_INDEL_X:
                    i += operation.length * s1
                    continue
                if operation.type == PairwiseAlignment.PAIRWISE_INDEL_Y:
                    j += operation.length * s2
                    continue
                assert operation.type == PairwiseAlignment.PAIRWISE_MATCH
                for _ in xrange(operation.length):
                    if contig1 <= contig2:
                        if contig1 != contig2 or i != j:  # Avoid self alignments
                            pairsSet.add((contig1, i, contig2, j))
                    else:
                        pairsSet.add((contig2, j, contig1, i))
                    i += s1
                    j += s2

            # Sanity check: the walk ends where the alignment says it ends.
            expectedEnd1 = pairwiseAlignment.end1 if pairwiseAlignment.strand1 else pairwiseAlignment.end1 - 1
            expectedEnd2 = pairwiseAlignment.end2 if pairwiseAlignment.strand2 else pairwiseAlignment.end2 - 1
            assert i == expectedEnd1
            assert j == expectedEnd2
    return (pairsSet, totalHits)
def upconvertCoords(cigarPath, fastaPath, contigNum, outputFile):
    """Convert the coordinates of the given alignment, so that the
    alignment refers to a set of trimmed sequences originating from a
    contig rather than to the contig itself.

    A sorted copy of the cigar file is produced by sortCigarByContigAndPos
    and is always closed and removed again, also on failure. When the contig
    of an alignment is a key of the ranges read from fastaPath, the
    alignment's start and end on the selected side are reduced by the start
    of the range containing it, and "|<range start>" is appended to the
    contig name. Alignments on other contigs pass through unchanged.

    Args:
        cigarPath: path handed to sortCigarByContigAndPos.
        fastaPath: path of the fasta file handed to getSequenceRanges.
        contigNum: 1 selects contig2/start2/end2, anything else selects
            contig1/start1/end1 (the two are reversed in the python api).
        outputFile: destination passed to cigarWrite.

    Raises:
        RuntimeError: if the alignment start lies in none of the contig's
            ranges, or the alignment runs beyond the end of its range.
    """
    with open(fastaPath) as f:
        seqRanges = getSequenceRanges(f)
    validateRanges(seqRanges)

    # contig1 and contig2 are reversed in python api!!
    if contigNum == 1:
        contigAttr, startAttr, endAttr = "contig2", "start2", "end2"
    else:
        contigAttr, startAttr, endAttr = "contig1", "start1", "end1"

    sortedCigarPath = sortCigarByContigAndPos(cigarPath, contigNum)
    try:
        with open(sortedCigarPath) as sortedCigarFile:
            currentContig = None
            currentRangeIdx = None
            currentRange = None
            for alignment in cigarRead(sortedCigarFile):
                contig = getattr(alignment, contigAttr)
                start = getattr(alignment, startAttr)
                end = getattr(alignment, endAttr)
                minPos = min(start, end)
                maxPos = max(start, end)
                if contig in seqRanges:
                    ranges = seqRanges[contig]
                    if contig != currentContig:
                        # First alignment on this contig: rewind the scan.
                        currentContig = contig
                        currentRangeIdx = 0
                        currentRange = ranges[0]
                    # Forward-only scan; depends on alignments being in
                    # position order within a contig.
                    while ((minPos >= currentRange[1] or minPos < currentRange[0])
                           and currentRangeIdx < len(ranges) - 1):
                        currentRangeIdx += 1
                        currentRange = ranges[currentRangeIdx]
                    if not (currentRange[0] <= minPos < currentRange[1]):
                        raise RuntimeError("No trimmed sequence containing alignment "
                                           "on %s:%d-%d" % (contig, minPos, maxPos))
                    # NOTE(review): maxPos may exceed the range end by one
                    # without tripping this check; confirm that is intended.
                    if maxPos - 1 > currentRange[1]:
                        raise RuntimeError("alignment on %s:%d-%d crosses "
                                           "trimmed sequence boundary" %
                                           (contig, minPos, maxPos))
                    offset = currentRange[0]
                    setattr(alignment, startAttr, start - offset)
                    setattr(alignment, endAttr, end - offset)
                    setattr(alignment, contigAttr, contig + ("|%d" % offset))
                cigarWrite(outputFile, alignment, False)
    finally:
        # Scratch file: remove it whether or not the conversion succeeded.
        os.remove(sortedCigarPath)
def testCigarReadWrite(self):
    """Round-trip random pairwise alignments through cigarWrite/cigarRead.

    For each of self.testNo rounds, between 0 and 9 random alignments are
    written to a temporary file, read back, and compared one by one (in
    order) with the originals. Both versions are also echoed to stdout.
    The temporary file is registered in self.tempFiles.
    """
    tempFile = getTempFile()
    self.tempFiles.append(tempFile)
    for test in range(0, self.testNo):
        cigarNumber = random.choice(range(10))
        expected = [getRandomPairwiseAlignment() for i in range(cigarNumber)]

        # with-blocks make sure the handles are closed even when an
        # assertion below fails.
        with open(tempFile, 'w') as fileHandle:
            for cigar in expected:
                cigarWrite(fileHandle, cigar)

        with open(tempFile, 'r') as fileHandle:
            # Reversed so that pop() yields the alignments in write order.
            expected.reverse()
            for cigar in cigarRead(fileHandle):
                cigarWrite(sys.stdout, expected[-1])
                cigarWrite(sys.stdout, cigar)
                assert cigar == expected.pop()
            # Nothing written may be missing from what was read back.
            assert len(expected) == 0
def testCigarReadWrite(self):
    """Tests writing cigars and reading them back with the python parser.

    For each of self.testNo rounds, between 0 and 9 random alignments are
    written to a temporary file (registered in self.tempFiles), read back
    and compared in order with the originals. In about half of the rounds
    keepProbs is False; the operation scores are then zeroed before writing
    so that the comparison after the round trip still holds.

    NOTE: the step that ran the C program on the file
    ("sonLib_cigarTest <tempFile> <keepProbs>") is currently disabled, so
    only the python reader and writer are exercised here.
    """
    tempFile = getTempFile()
    self.tempFiles.append(tempFile)
    for test in range(0, self.testNo):
        pairwiseAlignmentNumber = random.choice(range(10))
        expected = [getRandomPairwiseAlignment() for i in range(pairwiseAlignmentNumber)]
        keepProbs = random.random() > 0.5
        if not keepProbs:
            for pA in expected:
                for op in pA.operationList:
                    op.score = 0.0

        # with-blocks make sure the handles are closed even when an
        # assertion below fails.
        with open(tempFile, 'w') as fileHandle:
            for pairwiseAlignment in expected:
                cigarWrite(fileHandle, pairwiseAlignment, keepProbs)

        # Now check the chain is okay
        with open(tempFile, 'r') as fileHandle:
            # Reversed so that pop() yields the alignments in write order.
            expected.reverse()
            for pairwiseAlignment in cigarRead(fileHandle):
                pairwiseAlignment2 = expected.pop()
                cigarWrite(sys.stdout, pairwiseAlignment, keepProbs)
                cigarWrite(sys.stdout, pairwiseAlignment2, keepProbs)
                assert pairwiseAlignment == pairwiseAlignment2
            # Nothing written may be missing from what was read back.
            assert len(expected) == 0
def expectationMaximisation(target, sequences, alignments, outputModel, options):
    """Prepare the initial HMM and the sampled alignment files for EM.

    Iteratively run cPecanRealign to get expectations and load model: the
    iteration itself is delegated to expectationMaximisation2, which this
    function registers as the follow-on (starting at iteration 0 with an
    empty list).

    What happens here:
      * The HMM is loaded from options.inputModel and normalised, or a new
        one of type options.modelType is created and either randomised
        (options.randomStart) or equalised.
      * If options.setJukesCantorStartingEmissions is set, the emissions are
        reset through setEmissionsToJukesCantor.
      * The model is written to outputModel.
      * The cigars from alignments are distributed over files in the global
        temp dir; a file is finished once it holds more than
        options.maxAlignmentLengthPerJob aligned bases.
      * The files are shuffled and taken in that order until
        options.maxAlignmentLengthToSample bases are reached.

    Args:
        target: job object; must provide logToMaster, getGlobalTempDir and
            setFollowOnTargetFn.
        sequences: forwarded untouched to expectationMaximisation2.
        alignments: whatever cigarRead accepts as its input.
        outputModel: destination passed to hmm.write, also forwarded.
        options: object carrying the option attributes named above.
    """
    if options.inputModel is not None:  # Read in the model
        target.logToMaster("Loading the model from the input file %s" % options.inputModel)
        hmm = Hmm.loadHmm(options.inputModel)
        target.logToMaster("Loaded the model, has type %s" % hmm.modelType)
        hmm.normalise()
    else:
        target.logToMaster("Making model of type %s" % options.modelType)
        hmm = Hmm(options.modelType)
        if options.randomStart:  # Make random parameters
            target.logToMaster("Using random starting parameters")
            hmm.randomise()
        else:
            hmm.equalise()
    if options.setJukesCantorStartingEmissions is not None:
        hmm.setEmissionsToJukesCantor(float(options.setJukesCantorStartingEmissions))

    # Write out the first version of the output model
    hmm.write(outputModel)

    # Make a set of split alignment files, kept as (path, length) pairs.
    splitAlignmentFiles = []
    currentPath = None
    currentLength = 0
    fH = None
    try:
        for cigar in cigarRead(alignments):
            if fH is None:
                currentPath = os.path.join(target.getGlobalTempDir(),
                                           "alignments_%s.cigar" % len(splitAlignmentFiles))
                fH = open(currentPath, 'w')
            # An alignment counts as the mean of its two aligned spans.
            currentLength += (abs(cigar.start1 - cigar.end1) +
                              abs(cigar.start2 - cigar.end2)) / 2.0
            cigarWrite(fH, cigar)
            if currentLength > options.maxAlignmentLengthPerJob:
                fH.close()
                fH = None
                splitAlignmentFiles.append((currentPath, currentLength))
                currentLength = 0
        if fH is not None:
            # Last file, not yet over the per-job limit.
            splitAlignmentFiles.append((currentPath, currentLength))
    finally:
        # Runs on errors too, so the file being written is always closed.
        if fH is not None:
            fH.close()

    # Sample the alignment files so that we do EM on no more than
    # options.maxAlignmentLengthToSample bases. The shuffle ensures we don't
    # just take the first N alignments in the provided alignments file.
    random.shuffle(splitAlignmentFiles)
    sampledSplitAlignmentFiles = []
    totalSampledAlignmentLength = 0.0
    for alignmentsFile, alignmentsLength in splitAlignmentFiles:
        totalSampledAlignmentLength += alignmentsLength
        sampledSplitAlignmentFiles.append(alignmentsFile)
        if totalSampledAlignmentLength >= options.maxAlignmentLengthToSample:
            break
    target.logToMaster("We sampled: %s bases of alignment length and %s alignment files, of a possible %s base and %s files" %
                       (totalSampledAlignmentLength, len(sampledSplitAlignmentFiles),
                        sum(length for _, length in splitAlignmentFiles),
                        len(splitAlignmentFiles)))
    splitAlignmentFiles = sampledSplitAlignmentFiles

    # Files to store expectations in, one per sampled alignment file. Built
    # as a real list so len() is valid even where map would be lazy.
    expectationsFiles = [os.path.join(target.getGlobalTempDir(), "expectation_%i.txt" % i)
                         for i in range(len(splitAlignmentFiles))]
    assert len(splitAlignmentFiles) == len(expectationsFiles)
    target.setFollowOnTargetFn(expectationMaximisation2,
                               args=(sequences, splitAlignmentFiles, outputModel,
                                     expectationsFiles, 0, [], options))