def getCactusInputs_randomWithConstraints(regionNumber=0, tempDir=None):
    """Build random cactus inputs plus a file of random pairwise constraints.

    Delegates to getCactusInputs_random for the sequences and tree, then
    writes every alignment yielded by makeRandomConstraints (run over the
    fastas of those sequences) to a fresh temporary file using cigarWrite
    with withProbs=False.

    :param regionNumber: forwarded unchanged to getCactusInputs_random.
    :param tempDir: forwarded to getCactusInputs_random and used as the
        rootDir of the temporary constraints file.
    :return: tuple (sequenceDirs, newickTreeString, constraints) where
        constraints is the path of the file that was written.
    """
    sequenceDirs, newickTreeString = getCactusInputs_random(
        regionNumber=regionNumber, tempDir=tempDir)
    constraints = getTempFile(rootDir=tempDir)
    # The context manager closes the file even when generating or writing a
    # constraint raises part-way through.
    with open(constraints, 'w') as fileHandle:
        for pairwiseAlignment in makeRandomConstraints(
                getFastasFromSequence(sequenceDirs)):
            cigarWrite(fileHandle, pairwiseAlignment, withProbs=False)
    return sequenceDirs, newickTreeString, constraints
def expectationMaximisation(target, sequences, alignments, outputModel, options):
    """Prepare an EM run and hand over to expectationMaximisation2.

    Builds the starting Hmm (loaded from options.inputModel, or created from
    options.modelType with random or equalised parameters), writes it to
    outputModel, splits the input cigars into chunk files under the target's
    global temp dir, samples a random subset of those chunks and registers
    expectationMaximisation2 as the follow-on target.

    :param target: job object; its logToMaster, getGlobalTempDir and
        setFollowOnTargetFn methods are used here.
    :param sequences: passed through untouched to expectationMaximisation2.
    :param alignments: source of cigars, handed directly to cigarRead.
    :param outputModel: destination given to hmm.write for the initial model
        and forwarded to expectationMaximisation2.
    :param options: settings object; reads inputModel, modelType,
        randomStart, setJukesCantorStartingEmissions,
        maxAlignmentLengthPerJob and maxAlignmentLengthToSample.
    """
    #Iteratively run cPecanRealign to get expectations and load model.
    if options.inputModel is not None: #Read in the model
        target.logToMaster("Loading the model from the input file %s" % options.inputModel)
        hmm = Hmm.loadHmm(options.inputModel)
        target.logToMaster("Loaded the model, has type %s" % hmm.modelType)
        hmm.normalise()
    else:
        target.logToMaster("Making model of type %s" % options.modelType)
        hmm = Hmm(options.modelType)
        if options.randomStart: #Make random parameters
            target.logToMaster("Using random starting parameters")
            hmm.randomise()
        else:
            hmm.equalise()
    if options.setJukesCantorStartingEmissions is not None:
        hmm.setEmissionsToJukesCantor(float(options.setJukesCantorStartingEmissions))

    #Write out the first version of the output model
    hmm.write(outputModel)

    #Make a set of split alignment files
    splitAlignmentFiles = [] # Completed chunks as (path, alignment length) pairs
    alignmentsLength = 0
    currentPath = None # Path of the chunk currently being written, if any
    fH = None
    try:
        for cigar in cigarRead(alignments):
            if fH is None:
                # Chunks are numbered by how many have been completed so far.
                currentPath = os.path.join(target.getGlobalTempDir(),
                                           "alignments_%s.cigar" % len(splitAlignmentFiles))
                fH = open(currentPath, 'w')
            # Length of a cigar is the mean of its extent on the two sequences.
            alignmentsLength += (abs(cigar.start1 - cigar.end1) + abs(cigar.start2 - cigar.end2))/2.0
            cigarWrite(fH, cigar)
            if alignmentsLength > options.maxAlignmentLengthPerJob:
                fH.close()
                fH = None
                splitAlignmentFiles.append((currentPath, alignmentsLength))
                currentPath = None
                alignmentsLength = 0
    finally:
        # Close a partially written chunk even if reading or writing raised.
        if fH is not None:
            fH.close()
    if currentPath is not None:
        # The last chunk never exceeded the per-job limit; record it as is.
        splitAlignmentFiles.append((currentPath, alignmentsLength))

    #Sample the alignment files so that we do EM on no more than options.maxAlignmentLengthToSample bases
    random.shuffle(splitAlignmentFiles) #This ensures we don't just take the first N alignments in the provided alignments file
    sampledSplitAlignmentFiles = []
    totalSampledAlignmentLength = 0.0
    for alignmentsFile, alignmentsLength in splitAlignmentFiles:
        totalSampledAlignmentLength += alignmentsLength
        sampledSplitAlignmentFiles.append(alignmentsFile)
        if totalSampledAlignmentLength >= options.maxAlignmentLengthToSample:
            break
    target.logToMaster("We sampled: %s bases of alignment length and %s alignment files, of a possible %s base and %s files" % \
                       (totalSampledAlignmentLength, len(sampledSplitAlignmentFiles),
                        sum(length for _, length in splitAlignmentFiles), len(splitAlignmentFiles)))
    splitAlignmentFiles = sampledSplitAlignmentFiles

    #Files to store expectations in
    # A real list (not map/xrange) so len() works on Python 3 as well as 2.
    expectationsFiles = [os.path.join(target.getGlobalTempDir(), "expectation_%i.txt" % i)
                         for i in range(len(splitAlignmentFiles))]
    assert len(splitAlignmentFiles) == len(expectationsFiles)

    target.setFollowOnTargetFn(expectationMaximisation2,
                               args=(sequences, splitAlignmentFiles, outputModel,
                                     expectationsFiles, 0, [], options))
def upconvertCoords(cigarPath, fastaPath, contigNum, outputFile):
    """Convert the coordinates of the given alignment, so that the
    alignment refers to a set of trimmed sequences originating from a
    contig rather than to the contig itself.

    Alignments whose contig has no entry in the ranges read from fastaPath
    are written out unchanged. For the others, the start/end coordinates are
    shifted by the start of the containing range and the contig name gets a
    "|<range start>" suffix.

    :param cigarPath: path handed to sortCigarByContigAndPos.
    :param fastaPath: path of the fasta file from which getSequenceRanges
        derives the per-contig ranges.
    :param contigNum: 1 selects the alignment's contig2/start2/end2 fields;
        any other value selects contig1/start1/end1.
    :param outputFile: destination passed to cigarWrite for every alignment
        (presumably an open, writable file object -- confirm with callers).
    :raises RuntimeError: if an alignment starts inside a range but extends
        past it, or if no range contains the alignment's start.
    """
    with open(fastaPath) as f:
        seqRanges = getSequenceRanges(f)
    validateRanges(seqRanges)
    sortedCigarPath = sortCigarByContigAndPos(cigarPath, contigNum)
    sortedCigarFile = open(sortedCigarPath)
    try:
        # The cigars are read from the sorted file, so the range cursor only
        # ever needs to move forward within one contig.
        currentContig = None
        currentRangeIdx = None
        currentRange = None
        for alignment in cigarRead(sortedCigarFile):
            # contig1 and contig2 are reversed in python api!!
            contig = alignment.contig2 if contigNum == 1 else alignment.contig1
            minPos = min(alignment.start2, alignment.end2) if contigNum == 1 else min(
                alignment.start1, alignment.end1)
            maxPos = max(alignment.start2, alignment.end2) if contigNum == 1 else max(
                alignment.start1, alignment.end1)
            if contig in seqRanges:
                if contig != currentContig:
                    # New contig: restart the cursor at its first range.
                    currentContig = contig
                    currentRangeIdx = 0
                    currentRange = seqRanges[contig][0]
                # Advance until a range contains minPos or the last range is hit.
                while ((minPos >= currentRange[1] or minPos < currentRange[0])
                       and currentRangeIdx < len(seqRanges[contig]) - 1):
                    currentRangeIdx += 1
                    currentRange = seqRanges[contig][currentRangeIdx]
                if currentRange[0] <= minPos < currentRange[1]:
                    # NOTE(review): this tolerates maxPos == currentRange[1] + 1;
                    # confirm whether that slack is intended for these coordinates.
                    if maxPos - 1 > currentRange[1]:
                        raise RuntimeError("alignment on %s:%d-%d crosses "
                                           "trimmed sequence boundary" %\
                                           (contig, minPos, maxPos))
                    if contigNum == 1:
                        alignment.start2 -= currentRange[0]
                        alignment.end2 -= currentRange[0]
                        alignment.contig2 = contig + ("|%d" % currentRange[0])
                    else:
                        alignment.start1 -= currentRange[0]
                        alignment.end1 -= currentRange[0]
                        alignment.contig1 = contig + ("|%d" % currentRange[0])
                else:
                    raise RuntimeError("No trimmed sequence containing alignment "
                                       "on %s:%d-%d" % (contig, minPos, maxPos))
            cigarWrite(outputFile, alignment, False)
    finally:
        # Close before removing, and clean up the temporary sorted file even
        # when one of the errors above is raised.
        sortedCigarFile.close()
        os.remove(sortedCigarPath)
def upconvertCoords(cigarPath, fastaPath, contigNum, outputFile):
    """Convert the coordinates of the given alignment, so that the
    alignment refers to a set of trimmed sequences originating from a
    contig rather than to the contig itself.

    Every alignment read from the sorted copy of cigarPath is written to
    outputFile. If its contig appears in the ranges read from fastaPath, its
    coordinates on the selected side are made relative to the start of the
    range containing it and the contig is renamed to "<contig>|<range start>";
    otherwise it is written unmodified.

    :param cigarPath: path handed to sortCigarByContigAndPos.
    :param fastaPath: fasta file from which getSequenceRanges builds the
        mapping of contig name to ranges.
    :param contigNum: 1 operates on contig2/start2/end2 of each alignment,
        anything else on contig1/start1/end1.
    :param outputFile: destination passed to cigarWrite.
    :raises RuntimeError: when an alignment crosses the end of the range it
        starts in, or when no range contains its start.
    """
    with open(fastaPath) as f:
        seqRanges = getSequenceRanges(f)
    validateRanges(seqRanges)
    sortedCigarPath = sortCigarByContigAndPos(cigarPath, contigNum)
    sortedCigarFile = open(sortedCigarPath)
    try:
        # "with" closes the sorted file before the finally clause removes it.
        with sortedCigarFile:
            currentContig = None
            currentRangeIdx = None
            currentRange = None
            for alignment in cigarRead(sortedCigarFile):
                # contig1 and contig2 are reversed in python api!!
                if contigNum == 1:
                    contig = alignment.contig2
                    start, end = alignment.start2, alignment.end2
                else:
                    contig = alignment.contig1
                    start, end = alignment.start1, alignment.end1
                minPos = min(start, end)
                maxPos = max(start, end)
                if contig in seqRanges:
                    contigRanges = seqRanges[contig]
                    if contig != currentContig:
                        # First alignment on this contig: rewind the cursor.
                        currentContig = contig
                        currentRangeIdx = 0
                        currentRange = contigRanges[0]
                    # Step forward while minPos is outside the current range
                    # and there are further ranges to try.
                    while ((minPos >= currentRange[1] or minPos < currentRange[0])
                           and currentRangeIdx < len(contigRanges) - 1):
                        currentRangeIdx += 1
                        currentRange = contigRanges[currentRangeIdx]
                    if not (currentRange[0] <= minPos < currentRange[1]):
                        raise RuntimeError("No trimmed sequence containing alignment "
                                           "on %s:%d-%d" % (contig, minPos, maxPos))
                    # NOTE(review): maxPos may exceed the range end by one
                    # without tripping this check; verify that is intended.
                    if maxPos - 1 > currentRange[1]:
                        raise RuntimeError("alignment on %s:%d-%d crosses "
                                           "trimmed sequence boundary" %\
                                           (contig, minPos, maxPos))
                    offset = currentRange[0]
                    if contigNum == 1:
                        alignment.start2 -= offset
                        alignment.end2 -= offset
                        alignment.contig2 = contig + ("|%d" % offset)
                    else:
                        alignment.start1 -= offset
                        alignment.end1 -= offset
                        alignment.contig1 = contig + ("|%d" % offset)
                cigarWrite(outputFile, alignment, False)
    finally:
        # Remove the temporary sorted file on success and on error alike.
        os.remove(sortedCigarPath)
def testCigarReadWrite(self):
    """Round-trip random pairwise alignments through cigarWrite/cigarRead.

    For each of self.testNo rounds, between 0 and 9 random alignments are
    written to a temp file, read back, and compared in order with the
    originals. Both versions are also echoed to stdout. The temp file path is
    registered in self.tempFiles.
    """
    tempFile = getTempFile()
    self.tempFiles.append(tempFile)
    for test in range(0, self.testNo):
        cigarNumber = random.choice(range(10))
        expectedCigars = [getRandomPairwiseAlignment() for i in range(cigarNumber)]
        # "with" makes sure the handle is closed even if a write fails.
        with open(tempFile, 'w') as fileHandle:
            for cigar in expectedCigars:
                cigarWrite(fileHandle, cigar)
        # Reversed so pop() hands back the alignments in written order.
        expectedCigars.reverse()
        with open(tempFile, 'r') as fileHandle:
            for cigar in cigarRead(fileHandle):
                cigarWrite(sys.stdout, expectedCigars[-1])
                cigarWrite(sys.stdout, cigar)
                # Pop outside the assert: asserts are stripped under -O and
                # must not carry the side effect that advances the list.
                expected = expectedCigars.pop()
                assert cigar == expected
            # Everything written must have been read back.
            assert len(expectedCigars) == 0
def testCigarReadWrite(self):
    """Round-trip random pairwise alignments through the python cigar
    writer and parser, with and without probabilities.

    Each of the self.testNo rounds writes 0-9 random alignments to a temp
    file, randomly choosing whether scores are kept; when they are not kept,
    every operation's score is zeroed first so the comparison after reading
    still holds. The alignments are then read back and compared in order.

    NOTE: the external sonLib_cigarTest step (which was to be run on the
    temp file with the keepProbs flag) is disabled in this test, so only the
    Python side is exercised.
    """
    tempFile = getTempFile()
    self.tempFiles.append(tempFile)
    for test in range(0, self.testNo):
        pairwiseAlignmentNumber = random.choice(range(10))
        expectedAlignments = [getRandomPairwiseAlignment()
                              for i in range(pairwiseAlignmentNumber)]
        keepProbs = random.random() > 0.5
        if not keepProbs:
            # Scores are not written in this mode; zero them on the originals.
            for pA in expectedAlignments:
                for op in pA.operationList:
                    op.score = 0.0
        with open(tempFile, 'w') as fileHandle:
            for pairwiseAlignment in expectedAlignments:
                cigarWrite(fileHandle, pairwiseAlignment, keepProbs)
        #Now check the chain is okay
        # Reversed so pop() returns the alignments in the order written.
        expectedAlignments.reverse()
        with open(tempFile, 'r') as fileHandle:
            for pairwiseAlignment in cigarRead(fileHandle):
                pairwiseAlignment2 = expectedAlignments.pop()
                cigarWrite(sys.stdout, pairwiseAlignment, keepProbs)
                cigarWrite(sys.stdout, pairwiseAlignment2, keepProbs)
                assert pairwiseAlignment == pairwiseAlignment2
            # Nothing written may be missing from what was read.
            assert len(expectedAlignments) == 0
def _makeStartingHmm(target, options):
    """Return the initial Hmm: loaded and normalised, or newly parameterised."""
    if options.inputModel is not None: #Read in the model
        target.logToMaster("Loading the model from the input file %s" %
                           options.inputModel)
        hmm = Hmm.loadHmm(options.inputModel)
        target.logToMaster("Loaded the model, has type %s" % hmm.modelType)
        hmm.normalise()
    else:
        target.logToMaster("Making model of type %s" % options.modelType)
        hmm = Hmm(options.modelType)
        if options.randomStart: #Make random parameters
            target.logToMaster("Using random starting parameters")
            hmm.randomise()
        else:
            hmm.equalise()
    if options.setJukesCantorStartingEmissions is not None:
        hmm.setEmissionsToJukesCantor(
            float(options.setJukesCantorStartingEmissions))
    return hmm


def _writeSplitAlignmentFiles(target, alignments, options):
    """Write the cigars into chunk files; return [(path, length), ...]."""
    chunks = []
    chunkPath = None # Path of the chunk being filled, None between chunks
    chunkLength = 0
    fH = None
    try:
        for cigar in cigarRead(alignments):
            if fH is None:
                # The chunk index is the number of chunks already finished.
                chunkPath = os.path.join(target.getGlobalTempDir(),
                                         "alignments_%s.cigar" % len(chunks))
                fH = open(chunkPath, 'w')
            # A cigar counts for the mean of its extent on both sequences.
            chunkLength += (abs(cigar.start1 - cigar.end1) +
                            abs(cigar.start2 - cigar.end2)) / 2.0
            cigarWrite(fH, cigar)
            if chunkLength > options.maxAlignmentLengthPerJob:
                fH.close()
                fH = None
                chunks.append((chunkPath, chunkLength))
                chunkPath = None
                chunkLength = 0
    finally:
        # Do not leak the open chunk if reading or writing a cigar raised.
        if fH is not None:
            fH.close()
    if chunkPath is not None:
        # Trailing chunk that stayed under the per-job limit.
        chunks.append((chunkPath, chunkLength))
    return chunks


def expectationMaximisation(target, sequences, alignments, outputModel, options):
    """Initialise the model and alignment chunks for EM, then schedule
    expectationMaximisation2 as the follow-on target.

    :param target: job object providing logToMaster, getGlobalTempDir and
        setFollowOnTargetFn.
    :param sequences: forwarded as-is to expectationMaximisation2.
    :param alignments: cigar source handed to cigarRead.
    :param outputModel: where the starting model is written via hmm.write;
        also forwarded to expectationMaximisation2.
    :param options: settings object; reads inputModel, modelType,
        randomStart, setJukesCantorStartingEmissions,
        maxAlignmentLengthPerJob and maxAlignmentLengthToSample.
    """
    #Iteratively run cPecanRealign to get expectations and load model.
    hmm = _makeStartingHmm(target, options)

    #Write out the first version of the output model
    hmm.write(outputModel)

    #Make a set of split alignment files
    splitAlignmentFiles = _writeSplitAlignmentFiles(target, alignments, options)

    #Sample the alignment files so that we do EM on no more than options.maxAlignmentLengthToSample bases
    random.shuffle(
        splitAlignmentFiles
    ) #This ensures we don't just take the first N alignments in the provided alignments file
    sampledSplitAlignmentFiles = []
    totalSampledAlignmentLength = 0.0
    for alignmentsFile, alignmentsLength in splitAlignmentFiles:
        totalSampledAlignmentLength += alignmentsLength
        sampledSplitAlignmentFiles.append(alignmentsFile)
        if totalSampledAlignmentLength >= options.maxAlignmentLengthToSample:
            break
    target.logToMaster("We sampled: %s bases of alignment length and %s alignment files, of a possible %s base and %s files" % \
                       (totalSampledAlignmentLength, len(sampledSplitAlignmentFiles),
                        sum(pair[1] for pair in splitAlignmentFiles), len(splitAlignmentFiles)))
    splitAlignmentFiles = sampledSplitAlignmentFiles

    #Files to store expectations in
    # Built as a list so the len() check below also works on Python 3,
    # where map() returns an iterator and xrange does not exist.
    expectationsFiles = [
        os.path.join(target.getGlobalTempDir(), "expectation_%i.txt" % i)
        for i in range(len(splitAlignmentFiles))
    ]
    assert len(splitAlignmentFiles) == len(expectationsFiles)

    target.setFollowOnTargetFn(expectationMaximisation2,
                               args=(sequences, splitAlignmentFiles,
                                     outputModel, expectationsFiles, 0, [],
                                     options))