def get_zygosity(sample, bwtfile, threshold, expected_length, ref_te):
    msbwt = MultiStringBWT.loadBWT(bwtfile, useMemmap=False)
    oc_length = 25
    ed_th = .2 * oc_length
    kmer_list = get_kmers(sample, ref_te)
    zygosity_data = []
    zygo_dict = {}
    other_context = {}
    for TEi_id, my_id, side, chromo, pos, strand, ref_te, context, TE, TSD in kmer_list:
        #if (TEi_id, ref_te) != (47,0):
        #    continue
        #print TEi_id,my_id,side,chromo,pos,strand,ref_te,context,TE,TSD
        new_context = MultiStringBWT.reverseComplement(context) if side == 'start' else context
        lo, hi = msbwt.findIndicesOfStr(new_context)
        context_List = set()
        context_List = growContext(msbwt, lo, hi, '', oc_length, context_List)
        zygo_dict[TEi_id] = zygo_dict.get(TEi_id, set([1]))
        for oc in context_List:
            oc = MultiStringBWT.reverseComplement(oc) if side == 'start' else oc
            ed = lv.distance(oc, TE[:oc_length]) if side == 'start' else lv.distance(oc, TE[-oc_length:])
            TE = TE[:oc_length] if side == 'start' else TE[-oc_length:]
            if ed > ed_th:
                zygo_dict[TEi_id] = zygo_dict[TEi_id] | set([0])
                other_context[TEi_id] = oc
        zygosity_data.append([TEi_id, my_id, side, chromo, pos, strand,
                              ref_te, context, TE, TSD])
    individual_file = "IndividualAnalysis/%s_%s.csv" % (sample, ref_te)
    zygosity_data.sort(key=lambda x: (x[3], x[4]))
    header = ['TEi_id', 'my_id', 'side', 'chromo', 'pos', 'strand', 'ref_te',
              'context', 'TE', 'TSD', 'other_context', 'zygosity']
    with open(individual_file, 'wb') as fp:
        a = csv.writer(fp, delimiter=',')
        a.writerows([header])
        for d in zygosity_data:
            zygosity = 'heterozygous' if len(zygo_dict[d[0]]) == 2 else 'homozygous'
            d.append(other_context.get(d[0], ''))
            d.append(zygosity)
            a.writerows([d])
    print "Wrote file: %s [%d lines]" % (individual_file, len(zygosity_data))
def runQuery(**kwargs):
    pieces = kwargs["dataset"].split('-')
    directory = MSBWTdirs[int(pieces[0])] + '/' + '-'.join(pieces[1:])
    # load the MSBWT
    msbwt = MSBWT.loadBWT(directory)
    if kwargs['forward'] == "true":
        forwardResults = [msbwt.countOccurrencesOfSeq(str(kmer))
                          for kmer in kwargs['kmerQueries']]
    else:
        forwardResults = []
    if kwargs['revComp'] == "true":
        rcResults = [msbwt.countOccurrencesOfSeq(MSBWT.reverseComplement(str(kmer)))
                     for kmer in kwargs['kmerQueries']]
    else:
        rcResults = []
    return [forwardResults, rcResults]
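# A minimal usage sketch for runQuery, hedged: the dataset string below and the
# contents of MSBWTdirs are hypothetical and depend on how the hosting service
# indexes its BWT directories (the convention here is "<dirIndex>-<datasetName>").
#
#   results = runQuery(dataset='0-mySample',
#                      kmerQueries=['ACGTACGTACGT'],
#                      forward='true',
#                      revComp='true')
#   forwardCounts, rcCounts = results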
def _load_bwts(self, bwt_dirs):
    msbwt = {}
    for ff in bwt_dirs:
        if not io.readable_dir(ff):
            continue
        name = os.path.basename(ff.rstrip("/"))
        msbwt.update({name: ms.loadBWT(ff)})
    if len(msbwt):
        return msbwt
    else:
        return None
def checkAlive():
    names = []
    for filename in os.listdir(app.config['BWT_ROOT']):
        try:
            bwt = MSBWT.loadBWT(app.config['BWT_ROOT'] + filename)
            if bwt.countOccurrencesOfSeq('T'.encode('utf-8', 'ignore')) > 0:
                names.append(filename.decode('utf-8'))
            else:
                continue
        except Exception as e:
            print(e)
            continue
    data = {"names": names}
    return Response(json.dumps(data), status=200)
def find_Kmer(Kmer):
    outf = open(outdir + 'Tumor_kmers_{}.txt'.format(Kmer), 'w')
    msbwt_tumor = MSBWT.loadBWT(args.tumor_bwt)
    msbwt_normal = MSBWT.loadBWT(args.normal_bwt)
    logging.info("finished loading BWTs")
    tLow, tHigh = msbwt_tumor.findIndicesOfStr(Kmer)
    nLow, nHigh = msbwt_normal.findIndicesOfStr(Kmer)

    def Kmer_count(tLow, tHigh, nLow, nHigh, Kmer, outf):
        # refine each BWT range by the newly prepended character
        tLow, tHigh = msbwt_tumor.findIndicesOfStr(Kmer[0], (tLow, tHigh))
        nLow, nHigh = msbwt_normal.findIndicesOfStr(Kmer[0], (nLow, nHigh))
        tumor_count = tHigh - tLow
        normal_count = nHigh - nLow
        if tumor_count > tumor_threshold and normal_count < normal_threshold:
            outf.write(Kmer + '\t' + str(tumor_count) + '\t' +
                       str(normal_count) + '\t' + Kmer + '\n')
            return
        elif tumor_count <= tumor_threshold or len(Kmer) == read_length:
            return
        # recursively extend the k-mer leftward by each nucleotide
        for nucleotide in nucleotide_list:
            Kmer = nucleotide + Kmer
            Kmer_count(tLow, tHigh, nLow, nHigh, Kmer, outf)
            Kmer = Kmer[1:]

    for nucleotide in nucleotide_list:
        Kmer = nucleotide + Kmer
        Kmer_count(tLow, tHigh, nLow, nHigh, Kmer, outf)
        Kmer = Kmer[1:]
    outf.close()
def load_bwts(bwt_dirs):
    msbwt = []
    for ff in bwt_dirs:
        if not io.readable_dir(ff):
            continue
        try:
            msbwt.append(ms.loadBWT(ff))
        except Exception as e:
            sys.stderr.write("Couldn't load BWT at <{}>\n".format(ff))
            sys.stderr.write(str(e))
    if len(msbwt):
        return msbwt
    else:
        return None
def loadBWT(name, forceLocal=False):
    logIt("Loading %s...\n" % name)
    if not forceLocal:
        try:
            logIt("Trying remote source...\n")
            remoteSource = findRemote(name)
            return CloudBwt(name, remoteSource)
        except Exception as e:
            logIt(" Failed\n" + e.message)
            pass
    try:
        localSource = findLocal(name)
        return MSBWT.loadBWT(localSource)
    except Exception as e:
        return None
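# A usage sketch for loadBWT's remote-then-local fallback; the dataset name is
# hypothetical. Passing forceLocal=True skips the remote lookup entirely, and a
# None return means neither source could be loaded.
#
#   bwt = loadBWT('mySample')                    # tries findRemote, then findLocal
#   localBwt = loadBWT('mySample', forceLocal=True)
#   if localBwt is None:
#       logIt("No local copy of mySample\n")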
def functionCaller(name, func_call):
    if DEBUG:
        print("Serving {}".format(name))
    bwt = MSBWT.loadBWT(app.config['BWT_ROOT'] +
                        name.encode('utf-8', 'ignore') +
                        '/'.encode('utf-8', 'ignore'))
    args = ast.literal_eval(request.args.get('args', None))
    kwargs = request.args.get('kwargs', None)
    async_flag = request.args.get('async', None)
    if args is None:
        return Response(status=400)
    if kwargs is not None:
        kwargs = ast.literal_eval(kwargs)
    else:
        kwargs = {}
    # Legacy compatibility: disable non-blocking functionality
    if async_flag is None or async_flag.lower() == 'false':
        ar = [func_call, args, kwargs, bwt]
        r = executor.submit(_runLegacy, *ar)
        return Response(json.dumps({'result': r.result()}), status=200)
    tok = getToken()
    st = 405
    try:
        results_lst[tok] = {}
        ar = [func_call, args, kwargs, bwt, tok]
        executor.submit(_run, *ar)
        results_lst[tok]['func'] = func_call
        results_lst[tok]['args'] = args
        results_lst[tok]['kwargs'] = kwargs
        st = 200
    except:
        st = 405
    summary = {
        'data': app.config['DATA'],
        'name': name,
        'token': tok,
        'function': func_call,
        'args': args,
        'kwargs': kwargs
    }
    return Response(json.dumps(summary), status=st)
def api_search():
    sequence = str(request.values['sequence'])
    sample = str(request.values['sample'])
    count = 0
    print('submit called with: {} ... {}'.format(sequence, sample))
    bwt_dir = os.environ.get('BWT_DIR', None)
    sample_dir = os.path.join(bwt_dir, sample)
    print('sample_dir=', sample_dir)
    msbwt = MultiStringBWT.loadBWT(sample_dir)
    print('msbwt=', msbwt)
    count = msbwt.countOccurrencesOfSeq(sequence)
    return jsonify({
        'sequence': str(sequence),
        'sample': str(sample),
        'count': count,
    })
def firstTimeExtension(self, foundKmers, unexploredPaths, nodes, edges):
    '''
    @param foundKmers - Each kmer we find will be checked against this and added if not present
    @param unexploredPaths - if we find a new path split, we add the things here, also merges are important to add here
    @param nodes - the list of nodes if we find a new one
    '''
    pc = ''
    kmer = self.seq
    terminate = False
    while not terminate:
        if len(kmer) != self.pathK:
            print('ERROR: DIFFERENT SIZED K-MER ' + str(len(kmer)))
            raise Exception('ERROR')
        # First, perform all the counts of paths going both forwards and backwards
        counts = {}
        revCounts = {}
        # maxV - the count of the (k+1)-mer with maxC on it, total is the total counts of valid chars
        maxV = 0
        maxC = ''
        total = 0
        # count the number of forward and reversed paths
        numPaths = 0
        numRevPaths = 0
        for c in self.validChars:
            counts[c] = (self.msbwt.countOccurrencesOfSeq(kmer + c) +
                         self.msbwt.countOccurrencesOfSeq(MultiStringBWT.reverseComplement(kmer + c)))
            revCounts[c] = (self.msbwt.countOccurrencesOfSeq(c + kmer) +
                            self.msbwt.countOccurrencesOfSeq(MultiStringBWT.reverseComplement(c + kmer)))
            if self.drawDollarTerminals or c != '$':
                total += counts[c]
                if counts[c] > maxV:
                    maxV = counts[c]
                    maxC = c
                if counts[c] >= self.pathThreshold:
                    numPaths += 1
                # if we have evidence from the counts OR if the previous character was known to be that character
                if revCounts[c] >= self.pathThreshold or c == pc:
                    numRevPaths += 1
        # check if we have incoming edges, in which case we need to end this block
        if numRevPaths > 1 and kmer != self.seq:
            # remove the last kmer, because it's actually in the new node we merge into
            self.seq = self.seq[0:-1]
            # this will lead to repeating the same counts later, but that's okay
            newID = len(nodes)
            newHistMers = set([])
            nodes.append(PathNode(newID, kmer, self.msbwt,
                                  self.minDistToSeed + len(self.pileups), self.settingsDict))
            edges.append(PathEdge(len(edges), self.nodeID, newID, revCounts[pc],
                                  pc + ', ' + str(revCounts)))
            self.termCondition = 'MERGE_' + str(newID)
            foundKmers[kmer] = newID
            unexploredPaths.append(nodes[newID])
            #print 'Ending block for merge'
            terminate = True
        elif total == 0:
            #print 'No strings found.'
            self.termCondition = 'TERMINAL'
            terminate = True
        else:
            # the kmer was found in this block and it may have multiple extensions
            foundKmers[kmer] = self.nodeID
            revMer = MultiStringBWT.reverseComplement(kmer)
            if foundKmers.has_key(revMer):
                otherID = foundKmers[revMer]
                self.inversionSet.add(otherID)
                nodes[otherID].inversionSet.add(self.nodeID)
            r1 = self.msbwt.findIndicesOfStr(kmer[-self.countK:])
            r2 = self.msbwt.findIndicesOfStr(MultiStringBWT.reverseComplement(kmer[-self.countK:]))
            kmerCount = (r1[1] - r1[0]) + (r2[1] - r2[0])
            self.pileups.append(kmerCount)
            perc = float(maxV) / total
            if self.trackReads == True:
                for i in range(r1[0], r1[1]):
                    self.readSet.add((int(self.msbwt.getSequenceDollarID(i)), 0))
                for i in range(r2[0], r2[1]):
                    self.readSet.add((int(self.msbwt.getSequenceDollarID(i)), 1))
            #if kmerCount > self.overloadThreshold:
            if self.pileups[0] > self.overloadThreshold:
                # this path is too heavy, we probably won't figure out what's going on downstream
                self.termCondition = 'OVERLOAD'
                terminate = True
            elif numPaths > 1:
                self.termCondition = 'SPLIT'
                for c in self.validChars:
                    if counts[c] >= self.pathThreshold:
                        newKmer = kmer[1:] + c
                        if foundKmers.has_key(newKmer):
                            otherNID = foundKmers[newKmer]
                            nodes[otherNID].minDistToSeed = min(nodes[otherNID].minDistToSeed,
                                                                self.minDistToSeed + len(self.pileups))
                            edges.append(PathEdge(len(edges), self.nodeID, otherNID,
                                                  counts[c], c + ': ' + str(counts[c])))
                        else:
                            if self.drawDollarTerminals or c != '$':
                                newID = len(nodes)
                                newHistMers = set([])
                                nodes.append(PathNode(newID, newKmer, self.msbwt,
                                                      self.minDistToSeed + len(self.pileups),
                                                      self.settingsDict))
                                edges.append(PathEdge(len(edges), self.nodeID, newID,
                                                      counts[c], c + ': ' + str(counts[c])))
                                foundKmers[newKmer] = newID
                                if c != '$':
                                    unexploredPaths.append(nodes[newID])
                                else:
                                    nodes[newID].termCondition = '$ Ext'
                terminate = True
            else:
                # this is data pertaining to this k-mer
                #print ':\t'+kmer+maxC+'\t'+str(perc)+'\t'+str(maxV)+'/'+str(total)+'\t'+str(total-maxV)+'\t'
                pc = kmer[0]
                kmer = kmer[1:] + maxC
                # check if we've found the new k-mer before
                if foundKmers.has_key(kmer):
                    otherNID = foundKmers[kmer]
                    nodes[otherNID].minDistToSeed = min(nodes[otherNID].minDistToSeed,
                                                        self.minDistToSeed + len(self.pileups))
                    if counts[maxC] >= self.pathThreshold:
                        edges.append(PathEdge(len(edges), self.nodeID, otherNID,
                                              counts[maxC], pc + ': ' + str(counts[maxC])))
                        self.termCondition = 'MERGE_' + str(otherNID)
                    else:
                        edges.append(PathEdge(len(edges), self.nodeID, otherNID,
                                              counts[maxC], pc + ': ' + str(counts[maxC]), 'dashed'))
                        self.termCondition = 'MERGE_' + str(otherNID) + ', THRESHOLD'
                    terminate = True
                else:
                    self.seq += maxC
                    if maxC == '$':
                        self.termCondition = '$ Max'
                        terminate = True
def build_bridge(msbwt, seedKmer, targetKmer, tMin=1, branchLim=10, maxBranchLen=250):
    """
    Assemble the short "bridge" between two sequences expected to occur nearby on the template.
    @param msbwt - the MSBWT to use for searches
    @param seedKmer - a k-mer to seed our bridging
    @param targetKmer - the target we are trying to bridge to
    @param tMin - the minimum k-count needed to consider the path
    @param branchLim - the maximum number of branches we will test
    @param maxBranchLen - the maximum length of a branch before giving up
    @return (ret, numBranched)
        ret - a list of bridges discovered; for most cases this is a list of length one
        numBranched - the number of branches we explored; if numBranched >= branchLim,
            this function was not 100% exhaustive
    """
    # initialize to our input kmer
    ret = []
    possBridges = [dna.unmask(dna.ungap(seedKmer))]
    targetKmer = dna.unmask(dna.ungap(targetKmer))
    kmerLen = len(seedKmer)
    # set up some easy values
    validChars = "ACGT"
    counts = np.zeros(dtype='<u8', shape=(len(validChars), ))
    numBranched = 0
    #print (seedKmer, targetKmer)
    # while we have things to explore, and we haven't explored too many,
    # and we don't have a ridiculous number of possibilities
    while len(possBridges) > 0 and numBranched < branchLim:
        # get the bridge, the kmer, and the reverse kmer
        currBridge = possBridges.pop()
        numBranched += 1
        currKmer = currBridge[len(currBridge) - kmerLen:]
        revKmer = MultiStringBWT.reverseComplement(currKmer)
        # try to extend it on out
        while len(currBridge) < maxBranchLen:
            # get the counts for each possible extension
            for i, c in enumerate(validChars):
                counts[i] = (msbwt.countOccurrencesOfSeq(currKmer + c) +
                             msbwt.countOccurrencesOfSeq(dna.revcomp(c) + revKmer))
            # get the highest one
            maxPos = np.argmax(counts)
            maxSym = validChars[maxPos]
            # make sure the highest is high enough for us to consider it
            if counts[maxPos] >= tMin:
                if len(possBridges) < branchLim:
                    # go through all the other possible extensions
                    for i, c in enumerate(validChars):
                        if i != maxPos and counts[i] >= tMin:
                            # add the ones we aren't exploring right now if they're high enough
                            possBridges.append(currBridge + c)
                # this extension meets our requirement so shift over to loop back around
                currBridge += maxSym
                currKmer = currKmer[1:] + maxSym
                revKmer = dna.revcomp(maxSym) + revKmer[0:len(revKmer) - 1]
            else:
                # our BEST doesn't pass the threshold on this path, stop following
                break
            if currKmer.startswith(targetKmer):
                ret.append(currBridge)
    # return all our possibilities
    return (ret, numBranched)
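# A usage sketch for build_bridge, assuming a BWT directory at the hypothetical
# path 'myBwtDir' and that MultiStringBWT and dna are importable as in this
# module. The seed and target k-mers are illustrative only; they should be the
# same length and drawn from reads expected to flank the gap being bridged.
#
#   msbwt = MultiStringBWT.loadBWT('myBwtDir')
#   bridges, explored = build_bridge(msbwt,
#                                    seedKmer='ACGTACGTACGTACGTACGTACGT',
#                                    targetKmer='TTGCATTGCATTGCATTGCATTGC',
#                                    tMin=2, branchLim=10, maxBranchLen=250)
#   if explored >= 10:
#       print "warning: branch limit hit, search was not exhaustive"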
def extendSeed(self, seedKmer, endSeeds):
    '''
    This function is intended to be an interactive technique for constructing transcripts,
    probably to be released in a future version of msbwt
    @param bwtFN - the filename of the BWT to load
    @param seedKmer - the seed sequence to use for construction
    @param threshold - minimum number for a path to be considered a path
    @param direction - True is forward, False is backward
    @param logger - the logger
    '''
    if self.foundKmers.has_key(seedKmer):
        return
    pathK = self.settingsDict.get('kmerSize', len(seedKmer))
    countK = self.settingsDict.get('countK', pathK)
    isMerged = self.settingsDict.get('isMerged', False)
    trackPairs = self.settingsDict.get('trackPairs', False)
    trackReads = self.settingsDict.get('trackReads', False)
    useMemmap = self.settingsDict.get('useMemmap', True)
    maxDistance = self.settingsDict.get('maxDistance', 0xFFFFFFFF)
    if len(seedKmer) != pathK:
        raise Exception('Seed k-mer incorrect length')
    numNodes = self.settingsDict['numNodes']
    validChars = ['$', 'A', 'C', 'G', 'N', 'T']
    if self.logger != None:
        self.logger.info('Loading ' + self.bwtDir + '...')
    msbwt = MultiStringBWT.loadBWT(self.bwtDir, useMemmap, self.logger)
    if os.path.exists(self.bwtDir + '/origins.npy'):
        raise Exception("You haven't reimplemented the handling of origin files")
        origins = np.load(self.bwtDir + '/origins.npy', 'r')
    else:
        origins = None
    self.settingsDict['interleaveFN'] = self.bwtDir + '/inter0.npy'
    kmer = seedKmer
    firstID = len(self.nodes)
    self.nodes.append(PathNode(firstID, kmer, msbwt, 0, self.settingsDict))
    self.foundKmers[kmer] = firstID
    for i, endSeed in enumerate(endSeeds):
        if len(endSeed) != pathK:
            raise Exception(endSeed + ': NOT CORRECT LENGTH')
        else:
            endID = len(self.nodes)
            self.nodes.append(PathNode(endID, endSeed, msbwt, 0, self.settingsDict))
            self.nodes[endID].termCondition = 'END_SEED_' + str(i)
            self.foundKmers[endSeed] = endID
    if self.logger != None:
        self.logger.info('Beginning with seed \'' + seedKmer + '\', pathK=' +
                         str(pathK) + ', countK=' + str(countK))
    unexploredPaths = [self.nodes[firstID]]
    # init the kmer dictionary
    execID = firstID
    while len(unexploredPaths) > 0:
        # uncomment to make this smallest first
        unexploredPaths.sort(key=lambda node: node.minDistToSeed)
        #print 'UP: '+'['+','.join([str((node.minDistToSeed, node.nodeID)) for node in unexploredPaths])+']'
        nextNode = unexploredPaths.pop(0)
        if nextNode.nodeID >= numNodes:
            nextNode.termCondition = 'UNEXPLORED_NODE'
        elif nextNode.minDistToSeed >= maxDistance:
            nextNode.termCondition = 'UNEXPLORED_DIST'
        else:
            nextNode.execOrder = execID
            execID += 1
            if self.logger != None:
                self.logger.info('Exploring new node')
            nextNode.firstTimeExtension(self.foundKmers, unexploredPaths,
                                        self.nodes, self.edges)
    if isMerged and trackReads:
        interleaveFN = self.bwtDir + '/inter0.npy'
        interleave = np.load(interleaveFN, 'r')
        # we only need to do this for newly processed nodes
        for node in self.nodes[firstID:]:
            dIDs = node.readSet
            for dID in dIDs:
                sourceID = interleave[dID[0]]
                node.sourceCounts[sourceID] = node.sourceCounts.get(sourceID, 0) + 1
    if trackPairs:
        abtFN = self.bwtDir + '/abt.npy'
        abt = np.load(abtFN, 'r')
        #abtDict = {}
        # only need to process new nodes
        for node in self.nodes[firstID:]:
            dIDs = node.readSet
            for dID, direction in dIDs:
                (fID, rID) = abt[dID]
                if fID % 2 == 0:
                    oFID = fID + 1
                else:
                    oFID = fID - 1
                if self.abtDict.has_key((oFID, rID, 1 - direction)):
                    otherNIDs = self.abtDict[(oFID, rID, 1 - direction)][1]
                    for n in otherNIDs:
                        self.nodes[n].pairedNodes[node.nodeID] = \
                            self.nodes[n].pairedNodes.get(node.nodeID, 0) + 1
                        node.pairedNodes[n] = node.pairedNodes.get(n, 0) + 1
                if not self.abtDict.has_key((fID, rID, direction)):
                    self.abtDict[(fID, rID, direction)] = (dID, set([]))
                self.abtDict[(fID, rID, direction)][1].add(node.nodeID)
def SearchResponse(form):
    panel = markup.page()
    panel.script(type="text/javascript")
    panel.add("""
        function getSelectedText() {
            var hidden, submit;
            var selectedText=(window.getSelection ? window.getSelection() :
                              document.getSelection ? document.getSelection() :
                              document.selection.createRange().text);
            if (selectedText == "") {
                alert("You must select a subsequence");
                return false;
            } else {
                document.forms["SearchSelected"]["pattern"].value = selectedText;
            }
        }
    """)
    panel.script.close()
    panel.div(style="padding:50px 50px;")
    datasets = form.getvalue("dataset")
    if (datasets == None):
        panel.h3("ERROR: No datasets selected.")
        panel.div(align="center", style="padding: 30px 30px;")
        panel.input(type="button", value="New Search", onClick='self.location="./?run=msAllele"')
        panel.div.close()
        panel.div.close()
        return panel
    if isinstance(datasets, str):
        datasets = [datasets]
    pattern = form.getvalue("pattern")
    if (pattern == None):
        panel.h3("ERROR: No search pattern specified")
        panel.div(align="center", style="padding: 30px 30px;")
        panel.input(type="button", value="New Search", onClick='self.location="./?run=msAllele"')
        panel.div.close()
        panel.div.close()
        return panel
    pattern = pattern.upper()
    for dataset in datasets:
        panel.h3(dataset)
        bwtDirName = "%s/%s" % (MSBWTdir, dataset)
        filestat = os.stat(bwtDirName + "/comp_msbwt.npy")
        filesize = locale.format("%d", filestat.st_size, grouping=True)
        bwt = MultiStringBWT.loadBWT(bwtDirName)
        stringCount = locale.format("%d", bwt.getSymbolCount(0), grouping=True)
        baseCount = locale.format("%d", bwt.getTotalSize(), grouping=True)
        bitsPerBase = (8.0 * filestat.st_size) / bwt.getTotalSize()
        panel.strong("%s: %s strings with %s bases and index size of %s bytes (%3.2f bits per base)<br />"
                     % (dataset, stringCount, baseCount, filesize, bitsPerBase))
        panel.strong("Target: %s<br />" % (pattern))
        lo1, hi1 = bwt.findIndicesOfStr(pattern)
        lo2, hi2 = bwt.findIndicesOfStr(revComp(pattern))
        count = hi1 - lo1 + hi2 - lo2
        if (count > 10000):
            panel.add("Found %d times (%d forward, %d reverse-complemented)<br /><br />"
                      % (count, hi1 - lo1, hi2 - lo2))
            panel.span("Too much data!", style="font-size: 180%;")
        elif count > 0:
            '''
            l = len(pattern)
            bufferLen = 101
            fixedSize = 2*bufferLen-l
            readlist = []
            for i in xrange(lo1, hi1):
                suffix = bwt.recoverString(i)
                suffLen = len(suffix)
                end = suffix.find('$')
                beforePattern = suffLen-end-1
                read = ('.'*(bufferLen-l-beforePattern)+
                        suffix[end+1:].lower()+
                        suffix[:l]+
                        suffix[l:end+1].lower())
                read += '.'*(fixedSize-len(read))
                readlist.append(read)
            for i in xrange(lo2, hi2):
                suffix = revComp(bwt.recoverString(i))
                suffLen = len(suffix)
                end = suffix.find('$')
                beforePattern = suffLen-end-l
                read = ('.'*(bufferLen-l-beforePattern)+
                        suffix[end:-l].lower()+
                        suffix[-l:]+
                        suffix[:end].lower())
                read += '.'*(fixedSize-len(read))
                readlist.append(read)
            '''
            panel.add("Found %d times (%d forward, %d reverse-complemented)<br /><br />"
                      % (count, hi1 - lo1, hi2 - lo2))
            panel.div(style="font-size:10px; font-family: monospace;")
            #margin = len(suffix)-l
            l = len(pattern)
            margin = 101 - l
            haps = extractHaplotypes(bwt, pattern)
            if len(haps) > 0:
                consensusMain = (sorted(haps, key=lambda x: x[2][0][1], reverse=True))[0][0]
                panel.table(border='1')
                panel.tr()
                panel.th('Consensus')
                panel.th('Exact matches')
                panel.tr.close()
                extrasList = []
                for consensus, readlist, counts in sorted(haps, key=lambda x: x[2][0][1], reverse=True):
                    if counts[0][1] > 0:
                        panel.tr()
                        panel.td()
                        panel.strong()
                        output = ""
                        for i, base in enumerate(consensus):
                            if i == margin:
                                output += '<span style="color: green;">'
                            elif i == margin + l:
                                output += '</span>'
                            if (base != '$') and (base != '.') and (consensus[i] != '.') and \
                                    (base.upper() != consensusMain[i].upper()):
                                output += '<span style="background-color:yellow;">%s</span>' % base.upper()
                            else:
                                output += base.upper()
                        panel.add(output)
                        panel.strong.close()
                        panel.td.close()
                        panel.td(str(counts[0][1]))
                        panel.tr.close()
                    for read in readlist[counts[0][1]:]:
                        extrasList.append(read)
                if len(extrasList) > 0:
                    consensus, dummyVar = conSeq(extrasList)
                    panel.tr()
                    panel.td()
                    panel.strong()
                    output = ""
                    for i, base in enumerate(consensus):
                        if i == margin:
                            output += '<span style="color: green;">'
                        elif i == margin + l:
                            output += '</span>'
                        if (base != '$') and (base != '.') and (consensus[i] != '.') and \
                                (base.upper() != consensusMain[i].upper()):
                            output += '<span style="background-color:yellow;">%s</span>' % base.upper()
                        else:
                            output += base.upper()
                    panel.add(output)
                    panel.strong.close()
                    panel.td.close()
                    panel.td('0')
                    panel.tr.close()
                panel.table.close()
                for consensus, readlist, counts in sorted(haps, key=lambda x: x[2][0][1], reverse=True):
                    #consensus = conSeq(readlist)
                    #readlist.sort(cmp=readCmp)
                    if counts[0][1] == 0:
                        continue
                    read = "." * margin + "*" * l + '.' * margin
                    panel.add(read)
                    panel.br()
                    for read in readlist[0:counts[0][1]]:
                        color = "red" if (read.find('$') > read.find(pattern)) else "blue"
                        output = ""
                        for i, base in enumerate(read):
                            if (i == margin):
                                output += '<span style="color: %s;">' % color
                            elif (i == margin + l):
                                output += '</span>'
                            if (base != '$') and (base != '.') and (consensus[i] != '.') and \
                                    (base.upper() != consensus[i].upper()):
                                output += '<span style="background-color:yellow;">%s</span>' % base
                            else:
                                output += base
                        output += '<br />'
                        panel.add(output)
                    panel.strong('%s<span style="color: green;">%s</span>%s<br />'
                                 % (consensus[:margin], consensus[margin:margin + l], consensus[margin + l:]))
                    panel.br()
                    panel.br()
                if len(extrasList) > 0:
                    consensus, dummyVar = conSeq(extrasList)
                    panel.add(consensus)
                    extrasList.sort(cmp=readCmp)
                    read = "." * margin + "*" * l + '.' * margin
                    panel.add(read)
                    panel.br()
                    for read in extrasList:
                        color = "red" if (read.find('$') > read.find(pattern)) else "blue"
                        output = ""
                        for i, base in enumerate(read):
                            if (i == margin):
                                output += '<span style="color: %s;">' % color
                            elif (i == margin + l):
                                output += '</span>'
                            if (base != '$') and (base != '.') and (consensus[i] != '.') and \
                                    (base.upper() != consensus[i].upper()):
                                output += '<span style="background-color:yellow;">%s</span>' % base
                            else:
                                output += base
                        output += '<br />'
                        panel.add(output)
                    panel.strong('%s<span style="color: green;">%s</span>%s<br />'
                                 % (consensus[:margin], consensus[margin:margin + l], consensus[margin + l:]))
                    panel.br()
            panel.div.close()
        else:
            panel.add("Pattern not found<br /><br />")
    panel.form(action="", name="SearchSelected", method="POST",
               enctype="multipart/form-data", onsubmit='return getSelectedText()')
    panel.div(align="center", style="padding: 30px 30px;")
    panel.input(type="submit", name="submit", value="Search Selected")
    panel.input(type="button", value="New Search", onClick='self.location="./?run=msAllele"')
    for dataset in datasets:
        panel.input(type="hidden", name="dataset", value=dataset)
    panel.input(type="hidden", name="pattern", value=pattern)
    panel.input(type="hidden", name="target", value="msAllele.Search")
    panel.div.close()
    panel.form.close()
    panel.div.close()
    return panel
def extractHaplotypes(bwt, kmer):
    forwardIndices = bwt.findIndicesOfStr(kmer)
    revComp = MultiStringBWT.reverseComplement(kmer)
    reverseIndices = bwt.findIndicesOfStr(revComp)
    readLen = 101
    patternLen = len(kmer)
    totalBuffLen = 2 * readLen - patternLen
    modifiedSeqs = []
    for i in xrange(forwardIndices[0], forwardIndices[1]):
        readSeq = bwt.recoverString(i)
        dollarPos = readSeq.find('$')
        # calculate how many trailing '.' we need first, then construct the string from that info
        afterPattern = readLen - dollarPos - 1
        modSeq = ('.' * (readLen - patternLen - afterPattern) +
                  readSeq[dollarPos + 1:].lower() +
                  readSeq[0:patternLen] +
                  readSeq[patternLen:dollarPos + 1].lower() +
                  '.' * (afterPattern))
        modifiedSeqs.append(modSeq)
    for i in xrange(reverseIndices[0], reverseIndices[1]):
        revCompSeq = bwt.recoverString(i)
        readSeq = MultiStringBWT.reverseComplement(revCompSeq)
        dollarPos = readSeq.find('$')
        #beforePattern = readLen-dollarPos
        afterPattern = readLen - dollarPos - patternLen
        modSeq = ('.' * (readLen - patternLen - afterPattern) +
                  readSeq[dollarPos:-patternLen].lower() +
                  readSeq[-patternLen:] +
                  readSeq[0:dollarPos].lower() +
                  '.' * (afterPattern))
        modifiedSeqs.append(modSeq)
    finishedHaps = []
    previousConsensus = 'A' * totalBuffLen
    currentConsensus, currentScorer = conSeq(modifiedSeqs)
    currSeqs = modifiedSeqs
    while len(currSeqs) > 0 and compareShiftedSeqs(previousConsensus, currentConsensus) > 0:
        nextSeqs = []
        consensusSeqs = []
        # we will fill in consensusSeqs downstream
        finishedHaps.append((currentConsensus, consensusSeqs, []))
        # first get all exact matching reads
        for seq in currSeqs:
            if compareShiftedSeqs(seq, currentConsensus) == 0:
                consensusSeqs.append(seq)
            else:
                nextSeqs.append(seq)
        finishedHaps[-1][2].append((0, len(consensusSeqs)))
        # update these things
        previousConsensus = currentConsensus
        currSeqs = nextSeqs
        currentConsensus, currentScorer = conSeq(currSeqs)
        # check if the next consensus is identical
        acceptedScore = 1
        while len(currSeqs) > 0 and compareShiftedSeqs(currentConsensus, previousConsensus) == 0:
            #print 'triggered', acceptedScore
            nextNextSeqs = []
            minScore = 0xFFFFFFFFFFFFFFFF
            for seq in nextSeqs:
                calcScore = scoreShiftedSeqs(seq, currentConsensus, currentScorer)
                if calcScore < minScore and calcScore > acceptedScore:
                    minScore = calcScore
                if calcScore <= acceptedScore:
                    consensusSeqs.append(seq)
                else:
                    nextNextSeqs.append(seq)
            finishedHaps[-1][2].append((acceptedScore, len(nextSeqs) - len(nextNextSeqs)))
            nextSeqs = nextNextSeqs
            currSeqs = nextSeqs
            currentConsensus, currentScorer = conSeq(currSeqs)
            #acceptedScore += 1
            acceptedScore = minScore
    for seq in currSeqs:
        consensusSeqs.append(seq)
    return finishedHaps
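# A sketch of consuming extractHaplotypes' return value. Each element is a
# (consensus, reads, countTiers) tuple, where countTiers[0][1] is the number of
# reads matching that consensus exactly; the BWT directory path is hypothetical.
#
#   bwt = MultiStringBWT.loadBWT('myBwtDir')
#   for consensus, reads, countTiers in extractHaplotypes(bwt, 'ACGTACGTACGT'):
#       print consensus, 'exact matches:', countTiers[0][1]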
def TestForUnique(sample, side, bowtie_dir, species, expected_length):
    global genome
    bt2cmd = "bowtie2 -x %s/%s --no-head -r --end-to-end -k 4 %s.seq > %s.sam"
    designfile = "tmp/bowtie_data/%s_%s.csv" % (sample, side)
    t = designfile.rfind('.')
    root = designfile[:t]
    outfile = root + ".seq"
    fp = open(outfile, 'wb')
    design = pd.read_csv(filepath_or_buffer=designfile, sep=',')
    N = design.shape[0]
    probes = {}
    distinct_context = set()
    for index, row in enumerate(design.values):
        seq = row[1]
        fp.write(seq + "\n")
        distinct_context.add(seq)
    print "Total distinct contexts: %d" % (len(distinct_context))
    fp.close()
    print "Wrote %s (%d lines)" % (outfile, N)
    sys.stdout.flush()
    code = subprocess.call(bt2cmd % (bowtie_dir, species, root, root), shell=True)
    if (code == 0):
        print "Alignment completed"
    else:
        print "Alignment failed: " + (bt2cmd % (bowtie_dir, species, root, root))
        return
    samfile = outfile.replace('.seq', '.sam')
    columns = [str(i) for i in range(20)]
    df = pd.read_csv(filepath_or_buffer=samfile, names=columns, sep='\t', header=None)
    df = df.drop_duplicates(subset=['0'], keep=False)
    #df.to_csv("tmp/bowtie_data/test.csv")
    data = df.iloc[:, ].values
    unique_locations = {}
    unmapped = 0
    pos_set = set()
    new_data = []
    unique = set()
    for fields in data:
        index = fields[0]
        chromo = fields[2]
        pos = fields[3]
        flags = int(fields[1])
        if chromo == '*':
            unmapped += 1
            continue
        alignment_score = -100
        if fields[11].find("AS:i:") == 0:
            alignment_score = int(fields[11].split(":")[-1])
        if alignment_score < 0:
            continue
        strand = '-' if flags & 16 else '+'
        new_seq = revcomp(fields[9]) if flags & 16 else fields[9]
        new_data.append([new_seq, chromo, pos, strand])
        unique.add(new_seq)
    locationfile = outfile.replace('.seq', '_location.csv')
    with open(locationfile, 'wb') as fp:
        a = csv.writer(fp, delimiter=',')
        header = ['context', 'chromo', 'pos', 'strand']
        a.writerows([header])
        for d in new_data:
            a.writerows([d])
    print "Wrote file: %s [%d lines]" % (locationfile, len(new_data))
    print "unmapped: %d" % unmapped
    print "unique: %d" % len(unique)
    ########
    df1 = pd.read_csv(filepath_or_buffer=designfile, sep=',')
    df2 = pd.read_csv(filepath_or_buffer=locationfile, sep=',')
    result = pd.merge(df1, df2, how='right', on=['context'])
    data = result.iloc[:, ].values
    new_data = []
    # ['id', 'side', 'context', 'chromo', 'pos', 'strand']
    for d in data:
        [my_id, context, te, chromo, pos, strand] = d[0:6]
        #chromo = chromo[3:]
        if chromo not in genome.keys():
            continue
        plen = len(context)
        if strand == '+' and side == 'start':
            ref_prefix = genome[chromo][pos:pos + plen]
            ref_suffix = genome[chromo][pos + plen:pos + plen + 25]
            other_context = genome[chromo][pos + plen:pos + plen + 25]
            pos = pos + plen
        if strand == '+' and side == 'end':
            ref_prefix = genome[chromo][pos - 25:pos]
            ref_suffix = genome[chromo][pos:pos + plen]
            other_context = genome[chromo][pos - 25:pos]
        if strand == '-' and side == 'start':
            ref_prefix = genome[chromo][pos:pos + 25]
            ref_suffix = genome[chromo][pos - plen:pos]
            other_context = genome[chromo][pos - 25:pos]
        if strand == '-' and side == 'end':
            ref_prefix = genome[chromo][pos + plen:pos + plen + 25]
            ref_suffix = genome[chromo][pos:pos + plen]
            other_context = genome[chromo][pos + plen:pos + plen + 25]
            pos = pos + plen
        other_context = MultiStringBWT.reverseComplement(other_context) if strand == '-' else other_context
        if strand == '-':
            ref_prefix = MultiStringBWT.reverseComplement(ref_prefix)
            ref_suffix = MultiStringBWT.reverseComplement(ref_suffix)
        ed_th = .2 * 25
        if side == 'start':
            ed = lv.distance(ref_suffix[:25], te[:25])
            ref_te = 1 if ed <= ed_th else 0
        if side == 'end':
            ed = lv.distance(ref_prefix[-25:], te[-25:])
            ref_te = 1 if ed <= ed_th else 0
        new_data.append([my_id, context, te, ref_te, ref_prefix, ref_suffix,
                         chromo, pos, strand, len(context)])
        #new_data.append([my_id,context,te,ref_prefix,ref_suffix,chromo,pos,strand])
    finalfile = locationfile.replace('location', 'UNIQUE')
    with open(finalfile, 'wb') as fp:
        a = csv.writer(fp, delimiter=',')
        header = ['my_id', 'context', 'TE', 'ref_te', 'ref_prefix', 'ref_suffix',
                  'chromo', 'pos', 'strand', 'clen']
        a.writerows([header])
        for d in new_data:
            a.writerows([d])
    remove_duplicates(finalfile)
    return
    # NOTE: the cleanup below is unreachable because of the return above;
    # preserved as in the original source.
    command = "rm ./tmp/bowtie_data/*.seq"
    rval = os.system(command)
    command = "rm ./tmp/bowtie_data/*.sam"
    rval = os.system(command)
    command = "rm ./tmp/bowtie_data/*_location.csv"
    rval = os.system(command)
def mainRun():
    '''
    This is the primary function for external typical users to run when the Command Line Interface is used
    '''
    # start up the logger
    initLogger()
    # attempt to parse the arguments
    p = ap.ArgumentParser(description=util.DESC, formatter_class=ap.RawTextHelpFormatter)
    # version data
    p.add_argument('-V', '--version', action='version', version='%(prog)s' + \
        ' %s in MSBWT %s' % (util.VERSION, util.PKG_VERSION))
    #TODO: do we want subparsers grouped by type or sorted by name? it's type currently
    sp = p.add_subparsers(dest='subparserID')

    p2 = sp.add_parser('cffq', help='create a MSBWT from FASTQ files (pp + cfpp)')
    p2.add_argument('-p', metavar='numProcesses', dest='numProcesses', type=int, default=1,
                    help='number of processes to run (default: 1)')
    p2.add_argument('-u', '--uniform', dest='areUniform', action='store_true', default=False,
                    help='the input sequences have uniform length')
    p2.add_argument('-c', '--compressed', dest='buildCompressed', action='store_true', default=False,
                    help='build the RLE BWT (faster, less disk I/O)')
    p2.add_argument('outBwtDir', type=util.newDirectory, help='the output MSBWT directory')
    p2.add_argument('inputFastqs', nargs='+', type=util.readableFastqFile,
                    help='the input FASTQ files')

    p7 = sp.add_parser('pp', help='pre-process FASTQ files before BWT creation')
    p7.add_argument('-u', '--uniform', dest='areUniform', action='store_true', default=False,
                    help='the input sequences have uniform length')
    p7.add_argument('outBwtDir', type=util.newDirectory, help='the output MSBWT directory')
    p7.add_argument('inputFastqs', nargs='+', type=util.readableFastqFile,
                    help='the input FASTQ files')

    p3 = sp.add_parser('cfpp', help='create a MSBWT from pre-processed sequences and offsets')
    p3.add_argument('-p', metavar='numProcesses', dest='numProcesses', type=int, default=1,
                    help='number of processes to run (default: 1)')
    p3.add_argument('-u', '--uniform', dest='areUniform', action='store_true', default=False,
                    help='the input sequences have uniform length')
    p3.add_argument('-c', '--compressed', dest='buildCompressed', action='store_true', default=False,
                    help='build the RLE BWT (faster, less disk I/O)')
    p3.add_argument('bwtDir', type=util.existingDirectory, help='the MSBWT directory to process')

    p4 = sp.add_parser('merge', help='merge many MSBWTs into a single MSBWT')
    p4.add_argument('-p', metavar='numProcesses', dest='numProcesses', type=int, default=1,
                    help='number of processes to run (default: 1)')
    p4.add_argument('outBwtDir', type=util.newDirectory, help='the output MSBWT directory')
    p4.add_argument('inputBwtDirs', nargs='+', type=util.existingDirectory,
                    help='input BWT directories to merge')

    p5 = sp.add_parser('query', help='search for a sequence in an MSBWT, prints sequence and seqID')
    p5.add_argument('inputBwtDir', type=util.existingDirectory, help='the BWT to query')
    p5.add_argument('kmer', type=util.validKmer, help='the input k-mer to search for')
    p5.add_argument('-d', '--dump-seqs', dest='dumpSeqs', action='store_true', default=False,
                    help='print all sequences with the given kmer (default=False)')

    p6 = sp.add_parser('massquery', help='search for many sequences in an MSBWT')
    p6.add_argument('inputBwtDir', type=util.existingDirectory, help='the BWT to query')
    p6.add_argument('kmerFile', help='a file with one k-mer per line')
    p6.add_argument('outputFile', help='output file with counts per line')
    p6.add_argument('-r', '--rev-comp', dest='reverseComplement', action='store_true', default=False,
                    help='also search for each kmer\'s reverse complement')

    p8 = sp.add_parser('compress', help='compress a MSBWT from byte/base to RLE')
    p8.add_argument('-p', metavar='numProcesses', dest='numProcesses', type=int, default=1,
                    help='number of processes to run (default: 1)')
    p8.add_argument('srcDir', type=util.existingDirectory,
                    help='the source directory for the BWT to compress')
    p8.add_argument('dstDir', type=util.newDirectory, help='the destination directory')

    p9 = sp.add_parser('decompress', help='decompress a MSBWT from RLE to byte/base')
    p9.add_argument('-p', metavar='numProcesses', dest='numProcesses', type=int, default=1,
                    help='number of processes to run (default: 1)')
    p9.add_argument('srcDir', type=util.existingDirectory,
                    help='the source directory for the BWT to decompress')
    p9.add_argument('dstDir', type=util.newDirectory, help='the destination directory')

    p10 = sp.add_parser('convert', help='convert from a raw text input to RLE')
    p10.add_argument('-i', metavar='inputTextFN', dest='inputTextFN', default=None,
                     help='input text filename (default: stdin)')
    p10.add_argument('dstDir', type=util.newDirectory, help='the destination directory')

    args = p.parse_args()

    if args.subparserID == 'cffq':
        logger.info('Inputs:\t' + str(args.inputFastqs))
        logger.info('Uniform:\t' + str(args.areUniform))
        logger.info('Output:\t' + args.outBwtDir)
        logger.info('Output Compressed:\t' + str(args.buildCompressed))
        logger.info('Processes:\t' + str(args.numProcesses))
        if args.numProcesses > 1:
            logger.warning('Using multi-processing with slow disk accesses can lead to slower build times.')
        print
        if args.areUniform:
            # if they are uniform, use the method developed by Bauer et al., it's likely short Illumina seq
            if args.buildCompressed:
                MultiStringBWT.createMSBWTCompFromFastq(args.inputFastqs, args.outBwtDir,
                                                        args.numProcesses, args.areUniform, logger)
            else:
                MultiStringBWT.createMSBWTFromFastq(args.inputFastqs, args.outBwtDir,
                                                    args.numProcesses, args.areUniform, logger)
        else:
            # if they aren't uniform, use the merge method by Holt et al., it's likely longer PacBio seq
            if args.buildCompressed:
                logger.error('No compressed builder for non-uniform datasets, compress after creation.')
            else:
                Multimerge.createMSBWTFromFastq(args.inputFastqs, args.outBwtDir,
                                                args.numProcesses, args.areUniform, logger)
    elif args.subparserID == 'pp':
        logger.info('Inputs:\t' + str(args.inputFastqs))
        logger.info('Uniform:\t' + str(args.areUniform))
        logger.info('Output:\t' + args.outBwtDir)
        if args.areUniform:
            # preprocess for Bauer et al. method
            MultiStringBWT.preprocessFastqs(args.inputFastqs, args.outBwtDir,
                                            args.areUniform, logger)
        else:
            # preprocess for Holt et al. method
            numProcs = 1
            Multimerge.preprocessFastqs(args.inputFastqs, args.outBwtDir,
                                        numProcs, args.areUniform, logger)
    elif args.subparserID == 'cfpp':
        logger.info('BWT dir:\t' + args.bwtDir)
        logger.info('Uniform:\t' + str(args.areUniform))
        logger.info('Output Compressed:\t' + str(args.buildCompressed))
        logger.info('Processes:\t' + str(args.numProcesses))
        if args.numProcesses > 1:
            logger.warning('Using multi-processing with slow disk accesses can lead to slower build times.')
        print
        seqFN = args.bwtDir + '/seqs.npy'
        offsetFN = args.bwtDir + '/offsets.npy'
        bwtFN = args.bwtDir + '/msbwt.npy'
        if args.areUniform:
            # process it using the column-wise Bauer et al. method
            if args.buildCompressed:
                MSBWTCompGenCython.createMsbwtFromSeqs(args.bwtDir, args.numProcesses, logger)
            else:
                MSBWTGenCython.createMsbwtFromSeqs(args.bwtDir, args.numProcesses, logger)
        else:
            # process it using the Holt et al. merge method
            if args.buildCompressed:
                logger.error('No compressed builder for non-uniform datasets, compress after creation.')
            else:
                Multimerge.interleaveLevelMerge(args.bwtDir, args.numProcesses,
                                                args.areUniform, logger)
    elif args.subparserID == 'compress':
        logger.info('Source Directory:' + args.srcDir)
        logger.info('Dest Directory:' + args.dstDir)
        logger.info('Processes:' + str(args.numProcesses))
        if args.srcDir == args.dstDir:
            raise Exception('Source and destination directories cannot be the same directory.')
        print
        MSBWTGen.compressBWT(args.srcDir + '/msbwt.npy', args.dstDir + '/comp_msbwt.npy',
                             args.numProcesses, logger)
    elif args.subparserID == 'decompress':
        logger.info('Source Directory: ' + args.srcDir)
        logger.info('Dest Directory: ' + args.dstDir)
        logger.info('Processes: ' + str(args.numProcesses))
        print
        MSBWTGen.decompressBWT(args.srcDir, args.dstDir, args.numProcesses, logger)
        #TODO: remove if srcdir and dstdir are the same?
    elif args.subparserID == 'merge':
        logger.info('Inputs:\t' + str(args.inputBwtDirs))
        logger.info('Output:\t' + args.outBwtDir)
        logger.info('Processes:\t' + str(args.numProcesses))
        if args.numProcesses > 1:
            logger.warning('Multi-processing is not supported at this time, but will be included in a future release.')
        numProcs = 1
        #logger.warning('Using multi-processing with slow disk accesses can lead to slower build times.')
        print
        #MSBWTGen.mergeNewMSBWT(args.outBwtDir, args.inputBwtDirs, args.numProcesses, logger)
        if len(args.inputBwtDirs) > 2:
            # this is a deprecated method, it may still work if you feel daring
            #MSBWTGenCython.mergeMsbwts(args.inputBwtDirs, args.outBwtDir, 1, logger)
            logger.error('Merging more than two MSBWTs at once is not currently supported.')
        else:
            GenericMerge.mergeTwoMSBWTs(args.inputBwtDirs[0], args.inputBwtDirs[1],
                                        args.outBwtDir, numProcs, logger)
    elif args.subparserID == 'query':
        # this is the easiest thing we can do, don't dump the standard info, just do it
        msbwt = MultiStringBWT.loadBWT(args.inputBwtDir, logger=logger)
        # always print how many are found, users can parse it out if they want
        r = msbwt.findIndicesOfStr(args.kmer)
        print r[1] - r[0]
        # dump the seqs if requested
        if args.dumpSeqs:
            for x in xrange(r[0], r[1]):
                dInd = msbwt.getSequenceDollarID(x)
                print msbwt.recoverString(dInd)[1:] + ',' + str(dInd)
    elif args.subparserID == 'massquery':
        logger.info('Input:\t' + str(args.inputBwtDir))
        logger.info('Queries:\t' + str(args.kmerFile))
        logger.info('Output:\t' + args.outputFile)
        logger.info('Rev-comp:\t' + str(args.reverseComplement))
        print
        msbwt = MultiStringBWT.loadBWT(args.inputBwtDir, logger=logger)
        output = open(args.outputFile, 'w+')
        output.write('k-mer,counts')
        if args.reverseComplement:
            output.write(',revCompCounts\n')
        else:
            output.write('\n')
        logger.info('Beginning queries...')
        for line in open(args.kmerFile, 'r'):
            kmer = line.strip('\n')
            c = msbwt.countOccurrencesOfSeq(kmer)
            if args.reverseComplement:
                rc = msbwt.countOccurrencesOfSeq(MultiStringBWT.reverseComplement(kmer))
                output.write(kmer + ',' + str(c) + ',' + str(rc) + '\n')
            else:
                output.write(kmer + ',' + str(c) + '\n')
        logger.info('Queries complete.')
    elif args.subparserID == 'convert':
        if args.inputTextFN == None:
            logger.info('Input: stdin')
        else:
            logger.info('Input: ' + args.inputTextFN)
        logger.info('Output: ' + args.dstDir)
        logger.info('Beginning conversion...')
        CompressToRLE.compressInput(args.inputTextFN, args.dstDir)
        logger.info('Finished conversion.')
    else:
        print args.subparserID + " is currently not implemented, please wait for a future release."
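# Example invocations of the CLI wired up above, assuming the console entry
# point is installed as 'msbwt' (directory and file names are illustrative):
#
#   msbwt cffq -p 4 -u -c /data/myBwtDir reads_1.fastq reads_2.fastq
#   msbwt query /data/myBwtDir ACGTACGTACGT -d
#   msbwt massquery -r /data/myBwtDir kmers.txt counts.csv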
def __init__(self):
    self.msbwt = MSBWT.loadBWT(sys.argv[1])
def get_kmer_count(msbwt, kmer):
    c1 = msbwt.countOccurrencesOfSeq(kmer)
    c2 = msbwt.countOccurrencesOfSeq(MultiStringBWT.reverseComplement(kmer))
    return c1 + c2
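# A strand-aware count example for get_kmer_count; the BWT directory path is
# hypothetical. Counting a k-mer together with its reverse complement is the
# usual way to query an unstranded read set.
#
#   msbwt = MultiStringBWT.loadBWT('myBwtDir')
#   print get_kmer_count(msbwt, 'ACGTACGT')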