def checkDiFrequency(dataSeqsFN, simSeqsFN, outFN):
    '''Write 2-frame frequencies of data sequences next to simulated ones.

    Sequences are read from column 0 of dataSeqsFN and of simSeqsFN.  Each
    sequence is cut into frames with bioLibCG.returnFrames(seq, 2) and the
    share of every distinct frame among all frames of its set is computed.

    outFN receives one tab-separated line per frame found in the data set:
    frame, data frequency, simulated frequency (0.0 when the frame never
    shows up in the simulated set).  Frames that occur only in the
    simulated set are not written.

    NOTE(review): this name is defined again further down in the file; the
    later definition replaces this one when the module is loaded.
    '''
    def returnDiFreq(seqs):
        # pass 1: raw counts per frame plus the grand total of frames
        counts = {}
        nFrames = 0.0
        for seq in seqs:
            frames = bioLibCG.returnFrames(seq, 2)
            nFrames += len(frames)
            for frame in frames:
                if frame in counts:
                    counts[frame] += 1.0
                else:
                    counts[frame] = 1.0

        # pass 2: turn counts into fractions of the total
        freqs = {}
        for frame in counts:
            freqs[frame] = counts[frame] / nFrames
        return freqs

    dataSeqs = cgDL.listFromColumns(dataSeqsFN, [0], ['string'])
    simSeqs = cgDL.listFromColumns(simSeqsFN, [0], ['string'])

    data_freqs = returnDiFreq(dataSeqs)
    sim_freqs = returnDiFreq(simSeqs)

    with open(outFN, 'w') as out:
        for di in data_freqs:
            row = (di, data_freqs[di], sim_freqs.get(di, 0.0))
            out.write('%s\t%s\t%s\n' % row)
def checkDiFrequency(dataSeqsFN, simSeqsFN, outFN):
    '''Compare 2-frame frequencies between a data set and a simulated set.

    dataSeqsFN / simSeqsFN: files whose column 0 holds one sequence per row.
    outFN: output path; every frame seen in the data set gets one line
           "frame<TAB>data frequency<TAB>simulated frequency".  A frame
           missing from the simulated set is reported with 0.0 there;
           frames seen only in the simulated set are left out.

    Frames come from bioLibCG.returnFrames(seq, 2); a frequency is the
    frame's count divided by the total number of frames in that set.
    '''
    realSeqs = cgDL.listFromColumns(dataSeqsFN, [0], ['string'])
    fakeSeqs = cgDL.listFromColumns(simSeqsFN, [0], ['string'])

    def returnDiFreq(seqs):
        '''Return {frame: fraction of all frames in seqs}.'''
        tally = {}
        total = 0.0
        for seq in seqs:
            frames = bioLibCG.returnFrames(seq, 2)
            total += len(frames)
            for frame in frames:
                tally[frame] = tally.setdefault(frame, 0) + 1.0

        # normalise the tallies by the overall frame count
        fractions = {}
        for frame, hits in tally.items():
            fractions[frame] = hits / total
        return fractions

    data_freqs = returnDiFreq(realSeqs)
    sim_freqs = returnDiFreq(fakeSeqs)

    with open(outFN, 'w') as handle:
        for frame, dataFreq in data_freqs.items():
            simFreq = sim_freqs.get(frame, 0.0)
            handle.write('%s\t%s\t%s\n' % (frame, dataFreq, simFreq))
def plotTotalSNR(fN):
    '''Display a 3D bar chart built from the first three columns of fN.

    Columns 0 and 1 give the x/y position of each bar and column 2 its
    height (all read as floats with naToZero=True).  A height of exactly
    0.0 is replaced by 1 before plotting.  Every bar starts at z = 0 and
    has a footprint of .2 by .2.  The call blocks in PLT.show().

    NOTE(review): this name is defined again further down in the file; the
    later definition replaces this one when the module is loaded.
    '''
    # the figure is created before the file is read, as in the original flow
    figure = PLT.figure()
    axes = figure.add_subplot(111, projection='3d')

    columns = cgDL.listFromColumns(fN, [0, 1, 2], ['float', 'float', 'float'], naToZero=True)
    Xs, Ys, rawHeights = columns

    nBars = len(rawHeights)
    bottoms = [0] * nBars

    heights = []
    for h in rawHeights:
        if h == 0.0:
            heights.append(1)
        else:
            heights.append(h)

    # one list serves as both the x and the y extent of every bar
    footprint = [.2] * nBars

    axes.bar3d(Xs, Ys, bottoms, footprint, footprint, heights, color='#8E4585', zsort='max')
    PLT.show()
def getMicroHistExpression(microFN, fqFile, outFN):
    '''Count how often each listed sequence appears as a whole line of fqFile.

    microFN: file whose column 0 lists the sequences to count.
    fqFile:  text file scanned line by line; a line counts for a sequence
             when, after stripping surrounding whitespace, it equals that
             sequence exactly.
    outFN:   output path; one "sequence<TAB>count" line is written per
             distinct listed sequence, including those with a count of 0.

    NOTE(review): this name is defined again further down in the file; the
    later definition replaces this one when the module is loaded.
    '''
    microSeqs = cgDL.listFromColumns(microFN, [0], ['string'])
    microSeq_count = dict((seq, 0) for seq in microSeqs)

    # 'with' releases the input handle even if reading fails part-way;
    # the previous open()/close() pair leaked it on any exception.
    with open(fqFile, 'r') as fqHandle:
        for line in fqHandle:
            possibleSeq = line.strip()
            if possibleSeq in microSeq_count:
                microSeq_count[possibleSeq] += 1

    with open(outFN, 'w') as outHandle:
        # .items() instead of .iteritems(): same pairs in the same order
        for seq, count in microSeq_count.items():
            outHandle.write('%s\t%s\n' % (seq, count))
def checkMaskEnds(maskPerLineFN):
    '''Print how often 'X' occurs at each of the last ten mask positions.

    Masks are read from column 0 of maskPerLineFN.  Position 0 is the last
    character of a mask, position 1 the one before it, and so on up to
    position 9.  One line "position count" is printed per position.

    NOTE(review): this name is defined again further down in the file; the
    later definition replaces this one when the module is loaded.
    '''
    masks = cgDL.listFromColumns(maskPerLineFN, [0], ['string'])

    index_numMM = dict.fromkeys(range(10), 0)
    for mask in masks:
        # reverse the mask and keep at most its first ten characters,
        # i.e. the last ten characters of the original, end first
        tail = mask[::-1][:10]
        for pos, char in enumerate(tail):
            if char != 'X':
                continue
            index_numMM[pos] += 1

    for pos, total in index_numMM.items():
        print('%s %s' % (pos, total))
def getMicroHistExpression(microFN, fqFile, outFN):
    '''Tally exact whole-line hits in fqFile for every sequence in microFN.

    microFN: file whose column 0 lists the sequences of interest.
    fqFile:  text file read line by line; each line is stripped of
             surrounding whitespace and counted if it is identical to one
             of the listed sequences.
    outFN:   output path; receives "sequence<TAB>count" for every distinct
             listed sequence (sequences never seen keep a count of 0).
    '''
    microSeqs = cgDL.listFromColumns(microFN, [0], ['string'])

    # start every listed sequence at zero so unseen ones are still reported
    microSeq_count = dict.fromkeys(microSeqs, 0)

    # Context manager instead of a manual open()/close(): the handle is
    # closed even when iteration raises (e.g. an I/O error mid-file).
    with open(fqFile, 'r') as reads:
        for rawLine in reads:
            candidate = rawLine.strip()
            if candidate in microSeq_count:
                microSeq_count[candidate] += 1

    with open(outFN, 'w') as out:
        # .items() yields the same pairs in the same order as .iteritems()
        for seq, count in microSeq_count.items():
            out.write('%s\t%s\n' % (seq, count))
def checkMaskEnds(maskPerLineFN):
    '''Report, per distance from the mask end, how many masks have an 'X'.

    maskPerLineFN: file whose column 0 holds one mask string per row.

    Only the final ten characters of each mask are inspected.  Index 0
    refers to the very last character, index 9 to the tenth from the end.
    The counts are printed as "index count", one pair per line.
    '''
    index_numMM = {}
    for i in range(10):
        index_numMM[i] = 0

    for mask in cgDL.listFromColumns(maskPerLineFN, [0], ['string']):
        backwards = mask[::-1]
        # pairing with range(10) stops after ten characters at the latest
        for i, char in zip(range(10), backwards):
            if char == 'X':
                index_numMM[i] = index_numMM[i] + 1

    for i, num in index_numMM.items():
        print('%s %s' % (i, num))
def grepBC(grepList, inFile, column, word=False): grepList = cgDL.listFromColumns(grepList, [column], ["int"]) f = open(inFile, "r") for line in f: ls = line.strip().split("\t") for w in grepList: if word: if w == ls[column]: print line, else: if w in ls[column]: print line, f.close()
def plotTotalSNR(fN):
    '''Draw the three float columns of fN as a 3D bar plot and show it.

    fN: file whose columns 0, 1 and 2 are read as floats (naToZero=True).
        Columns 0 and 1 place each bar, column 2 is the bar height.

    Heights equal to 0.0 are plotted as 1.  All bars rise from z = 0 with
    an x/y extent of .2 each.  PLT.show() is called at the end.
    '''
    def liftZero(value):
        # a height of exactly 0.0 is drawn with height 1 instead
        if value == 0.0:
            return 1
        return value

    fig = PLT.figure()
    ax1 = fig.add_subplot(111, projection='3d')

    Xs, Ys, dZs = cgDL.listFromColumns(fN, [0, 1, 2], ['float', 'float', 'float'], naToZero=True)

    count = len(dZs)
    Zs = [0] * count
    dZs = [liftZero(dz) for dz in dZs]

    # dx and dy deliberately refer to the same list object
    side = [.2] * count
    dx = side
    dy = side

    ax1.bar3d(Xs, Ys, Zs, dx, dy, dZs, color='#8E4585', zsort='max')
    PLT.show()
def generateLeftRightChimera(oRNAFN, outFNBase):
    '''Map every middle 4-mer to the left and right flanks seen with it.

    oRNAFN:    id/sequence file; sequences are read from column 1.
    outFNBase: prefix of the two output files, outFNBase + '.left' and
               outFNBase + '.right'.

    For each sequence the middle is seq[9:13], the left flank seq[:9] and
    the right flank seq[13:].  NOTE: left and right do not include the
    middle!  Each output line is "4mer<TAB>flank,flank,..." holding the
    distinct flanks collected for that 4-mer.  Every 4mer/flank-set pair
    is also printed to stdout while the files are written.
    '''
    # FN is id/sequence
    sequences = cgDL.listFromColumns(oRNAFN, [1], ['string'])

    # update left/rights
    fourMer_left = {}
    fourMer_right = {}
    for seq in sequences:
        middle = seq[9:13]
        left, right = seq[:9], seq[13:]
        fourMer_left.setdefault(middle, set()).add(left)
        fourMer_right.setdefault(middle, set()).add(right)

    # lefts and rights mapped by middle 4
    # The dictionaries are paired with their file suffix explicitly; this
    # replaces the former eval('fourMer_%s' % side) name lookup.
    sides = (('left', fourMer_left), ('right', fourMer_right))
    for side, outDict in sides:
        with open(outFNBase + '.' + side, 'w') as f:
            # .items() yields the same pairs in the same order as .iteritems()
            for mer, seqs in outDict.items():
                # same text as the statement form `print mer, seqs`
                print('%s %s' % (mer, seqs))
                f.write('%s\t%s\n' % (mer, ','.join(seqs)))
def maskAll(fN):
    '''Run multiMask over every sequence found in column 2 of fN.

    Each sequence is passed to multiMask together with a fresh empty list
    and True.  The results are gathered in a local list; the code visible
    here neither returns nor stores it.
    '''
    sequences = cgDL.listFromColumns(fN, [2], ['string'])

    allMasks = []
    for sequence in sequences:
        allMasks.append(multiMask(sequence, [], True))