def averageLoopsWithControl(loopPositions, filename, cg=1, pad=8):
    """Average maps around loops and around their control positions.

    Args:
        loopPositions: loop positions; passed to ``averageLoops`` as-is and
            to ``controlLoops`` to derive the control set.
        filename: forwarded unchanged to ``averageLoops``.
        cg: coarse-graining factor. ``1`` (the default) returns the maps
            exactly as ``averageLoops`` produced them.
        pad: forwarded to ``averageLoops`` as its ``pad`` keyword.

    Returns:
        Tuple ``(loop_maps, control_maps)``. When ``cg != 1`` each element
        is a list in which every map has been passed through
        ``coarsegrain(map, cg)`` and divided by ``cg ** 2``.
    """
    def _rescale(maps):
        # Multiplying by 1. forces a float copy before coarse-graining.
        return [coarsegrain(1. * single, cg) / (cg ** 2) for single in maps]

    mymaps = averageLoops(loopPositions, filename, pad=pad)
    mymaps2 = averageLoops(controlLoops(loopPositions), filename, pad=pad)
    if cg != 1:
        mymaps = _rescale(mymaps)
        mymaps2 = _rescale(mymaps2)
    return mymaps, mymaps2
def plot_with_annotations(arr, title, cmap, peaks, contactDomains, forw, rev,
                          showPlot=True, maxPercentile=99.9,
                          extendFactor=0.10):
    """Draw ``arr`` as a heatmap annotated with sites, domains and peaks.

    Args:
        arr: 2-D array shown with ``matshow``.
        title: axes title.
        cmap: colormap passed to ``matshow``.
        peaks: table indexed by ``'x1'`` / ``'y1'`` and row number; one unit
            square is outlined per row at (x1, y1) and at (y1, x1).
        contactDomains: table indexed by ``'x1'`` / ``'x2'`` and row number;
            one square of side ``x2 - x1`` is outlined per row.
        forw: forward site track (trimmed, then coarse-grained by 50).
        rev: reverse site track (trimmed, then coarse-grained by 50).
        showPlot: when true, call ``plt.show(block=False)`` at the end.
        maxPercentile: percentile of ``arr`` used as the colour-scale maximum.
        extendFactor: fractional extension that is trimmed from each end of
            ``forw`` / ``rev`` before binning.

    Note:
        The y-limits are fixed at ``(64, -20)``.
    """
    figure, axes = plt.subplots()
    image = axes.matshow(arr, cmap=cmap,
                         vmax=np.nanpercentile(arr, maxPercentile))
    axes.set_title(title, y=1.08)
    axes.set_ylim(64, -20)
    figure.colorbar(image, ax=axes)

    # Remove the symmetric extension from both site tracks, then bin them.
    n_sites = len(forw)
    unextended = int(n_sites / (1 + (extendFactor * 2)))
    margin = int((n_sites - unextended) / 2)
    forward_binned = coarsegrain(forw[margin:n_sites - margin], 50)
    reverse_binned = coarsegrain(rev[margin:n_sites - margin], 50)
    # Cap the binned tracks at 1.
    forward_binned[forward_binned > 1] = 1
    reverse_binned[reverse_binned > 1] = 1

    # Vertical placement of the site track above the matrix.
    track_base = -10
    track_height = 9

    forward_hits = np.where(forward_binned != 0)
    reverse_hits = np.where(reverse_binned != 0)

    # Site track: forward sites point up (red), reverse sites down (blue).
    axes.vlines([forward_hits], ymin=track_base,
                ymax=track_base - (forward_binned[forward_hits] * track_height),
                color='red')
    axes.vlines([reverse_hits], ymin=track_base,
                ymax=track_base + (reverse_binned[reverse_hits] * track_height),
                color='blue')

    # Dashed guides from the top of the plot to the diagonal, then along
    # the row to the right-hand edge.
    guides = ((forward_hits, 'magenta'), (reverse_hits, 'aqua'))
    for hits, colour in guides:
        axes.vlines([hits], ymin=-20, ymax=[hits], color=colour, lw=0.5,
                    linestyles='dashed')
    for hits, colour in guides:
        axes.hlines([hits], xmax=arr.shape[0] - 1, xmin=[hits], color=colour,
                    lw=0.5, linestyles='dashed')

    # Contact domains: one square per row, anchored on the diagonal.
    for row in range(contactDomains.shape[0]):
        extent = (contactDomains['x2'][row] - contactDomains['x1'][row])
        anchor = [contactDomains['x1'][row],
                  contactDomains['x1'][row] + extent]
        outline = patches.Rectangle([value - 0.5 for value in anchor],
                                    width=extent, height=-1 * extent,
                                    fill=False, edgecolor='palegreen')
        axes.add_patch(outline)

    # Peak calls: a unit square at (x1, y1) for every row, then the
    # mirrored square at (y1, x1) for every row.
    for first, second in (('x1', 'y1'), ('y1', 'x1')):
        for row in range(peaks.shape[0]):
            corner = [peaks[first][row] - 0.5, peaks[second][row] + 0.5]
            outline = patches.Rectangle(corner, width=1, height=-1 * 1,
                                        fill=False, edgecolor='palegreen')
            axes.add_patch(outline)

    if showPlot:
        plt.show(block=False)
def do_extruder_position(forw, rev, SEPARATION=200, LIFETIME=300, nsim=10,
                         trim=0, bin_size=0):
    """Return extruder positioning after ``nsim`` simulation runs.

    Args:
        forw: forward site array; its length sets the number of monomers.
        rev: reverse site array.
        SEPARATION: forwarded to ``doSim``.
        LIFETIME: forwarded to ``doSim``.
        nsim: number of times ``doSim`` is called (indices ``0..nsim-1``).
        trim: fractional extension to strip from each side of the matrix
            before binning; ``0`` disables trimming.
        bin_size: coarse-graining factor; ``0`` disables binning.

    Returns:
        logarr: ``log(arr + 0.0001)`` where ``arr`` is the matrix clipped at
        its 99.9th percentile and divided by its mean row sum.
    """
    # number of monomers to simulate
    N = len(forw)
    shared_arr = mp.Array(ctypes.c_double, N ** 2)
    arr = tonumpyarray(shared_arr)
    arr.shape = (N, N)

    # Runs are executed serially. (The original noted that fmap could be
    # used to run them in parallel instead.)
    # doSim is handed shared_arr -- presumably it accumulates into it;
    # confirm against doSim.
    for i in range(nsim):
        doSim(i, N, SEPARATION, LIFETIME, shared_arr, forw, rev)

    # trim before coarsegraining, if desired
    if trim > 0:
        print('trimming ' + str(arr.shape))
        npoints = arr.shape[0]
        origSize = int(npoints / (1 + (trim * 2)))
        removeTotal = npoints - origSize
        # Split as evenly as possible; an odd remainder goes to the right.
        removeLeft = removeTotal // 2
        removeRight = removeTotal - removeLeft
        arr = arr[removeLeft:npoints - removeRight,
                  removeLeft:npoints - removeRight]
        print('done trimming ' + str(arr.shape))

    # bin to a lower resolution if desired
    if bin_size > 0:
        arr = coarsegrain(arr, bin_size)

    arr = np.clip(arr, 0, np.percentile(arr, 99.9))
    arr /= np.mean(np.sum(arr, axis=1))
    logarr = np.log(arr + 0.0001)
    return logarr
def saddlePlot():
    """Plot of values ordered by Eig1GW.

    Loads the "GM-all" dataset, runs the filtering / correction / eigenvector
    calls below, reorders rows and columns by the first eigenvector column,
    coarse-grains by 60 and shows the log of the mean-normalised map.
    """
    plt.figure(figsize=(3, 3))

    dataset = binnedData(1000000)
    dataset.simpleLoad("../data/GM-all-hg18-1M", "GM-all")
    dataset.removeDiagonal(1)
    dataset.removePoorRegions()
    dataset.removeZeros()
    dataset.fakeCis()
    dataset.iterativeCorrectWithoutSS()
    dataset.doEig()

    # Fix the sign so that the first entry is never positive.
    component = dataset.EIG["GM-all"][:, 0]
    if component[0] > 0:
        component = -component

    # Sort rows, then columns, by the eigenvector.
    raw_map = dataset.dataDict["GM-all"]
    order = numpy.argsort(component)
    row_sorted = raw_map[order, :]
    saddle = coarsegrain(row_sorted[:, order], 60)
    saddle /= saddle.mean()
    saddle = numpy.log(saddle)

    # Overwrite the single largest entry with the maximum of the map once
    # that entry has been zeroed.
    full_shape = saddle.shape
    saddle = saddle.reshape((-1))
    peak = numpy.argmax(saddle)
    saddle[peak] = 0
    saddle[peak] = numpy.max(saddle)
    saddle.shape = full_shape

    # Copy the neighbouring value into the two off-diagonal corners.
    saddle[0, -1] = saddle[0, -2]
    saddle[-1, 0] = saddle[-2, 0]

    plt.imshow(saddle, vmin=saddle.min(), vmax=saddle.max(),
               interpolation="nearest")
    colorbar = plt.colorbar(orientation="vertical")
    colorbar.ax.set_xlabel("Log(relative contact probability)", fontsize=6)
    for tick_label in colorbar.ax.get_xticklabels():
        tick_label.set_fontsize(6)
    colorbar.set_ticks([-0.5, 0, 0.5, 1])
    removeBorder()
    mirnylib.plotting.niceShow()
def refine_paper(filename, create=True):
    """Run the fragment-level pipeline end to end and check its outputs.

    filename[0] is a list of filenames of incoming files.
    filename[1] is used as a prefix for every output path written here
    (e.g. ``filename[1] + "_merged.frag"``, ``filename[1] + "-1M.hm"``).

    Steps, in order: parse and merge the inputs and apply the filters
    (only when ``create`` is true), compare ``TR.metadata`` with the pickled
    reference in "sampleMetadata", save heatmaps with the diagonal counted
    "twice" and "once" and assert they agree, then check ``updateGenome``.

    Raises:
        StandardError: if one of the input paths does not exist.
        ValueError: if any metadata key differs from the reference.
        AssertionError: if a heatmap consistency check fails.

    NOTE(review): this is Python 2 code (print statements, cPickle, xrange).
    NOTE(review): the source indentation was lost; the extent of the
    ``if create == True`` block was reconstructed -- confirm against the
    original file. As laid out here, ``TR`` is undefined when ``create`` is
    false.
    NOTE(review): the file contains a second ``def refine_paper`` further
    down; the later definition replaces this one at import time.
    """
    if create == True:
        for onename in filename[0]:
            #Parsing individual files
            if not os.path.exists(onename):
                raise StandardError("path not found: %s" % onename)
            TR = HiCdataset("bla", genome=genomeFolder, enzymeName="HindIII",maximumMoleculeLength=500, inMemory=True)
            print "\nTesting loading new data without rsite information "
            TR.parseInputData(dictLike=onename, enzymeToFillRsites="HindIII")
            #assert len(TR.DS) == 856143
            #assert len(TR.ufragments) == 634572
            TR.save(onename + "_parsed.frag")
        #Merging files alltogether, applying filters
        TR = HiCdataset(filename[1] + "_merged.frag",enzymeName = "HindIII", genome=genomeFolder, mode="w")
        TR.merge([i + "_parsed.frag" for i in filename[0]])
        # Fresh in-memory dataset that the merged file is loaded into.
        TR = HiCdataset("refined", genome=genomeFolder,enzymeName = "HindIII", mode="w", inMemory=True)
        print "\nTesting chunking during all tests"
        TR.chunksize = 30000
        #because we do many operations, we disable autoFlush here
        TR.load(filename[1] + "_merged.frag")
        print "\nTesting Rsite filter"
        TR.filterRsiteStart(offset=5)
        #assert len(TR.DS) == 832110
        print "\nTesting duplicate filter"
        TR.filterDuplicates(chunkSize = 30000)
        #assert len(TR.DS) == 830275
        print "\nTesting small/large and extreme fragment filter"
        TR.filterLarge()
        #assert len(TR.DS) == 825442
        TR.filterExtreme(cutH=0.005, cutL=0)
        TR.writeFilteringStats()
        #assert len(TR.DS) == 803845
    #-------------------------------------------
    # Compare every key of the pickled reference metadata with TR.metadata.
    TR.printMetadata(saveTo="metadata")
    import cPickle
    stop = False
    # NOTE(review): the file handle opened here is never explicitly closed.
    mdata = cPickle.load(open("sampleMetadata"))
    for i in sorted(mdata.keys()):
        if TR.metadata[i] != mdata[i]:
            print "Key {0} is not consistent: should be {1}, is {2}".format(i, mdata[i], TR.metadata[i])
            stop = True
    if stop == True:
        print ("""------------_ERROR_-------------- Inconsistent metadata: see above ----------------------------------------""")
        raise ValueError("Inconsistent Metadata")
    print "Testing allxall and by-chromosome heatmap counting diagonal twice"
    print "----> saving allxall heatmap"
    TR.saveHeatmap(filename[1] + "-1M.hm", 1000000, countDiagonalReads="twice")
    a = h5dict(filename[1] + "-1M.hm")
    # Bin ranges taken from entries 1 and 2 of the genome's start/end arrays.
    st, end = TR.genome.chrmStartsBinCont[1], TR.genome.chrmEndsBinCont[1]
    st2, end2 = TR.genome.chrmStartsBinCont[2], TR.genome.chrmEndsBinCont[2]
    # Cis block and trans block cut out of the all-by-all heatmap.
    chrom1 = a["heatmap"][st:end, st:end]
    chrom12 = a["heatmap"][st:end, st2:end2]
    setExceptionHook()
    print "----> saving by chromosome heatmap"
    # Written to the same "-1M.hm" path as the all-by-all heatmap above.
    TR.saveByChromosomeHeatmap( filename[1] + "-1M.hm", resolution=1000000, includeTrans=True, countDiagonalReads="twice")
    b = h5dict(filename[1] + "-1M.hm")["1 1"]
    bb = h5dict(filename[1] + "-1M.hm")["1 2"]
    # NOTE(review): these compare the *sum* of differences with zero, so
    # differences of opposite sign could cancel.
    assert (b - chrom1).sum() == 0
    print "Cis heatmap consistent"
    assert (bb - chrom12).sum() == 0
    print 'Trans heatmap consistent'
    print a["heatmap"][::10, ::10].sum()
    #assert a["heatmap"][::10, ::10].sum() == 21800
    print "Heatmap sum correct\n"
    #---------------------------------
    print "Testing allxall and by-chromosome heatmap counting diagonal once"
    TR.saveHeatmap(filename[1] + "-1M.hm", 1000000, countDiagonalReads="once")
    Ta = h5dict(filename[1] + "-1M.hm")
    st, end = TR.genome.chrmStartsBinCont[1], TR.genome.chrmEndsBinCont[1]
    st2, end2 = TR.genome.chrmStartsBinCont[2], TR.genome.chrmEndsBinCont[2]
    chrom1 = Ta["heatmap"][st:end, st:end]
    chrom12 = Ta["heatmap"][st:end, st2:end2]
    setExceptionHook()
    print "----> saving by chromosome heatmap"
    TR.saveByChromosomeHeatmap( filename[1] + "-1M-byChr.hm", resolution=1000000, includeTrans=True, countDiagonalReads="once")
    TR.saveHiResHeatmapWithOverlaps(filename[1]+"-1M-highRes.hm", resolution=50000, countDiagonalReads="twice")
    TR.saveSuperHighResMapWithOverlaps(filename[1]+"-5k-SuperHighRes.hm", resolution=5000,chromosomes = [14], countDiagonalReads="twice")
    Tb = h5dict(filename[1] + "-1M-byChr.hm")["1 1"]
    Tbb = h5dict(filename[1] + "-1M-byChr.hm")["1 2"]
    assert ((Tb - chrom1) == 0).all()
    assert ((Tbb - chrom12) == 0).all()
    # "once" map plus its own diagonal must equal the "twice" cis map b.
    assert ((Tb + np.diag(np.diag(Tb))) == b).all()
    print "Diagonal counting methods are consistent\n"
    # Double the diagonal of the "once" cis block before comparing it with
    # the high-resolution map saved with countDiagonalReads="twice".
    newchrom1 = chrom1.copy()
    for i in xrange(len(newchrom1)):
        newchrom1[i,i] = 2 * newchrom1[i,i]
    Tb = h5dict(filename[1] + "-1M-highRes.hm")["1 1"]
    assert np.abs(Tb.sum() - newchrom1.sum()) < 1
    # 50000-resolution map coarse-grained by 20 is compared with the
    # 1000000-resolution block; a total absolute difference < 500 passes.
    assert np.sum(np.abs(coarsegrain(Tb,20,True) - newchrom1)) < 500
    #------------------------------
    print "Testing updateGenome method"
    from mirnylib.genome import Genome
    # 0/1 flag per chromosome index; the flagged indices are 1-4 and 22.
    removeChromIDs = np.array([0, 1, 1, 1, 1] + [0] * 17 + [1] + [0])
    #print ((removeChromIDs[TR.chrms1] == 1) + (removeChromIDs[TR.chrms2] == 1) ).sum()
    # Count of records with both sides flagged, plus records whose first
    # side is flagged and whose second side is -1.
    t = ((removeChromIDs[TR.chrms1] == 1) * (removeChromIDs[TR.chrms2] == 1)).sum() + ((removeChromIDs[TR.chrms1] == 1) * (TR.chrms2 == -1)).sum()
    newGenome = Genome(genomePath=genomeFolder, readChrms=["2", "3", "4", "5", "X"])
    TR.updateGenome(newGenome)
    assert TR.N == t
    # NOTE(review): ``a`` is assigned here but not used afterwards.
    a = h5dict(filename[1] + "-1M.hm")["heatmap"]
def showCmapNew():
    """Show a simulated contact map next to Hi-C data with site tracks.

    Loads the "13 13" dataset from a hard-coded h5dict path, corrects and
    rescales it, then for each pickled contact map in the ``cmaps`` folder
    (currently a single hard-coded file name) draws:

    * top panel: the pivoted simulated map stacked on the pivoted Hi-C map,
      both on a log scale;
    * lower panel: the forward (blue) and reverse (green) site tracks
      coarse-grained by 20.

    NOTE(review): the heatmap-saving loop at the end of the per-file body
    sits after an unconditional ``continue`` and therefore never runs; it is
    kept so it can be re-enabled by removing that ``continue``.
    NOTE(review): the source indentation was lost; ``plt.clf()`` is placed
    inside the per-file loop here -- confirm against the original file.
    """
    plt.figure(figsize=(8, 8))
    low = 60000
    high = 75000
    low20 = low // 10
    high20 = high // 10

    mydict = h5dict("/home/magus/HiC2011/Erez2014/hg19/GM12878_inSitu-all-combined-10k_HighRes.byChr", 'r')
    hicdata = mydict.get_dataset("13 13")[low20:high20, low20:high20]
    hicdata = completeIC(hicdata)

    resolutionMon = 5
    newshape = (1000 * (high - low)) // (600 * resolutionMon)
    print(hicdata.shape, newshape)
    hicdata = zoomArray(hicdata, (newshape, newshape))
    hicdata = np.clip(hicdata, 0, np.percentile(hicdata, 99.99))
    hicdata /= np.mean(np.sum(hicdata, axis=1))

    # To process every file instead: for fname in os.listdir("cmaps"):
    for fname in ["cmapflagshipLifetime300Mu3_r=8.pkl"]:
        if ("r=8" not in fname) or ("Lifetime" not in fname):
            print("not going", fname)
            continue
        try:
            # The number between "Mu" and "_r=" in the file name.
            mu = float(fname.split("_r=")[0].split("Mu")[1])
        except (IndexError, ValueError):
            # No "Mu" marker, or the text after it is not a number.
            continue
        forw, rev = getForwBacv(mu)

        # NOTE: pickle.load can execute arbitrary code; only load files
        # from a trusted source.
        with open(os.path.join("cmaps", fname), 'rb') as handle:
            cmap = pickle.load(handle)
        arr = cmap
        if arr.shape[0] != hicdata.shape[0]:
            continue
        arr = arr / np.mean(np.sum(arr, axis=1))
        # NOTE(review): in-place, so the factor compounds if more than one
        # file is ever processed in this loop.
        hicdata *= 1.5
        diags = 1000
        print(arr.shape)

        ax = plt.subplot(211)
        turned = pivotHeatmap(arr, diags)[::-1] * 3
        turned2 = pivotHeatmap(hicdata, diags)
        turned = np.concatenate([turned, turned2], axis=0)
        myextent = [low, high,
                    -(high - low) * diags / len(arr),
                    (high - low) * diags / len(arr)]
        plt.imshow(np.log(turned + 0.0001), aspect=0.5, cmap="fall",
                   vmax=-4, vmin=-8, extent=myextent, interpolation="none")

        plt.subplot(413, sharex=ax)
        xaxis = np.arange(len(forw) // 20) * 12 + low
        forwcg = coarsegrain(forw, 20)
        revcg = coarsegrain(rev, 20)
        plt.vlines(xaxis[forwcg > 0], 0, forwcg[forwcg > 0], color="blue")
        plt.vlines(xaxis[revcg > 0], 0, revcg[revcg > 0], color="green")
        plt.xlim([low, high])
        plt.title(fname)
        plt.legend()
        plt.show()
        continue

        # --- unreachable while the ``continue`` above is present ---
        # Saves PNG and PDF snapshots of sliding x-windows of the figure.
        for st in range(low, high, 1000):
            for size in [2000, 3000, 5000]:
                end = st + size
                if end > high:
                    continue
                plt.xlim([st, end])
                plt.savefig(os.path.join("heatmaps", "{0}_st={1}_end={2}_r=2.png".format(fname, st, end)))
                plt.savefig(os.path.join("heatmaps", "{0}_st={1}_end={2}_r=2.pdf".format(fname, st, end)))
        plt.clf()
    plt.show()
def displayHeatmap():
    """Plot simulated SMC pair counts against Hi-C data in one heatmap.

    Thirty ``doSim`` jobs are dispatched through ``fmap``; each adds the
    counts of the SMC pairs it observed to a shared ``N * N`` buffer. The
    buffer is coarse-grained by 20 and normalised, the entries below the
    diagonal are replaced with the processed Hi-C values, and the log of
    the result is shown.

    Relies on the module-level names ``N``, ``low`` and ``high``.
    """
    plt.figure(figsize=(5, 5))
    shared_arr = mp.Array(ctypes.c_double, N ** 2)
    arr = tonumpyarray(shared_arr)
    arr.shape = (N, N)

    def doSim(i):
        # Flat view on the shared buffer, indexed by ``first * N + second``.
        flat_counts = tonumpyarray(shared_arr)
        model = initModel(i)
        snapshots = []
        upper = 10000
        # Random number of sampling rounds in [upper // 2, upper).
        for _ in range(np.random.randint(upper // 2, upper)):
            model.steps(150)
            snapshots.append(model.getSMCs())
        pairs = np.concatenate(snapshots, axis=1)
        flat_index = pairs[0] * N + pairs[1]
        position, counts = np.unique(flat_index, return_counts=True)
        # Hold the array's lock while updating the shared buffer.
        with shared_arr.get_lock():
            flat_counts[position] += counts
        print("Finished!")
        return None

    setExceptionHook()

    # Hi-C reference: slice, correct, rescale, clip and normalise.
    low20 = low // 10
    high20 = high // 10
    mydict = h5dict(
        "/home/magus/HiC2011/Erez2014/hg19/GM12878_inSitu-all-combined-10k_HighRes.byChr",
        'r')
    hicdata = completeIC(
        mydict.get_dataset("13 13")[low20:high20, low20:high20])
    newshape = (1000 * (high - low)) // (600 * 20)
    print(hicdata.shape, newshape)
    hicdata = zoomArray(hicdata, (newshape, newshape))
    hicdata = np.clip(hicdata, 0, np.percentile(hicdata, 99.99))
    hicdata /= np.mean(np.sum(hicdata, axis=1))

    # number of threads to use. On a 20-core machine I use 20.
    fmap(doSim, range(30), n=20)

    arr = coarsegrain(arr, 20)
    arr = np.clip(arr, 0, np.percentile(arr, 99.9))
    arr /= np.mean(np.sum(arr, axis=1))

    # Entries with row index > column index come from the Hi-C data.
    index = np.arange(len(arr))
    below_diagonal = index[:, None] > index[None, :]
    arr[below_diagonal] = hicdata[below_diagonal]

    logarr = np.log(arr + 0.0001)
    plt.imshow(logarr, vmax=np.percentile(logarr, 99.9),
               extent=[low, high, high, low], interpolation="none")
    nicePlot()
def plot_logarr_sites(logarr, forw, rev, title='', cmap='viridis',
                      max_percentile=99.9, extend_factor=0.10,
                      coarsegrain_factor=50, plot_CTCF_lines=True,
                      save_plot=None):
    '''Plot the log transformed array of extrusion occupancy with a track of
    the stall sites above the plot.

    The matrix is symmetrized from its upper triangle before display. This
    is done on a copy, so the caller's ``logarr`` is not modified.

    Arguments:
        logarr: log extrusion occupancy matrix (square)
        forw: forward array of stall sites
        rev: reverse array of stall sites
    Kwargs:
        title: plot title
        cmap: plot colormap
        max_percentile: threshold colormap at this percentile
        extend_factor: for simulation, how much the region was extended;
            that fraction is trimmed from each end of forw / rev
        coarsegrain_factor: binning size applied to forw / rev
        plot_CTCF_lines: plot lines on top of the matrix for each CTCF site
        save_plot: if not None, save the plot to this path; otherwise the
            plot is shown without blocking
    '''
    # CTCF site information: trim the extension, coarsegrain, cap at 1
    trim = extend_factor
    npoints = len(forw)
    orig_size = int(npoints / (1 + (trim * 2)))
    remove_each = int((npoints - orig_size) / 2)
    forwTrim = forw[remove_each:npoints - remove_each]
    forwTrimC = coarsegrain(forwTrim, coarsegrain_factor)
    revTrim = rev[remove_each:npoints - remove_each]
    revTrimC = coarsegrain(revTrim, coarsegrain_factor)
    forwTrimC[forwTrimC > 1] = 1
    revTrimC[revTrimC > 1] = 1

    # init plot
    fig, ax0 = plt.subplots(ncols=1)

    # colour limits come from the matrix as passed in (before symmetrizing)
    use_vmax = np.nanpercentile(logarr, max_percentile)
    use_vmin = np.nanmin(logarr)

    # Symmetrize: mirror the upper triangle into the lower one. Work on a
    # copy -- writing through a transpose view of the argument would
    # overwrite the caller's array.
    upper = np.triu_indices(logarr.shape[0])
    symmetric = logarr.copy()
    symmetric.T[upper] = symmetric[upper]
    n_bins = symmetric.shape[0]

    im0 = ax0.matshow(symmetric, cmap=cmap, vmax=use_vmax, vmin=use_vmin)
    ax0.set_title(title, y=1.08)

    # the site track occupies an extra quarter of the matrix height on top
    ctcf_plot_size = int(n_bins * 0.25)
    plot_top = -1 * ctcf_plot_size
    ax0.set_ylim(n_bins - 1, plot_top)
    fig.colorbar(im0, ax=ax0)

    # where on the chart we're plotting
    siteStart = -1 * (ctcf_plot_size / 2)
    siteExpand = (ctcf_plot_size / 2) * 0.9

    forw_sites = np.where(forwTrimC != 0)
    rev_sites = np.where(revTrimC != 0)

    # add lines for CTCF sites: forward up (red), reverse down (blue)
    ax0.vlines([forw_sites], ymin=siteStart,
               ymax=siteStart - (forwTrimC[forw_sites] * siteExpand),
               color='red')
    ax0.vlines([rev_sites], ymin=siteStart,
               ymax=siteStart + (revTrimC[rev_sites] * siteExpand),
               color='blue')

    if plot_CTCF_lines:
        # Guide lines run from the top of the axes down to the diagonal and
        # then along the row to the right edge. The top is taken from the
        # computed y-limit rather than a fixed -20, so the lines also reach
        # the top when the matrix is larger than 80 bins.
        ax0.vlines([forw_sites], ymin=plot_top, ymax=[forw_sites],
                   color='magenta', lw=0.5, linestyles='dashed')
        ax0.vlines([rev_sites], ymin=plot_top, ymax=[rev_sites],
                   color='aqua', lw=0.5, linestyles='dashed')
        ax0.hlines([forw_sites], xmax=n_bins - 1, xmin=[forw_sites],
                   color='magenta', lw=0.5, linestyles='dashed')
        ax0.hlines([rev_sites], xmax=n_bins - 1, xmin=[rev_sites],
                   color='aqua', lw=0.5, linestyles='dashed')

    plt.tight_layout()
    if save_plot is not None:
        plt.savefig(save_plot)
    else:
        plt.show(block=False)
im0 = ax0.matshow(logarr, vmax = np.percentile(logarr, 99.9)) ax0.plot([i for i in range(len(arrDI))], [(i*-1)-10 for i in arrDI], lw=0.75) ax0.plot([i for i in range(len(arrDI))], [-10 for i in arrDI], ls='--', color='black', lw=0.5) # fig.colorbar(im0, ax=ax0) ax0.set_title('log extrusion occupancy', y=1.08) # ax0.vlines([i+0.5 for i in range(65)], ymin=-20, ymax=64.5, linestyles='dashed', lw=0.5, color='grey') # fig.colorbar(im0, ax=ax0) ax0.set_ylim(64,-20) # can also plot some ctcf stuff trim = extendFactor npoints = len(forw) orig_size = int(npoints / (1 + (trim * 2))) remove_each = int((npoints - orig_size) / 2) forwTrim = forw[remove_each:npoints - remove_each] forwTrimC = coarsegrain(forwTrim, 50) revTrim = rev[remove_each:npoints - remove_each] revTrimC = coarsegrain(revTrim, 50) siteStart = -10 siteExpand = 9 ax0.vlines([np.where(forwTrimC != 0)], ymin=siteStart, ymax=siteStart - (forwTrimC[np.where(forwTrimC != 0)] * siteExpand), color='red') ax0.vlines([np.where(revTrimC != 0)], ymin=siteStart, ymax=siteStart + (revTrimC[np.where(revTrimC != 0)] * siteExpand), color='blue') plt.show(block=False) doDiCompare(logarr, arr, manyContactAv,'/home/ben/ab_local/directional_model/extrusion_fitting/diCompare/diSites_d') # we can just export this as a file to use with the normal simulation code on Windows saveDf = pd.DataFrame({'chrom' : 'chr21', 'start' : binStarts[contactDIScaled != 0], 'end' : binEnds[contactDIScaled != 0],
def refine_paper(filename, create=True):
    """Run the fragment-level pipeline end to end and check its outputs.

    filename[0] is a list of filenames of incoming files.
    filename[1] is used as a prefix for every output path written here
    (e.g. ``filename[1] + "_merged.frag"``, ``filename[1] + "-1M.hm"``).

    Steps, in order: parse and merge the inputs and apply the filters
    (only when ``create`` is true), compare ``TR.metadata`` with the pickled
    reference in "sampleMetadata", save heatmaps with the diagonal counted
    "twice" and "once" and assert they agree, then check ``updateGenome``.

    Raises:
        StandardError: if one of the input paths does not exist.
        ValueError: if any metadata key differs from the reference.
        AssertionError: if a heatmap consistency check fails.

    NOTE(review): this is Python 2 code (print statements, cPickle, xrange).
    NOTE(review): the source indentation was lost; the extent of the
    ``if create == True`` block was reconstructed -- confirm against the
    original file. As laid out here, ``TR`` is undefined when ``create`` is
    false.
    NOTE(review): an earlier ``def refine_paper`` exists in this file; this
    later definition is the one that remains bound after import.
    """
    if create == True:
        for onename in filename[0]:
            #Parsing individual files
            if not os.path.exists(onename):
                raise StandardError("path not found: %s" % onename)
            TR = HiCdataset("bla", genome=genomeFolder, enzymeName="HindIII", maximumMoleculeLength=500, inMemory=True)
            print "\nTesting loading new data without rsite information "
            TR.parseInputData(dictLike=onename, enzymeToFillRsites="HindIII")
            #assert len(TR.DS) == 856143
            #assert len(TR.ufragments) == 634572
            TR.save(onename + "_parsed.frag")
        #Merging files alltogether, applying filters
        TR = HiCdataset(filename[1] + "_merged.frag", enzymeName="HindIII", genome=genomeFolder, mode="w")
        TR.merge([i + "_parsed.frag" for i in filename[0]])
        # Fresh in-memory dataset that the merged file is loaded into.
        TR = HiCdataset("refined", genome=genomeFolder, enzymeName="HindIII", mode="w", inMemory=True)
        print "\nTesting chunking during all tests"
        TR.chunksize = 30000
        #because we do many operations, we disable autoFlush here
        TR.load(filename[1] + "_merged.frag")
        print "\nTesting Rsite filter"
        TR.filterRsiteStart(offset=5)
        #assert len(TR.DS) == 832110
        print "\nTesting duplicate filter"
        TR.filterDuplicates(chunkSize=30000)
        #assert len(TR.DS) == 830275
        print "\nTesting small/large and extreme fragment filter"
        TR.filterLarge()
        #assert len(TR.DS) == 825442
        TR.filterExtreme(cutH=0.005, cutL=0)
        TR.writeFilteringStats()
        #assert len(TR.DS) == 803845
    #-------------------------------------------
    # Compare every key of the pickled reference metadata with TR.metadata.
    TR.printMetadata(saveTo="metadata")
    import cPickle
    stop = False
    # NOTE(review): the file handle opened here is never explicitly closed.
    mdata = cPickle.load(open("sampleMetadata"))
    for i in sorted(mdata.keys()):
        if TR.metadata[i] != mdata[i]:
            print "Key {0} is not consistent: should be {1}, is {2}".format( i, mdata[i], TR.metadata[i])
            stop = True
    if stop == True:
        print("""------------_ERROR_-------------- Inconsistent metadata: see above ----------------------------------------""")
        raise ValueError("Inconsistent Metadata")
    print "Testing allxall and by-chromosome heatmap counting diagonal twice"
    print "----> saving allxall heatmap"
    TR.saveHeatmap(filename[1] + "-1M.hm", 1000000, countDiagonalReads="twice")
    a = h5dict(filename[1] + "-1M.hm")
    # Bin ranges taken from entries 1 and 2 of the genome's start/end arrays.
    st, end = TR.genome.chrmStartsBinCont[1], TR.genome.chrmEndsBinCont[1]
    st2, end2 = TR.genome.chrmStartsBinCont[2], TR.genome.chrmEndsBinCont[2]
    # Cis block and trans block cut out of the all-by-all heatmap.
    chrom1 = a["heatmap"][st:end, st:end]
    chrom12 = a["heatmap"][st:end, st2:end2]
    setExceptionHook()
    print "----> saving by chromosome heatmap"
    # Written to the same "-1M.hm" path as the all-by-all heatmap above.
    TR.saveByChromosomeHeatmap(filename[1] + "-1M.hm", resolution=1000000, includeTrans=True, countDiagonalReads="twice")
    b = h5dict(filename[1] + "-1M.hm")["1 1"]
    bb = h5dict(filename[1] + "-1M.hm")["1 2"]
    # NOTE(review): these compare the *sum* of differences with zero, so
    # differences of opposite sign could cancel.
    assert (b - chrom1).sum() == 0
    print "Cis heatmap consistent"
    assert (bb - chrom12).sum() == 0
    print 'Trans heatmap consistent'
    print a["heatmap"][::10, ::10].sum()
    #assert a["heatmap"][::10, ::10].sum() == 21800
    print "Heatmap sum correct\n"
    #---------------------------------
    print "Testing allxall and by-chromosome heatmap counting diagonal once"
    TR.saveHeatmap(filename[1] + "-1M.hm", 1000000, countDiagonalReads="once")
    Ta = h5dict(filename[1] + "-1M.hm")
    st, end = TR.genome.chrmStartsBinCont[1], TR.genome.chrmEndsBinCont[1]
    st2, end2 = TR.genome.chrmStartsBinCont[2], TR.genome.chrmEndsBinCont[2]
    chrom1 = Ta["heatmap"][st:end, st:end]
    chrom12 = Ta["heatmap"][st:end, st2:end2]
    setExceptionHook()
    print "----> saving by chromosome heatmap"
    TR.saveByChromosomeHeatmap(filename[1] + "-1M-byChr.hm", resolution=1000000, includeTrans=True, countDiagonalReads="once")
    TR.saveHiResHeatmapWithOverlaps(filename[1] + "-1M-highRes.hm", resolution=50000, countDiagonalReads="twice")
    TR.saveSuperHighResMapWithOverlaps(filename[1] + "-5k-SuperHighRes.hm", resolution=5000, chromosomes=[14], countDiagonalReads="twice")
    Tb = h5dict(filename[1] + "-1M-byChr.hm")["1 1"]
    Tbb = h5dict(filename[1] + "-1M-byChr.hm")["1 2"]
    assert ((Tb - chrom1) == 0).all()
    assert ((Tbb - chrom12) == 0).all()
    # "once" map plus its own diagonal must equal the "twice" cis map b.
    assert ((Tb + np.diag(np.diag(Tb))) == b).all()
    print "Diagonal counting methods are consistent\n"
    # Double the diagonal of the "once" cis block before comparing it with
    # the high-resolution map saved with countDiagonalReads="twice".
    newchrom1 = chrom1.copy()
    for i in xrange(len(newchrom1)):
        newchrom1[i, i] = 2 * newchrom1[i, i]
    Tb = h5dict(filename[1] + "-1M-highRes.hm")["1 1"]
    assert np.abs(Tb.sum() - newchrom1.sum()) < 1
    # 50000-resolution map coarse-grained by 20 is compared with the
    # 1000000-resolution block; a total absolute difference < 500 passes.
    assert np.sum(np.abs(coarsegrain(Tb, 20, True) - newchrom1)) < 500
    #------------------------------
    print "Testing updateGenome method"
    from mirnylib.genome import Genome
    # 0/1 flag per chromosome index; the flagged indices are 1-4 and 22.
    removeChromIDs = np.array([0, 1, 1, 1, 1] + [0] * 17 + [1] + [0])
    #print ((removeChromIDs[TR.chrms1] == 1) + (removeChromIDs[TR.chrms2] == 1) ).sum()
    # Count of records with both sides flagged, plus records whose first
    # side is flagged and whose second side is -1.
    t = ((removeChromIDs[TR.chrms1] == 1) * (removeChromIDs[TR.chrms2] == 1)).sum() + ( (removeChromIDs[TR.chrms1] == 1) * (TR.chrms2 == -1)).sum()
    newGenome = Genome(genomePath=genomeFolder, readChrms=["2", "3", "4", "5", "X"])
    TR.updateGenome(newGenome)
    assert TR.N == t
    # NOTE(review): ``a`` is assigned here but not used afterwards.
    a = h5dict(filename[1] + "-1M.hm")["heatmap"]