def plotCrossValidation():
    """Plot the main-figure subplot with cross-validation.

    Loads the "GM-all-10p" and "GM-all-90p" datasets at 1 Mb resolution,
    runs ``iterativeCorrectWithSS`` on them, caches the two resulting bias
    vectors in the ``CrossValidatioN`` pickle file and scatter-plots one
    against the other. The value of ``cr(b1, b2)`` is printed as well.
    """
    matplotlib.rcParams['font.sans-serif'] = 'Arial'
    plt.figure(figsize=(1, 1))
    FG = HiCdataset(workingFile1, myGenome)
    FG.load(GMFrag)

    Tanay = binnedData(1000000)
    # These datasets need to be created beforehand using fragment-level
    # analysis.
    Tanay.simpleLoad("GM-all-10p", "GM-1")
    Tanay.simpleLoad("GM-all-90p", "GM-9")
    Tanay.removePoorRegions()
    Tanay.iterativeCorrectWithSS()
    Tanay.removeZeros()
    b1, b2 = (Tanay.biasDict["GM-1"], Tanay.biasDict["GM-9"])

    # Close the cache explicitly so the data is flushed before it is read
    # back, instead of relying on garbage collection of the file object.
    with open("CrossValidatioN", 'wb') as cache:
        cPickle.dump((b1, b2), cache)

    ax = plt.gca()
    with open("CrossValidatioN", 'rb') as cache:
        b1, b2 = cPickle.load(cache)
    print(cr(b1, b2))

    plt.scatter(b1, b2, s=.7, color="k", linewidth=0)
    plt.xlabel(r"10% reads", fontsize=8)
    plt.ylabel(r"90% reads", fontsize=8)
    plt.xlim((0, 1.5))
    plt.ylim((0, 1.5))
    plt.xticks([0, 0.5, 1, 1.5])
    plt.yticks([0, 0.5, 1, 1.5])
    removeAxes(shift=0)

    fs = 6
    for xlabel_i in ax.get_xticklabels():
        xlabel_i.set_fontsize(fs)
    for xlabel_i in ax.get_yticklabels():
        xlabel_i.set_fontsize(fs)
    plt.show()
def CreateMatrixFile():
    """Dump the binned heatmap to ``<heatmap_filepath>.matrix`` as text.

    Each output line is tab-separated: chromosome index, start of the bin,
    end of the bin (start plus the bin size), followed by one value per
    heatmap bin. Uses the module-level ``domain_res``, ``genome_db`` and
    ``heatmap_filepath``.
    """
    BD = binnedData.binnedData(domain_res, genome_db)
    BD.simpleLoad(heatmap_filepath, 'heatmap')
    heatmap = BD.dataDict['heatmap']
    n_bins = len(heatmap)
    matrix_path = heatmap_filepath + '.matrix'

    print("Writing file %s with the information \n" % (matrix_path))
    print("Format:\nChrIndex \t StartBin(Nucleotyde) \t EndBin(Nucl) \t Values\n")

    # The context manager closes the file even when a row fails to format.
    with open(matrix_path, 'w') as f:
        for i in range(n_bins):
            curChrmIdx = genome_db.chrmIdxBinCont[i]
            # Bin number relative to the start of its own chromosome.
            if curChrmIdx == 0:
                curRelativeBinNumb = i
            else:
                curRelativeBinNumb = i - genome_db.chrmLensBin[0:curChrmIdx].sum()
            bin_start = genome_db.posBinCont[i]
            bin_end = bin_start + genome_db.binSizesBp[curChrmIdx][curRelativeBinNumb]

            fields = [str(curChrmIdx), str(bin_start), str(bin_end)]
            fields.extend(str(heatmap[i][j]) for j in range(n_bins))
            f.write("\t".join(fields) + "\n")
def calculateTanayCorrelation(
    resolution, filename1, filename2, experiment1, experiment2, genome, outfile, mouse=False, **kwargs
):
    """Calculate the correlation between two datasets, smoothed in a Tanay way.

    Both datasets are loaded into one ``binnedData`` object, poor regions
    and cis contacts are removed and the maps are iteratively corrected.
    Every chromosome-pair block is then smoothed with a 19x19 kernel
    ``1 / (1 + |dx| + |dy|)`` and normalised by the equally smoothed
    validity mask. The Spearman correlation over valid trans entries is
    written to ``outfile``. ``mouse`` and ``kwargs`` are accepted but not
    used here.
    """
    global pp
    if options.verbose:
        message = "calculateTanayCorrelation: res: %d file1: %s file2: %s exp1:%s exp2:%s gen:%s" % (
            resolution,
            filename1,
            filename2,
            experiment1,
            experiment2,
            genome,
        )
        sys.stdout.write(message + "\n")

    BD = binnedData(resolution, genome)
    BD.simpleLoad(filename1, experiment1)
    BD.simpleLoad(filename2, experiment2)

    # The smoothing kernel does not depend on the input, so build it once.
    offsets = numpy.arange(-9, 10)
    kernel = 1 / (1.0 + numpy.abs(offsets[:, None]) + numpy.abs(offsets[None, :]))

    def tanaySmooth(matrix):
        as_double = numpy.array(matrix, dtype="double")
        return scipy.ndimage.filters.convolve(input=as_double, weights=kernel, mode="constant")

    def propagateSmooth(data):
        covered = numpy.sum(data, axis=0) > 0
        pair_mask = covered[:, None] & covered[None, :]
        smoothed = numpy.zeros_like(data, dtype=float)
        chrom_count = BD.genome.chrmCount
        for i in xrange(chrom_count):
            rows = slice(BD.chromosomeStarts[i], BD.chromosomeEnds[i])
            for j in xrange(chrom_count):
                cols = slice(BD.chromosomeStarts[j], BD.chromosomeEnds[j])
                block_mask = pair_mask[rows, cols]
                # Normalise by the smoothed mask so that masked-out bins do
                # not dilute their neighbours.
                block = tanaySmooth(data[rows, cols]) / tanaySmooth(block_mask)
                block[block_mask == 0] = 0
                smoothed[rows, cols] = block
        return smoothed

    BD.removePoorRegions(cutoff=2)
    BD.removeCis()
    BD.iterativeCorrectWithoutSS()

    data1 = BD.dataDict[experiment1]
    data2 = BD.dataDict[experiment2]

    # Keep only bins that have signal in both datasets ...
    covered_in_both = (numpy.sum(data1, axis=0) > 0) & (numpy.sum(data2, axis=0) > 0)
    validMask = covered_in_both[:, None] & covered_in_both[None, :]
    # ... and only pairs of bins on different chromosomes.
    transmask = BD.chromosomeIndex[:, None] != BD.chromosomeIndex[None, :]
    cormask = transmask & validMask

    d1 = propagateSmooth(data1)
    d2 = propagateSmooth(data2)
    scorr, pvalue = scipy.stats.spearmanr(d1[cormask], d2[cormask])
    outfile.write("Spearman corrleation %s %s %.4f %.4f" % (filename1, filename2, scorr, pvalue))
def iterativeFiltering(genome_db, fragments):
    """Filter the 1M heatmap at the binned level and iteratively correct it.

    Reads ``heatmap-res-1M.hdf5`` from ``options.outputDir``, applies the
    filters below, exports the corrected map to ``IC-heatmap-res-1M.hdf5``
    in the same directory and plots the log of the result. ``fragments`` is
    accepted but not used here.
    """
    experiment = options.experiment
    raw_path = options.outputDir + 'heatmap-res-1M.hdf5'
    corrected_path = options.outputDir + 'IC-heatmap-res-1M.hdf5'

    # The resolution is stored inside the dataset itself.
    raw_heatmap = h5dict.h5dict(raw_path, mode='r')
    resolution = int(raw_heatmap['resolution'])

    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(raw_path, experiment)

    BD.removeDiagonal()             # contacts between loci within the same bin
    BD.removeBySequencedCount(0.5)  # bins with less than half of a bin sequenced
    BD.removePoorRegions(cutoff=1)  # 1% of regions with low coverage
    BD.truncTrans(high=0.0005)      # top 0.05% of interchromosomal counts (possibly, PCR blowouts)
    BD.iterativeCorrectWithoutSS()

    # Save the iteratively corrected heatmap, then show it.
    BD.export(experiment, corrected_path)
    plotting.plot_matrix(np.log(BD.dataDict[experiment]))
def step3(hiclib_path, sraid, res=1000000):
    """3. Filter and iteratively correct heatmaps.

    http://mirnylab.bitbucket.org/hiclib/tutorial/03_heatmap_processing.html

    Reads ``<sraid>_map-res<res/1000>k.hdf5`` and writes, next to it:
    a PDF of the raw heatmap, the corrected heatmap (``-ic.hdf5``), a PDF
    of the corrected heatmap (``-ic.pdf``) and a tab-separated bias table
    (``-ic-bias.txt``) with one ``chr, start, end, bias`` row per bin.

    Args:
        hiclib_path: hiclib checkout; the genome is read from
            ``<hiclib_path>/fasta/hg19``.
        sraid: prefix of all input and output file names.
        res: bin size used in the file names and for the bin end column.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    from mirnylib import genome
    from mirnylib import h5dict
    from mirnylib import plotting
    from hiclib import binnedData

    genome_db = genome.Genome(hiclib_path + '/fasta/hg19', readChrms=['#', 'X'])

    # Common stem of every file this step reads or writes.
    stem = sraid + '_map-res%sk' % (res / 1000)

    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(stem + '.hdf5', mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(stem + '.hdf5', 'DataName')

    # Plot the heatmap directly.
    plotting.plot_matrix(np.log(BD.dataDict['DataName']))
    plt.savefig(stem + '.pdf')
    plt.clf()

    # Remove the contacts between loci located within the same bin.
    BD.removeDiagonal()
    # Remove bins with less than half of a bin sequenced.
    BD.removeBySequencedCount(0.5)
    # Remove 1% of regions with low coverage.
    BD.removePoorRegions(cutoff=1)
    # Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts).
    BD.truncTrans(high=0.0005)
    # Perform iterative correction.
    BD.iterativeCorrectWithoutSS()

    # Save the iteratively corrected heatmap.
    BD.export('DataName', stem + '-ic.hdf5')

    # Plot the corrected heatmap.
    plotting.plot_matrix(np.log(BD.dataDict['DataName']))
    plt.savefig(stem + '-ic.pdf')
    plt.clf()

    # Save bias; the context manager closes the file even if a row fails.
    with open(stem + '-ic-bias.txt', "w") as outfile:
        for i in xrange(len(BD.chromosomeIndex)):
            chro = BD.genome.idx2label[BD.chromosomeIndex[i]]
            posi = BD.positionIndex[i]
            outfile.write("chr%s\t%s\t%s" % (chro, posi, posi + res))
            outfile.write("\t%s" % BD.biasDict['DataName'][i])
            outfile.write("\n")
def get_chromosomes(hm_file, genome_db, resolution, chrNumb=None):
    """Return the intra-chromosomal heatmap(s) stored in ``hm_file``.

    The storage format is chosen from the file name: names containing
    ``hiRes.hm`` are loaded with ``highResBinnedData.HiResHiC`` and names
    containing ``bychr.hm`` with ``binnedData.binnedData``. For any other
    name the file is opened and treated as HiRes when it has a ``"0 0"``
    key; otherwise, or when it cannot be opened, the bychr format is
    assumed.

    Args:
        hm_file: path of the heatmap file.
        genome_db: genome object the heatmap was built with.
        resolution: bin size; a warning is printed when it differs from
            the resolution extracted from the file name.
        chrNumb: optional chromosome index; when given only that
            chromosome's matrix is returned.

    Returns:
        One matrix when ``chrNumb`` is given, otherwise a list with one
        matrix per chromosome in chromosome-index order.
    """
    file_resolution = extractResolutionFromFileName(hm_file)
    if file_resolution != resolution:
        print("WARNING! Provided resolution  %s does not match  %s extracted from file name  %s"
              % (resolution, file_resolution, hm_file))

    if "hiRes.hm" in hm_file:
        hm_type = "HiRes"
    elif "bychr.hm" in hm_file:
        hm_type = "bychr"
    else:
        print("Warning: cannot resolve type of data from filename")
        hm_type = "bychr"
        try:
            print("Warning: trying hires hic")
            # The original probed an undefined name (``fname``); the
            # resulting NameError was swallowed, so the probe never ran.
            raw_heatmap = h5dict.h5dict(hm_file, mode='r')  # open heatmap
            if "0 0" in raw_heatmap.keys():
                hm_type = "HiRes"
            else:
                print("HiRes hic Failed! Assuming bychr type")
        except Exception:
            # Deliberate best effort: an unreadable file falls back to bychr.
            print("HiRes hic Failed! Assuming bychr type")

    if hm_type == "HiRes":
        from hiclib import highResBinnedData
        # Create a object, load the data.
        print("creating an object")
        hmap = highResBinnedData.HiResHiC(genome_db, resolution)
        print("loading data")
        hmap.loadData(hm_file, mode="cis")
        print("Data loaded")
        if chrNumb is not None:
            return hmap.data[(chrNumb, chrNumb)].getData()
        # cisKeys are tuples like (N,N) where N is 0..Number_of_chrms-1
        return [hmap.data[(i, i)].getData() for i in xrange(genome_db.chrmCount)]

    # hm_type == "bychr": one genome-wide matrix, sliced per chromosome.
    from hiclib import binnedData
    print("creating an object")
    hmap = binnedData.binnedData(resolution, genome_db)
    print("loading data")
    hmap.simpleLoad(hm_file, "heatmap")
    data = hmap.dataDict["heatmap"]
    assert len(data) == genome_db.numBins
    print("Data loaded")

    def cis_block(chrom):
        """Slice the square block of ``data`` belonging to one chromosome."""
        begin = genome_db.chrmStartsBinCont[chrom]
        end = genome_db.chrmEndsBinCont[chrom]
        return data[begin:end, begin:end]

    if chrNumb is not None:
        return cis_block(chrNumb)
    return [cis_block(i) for i in xrange(genome_db.chrmCount)]
def step4(hiclib_path, sraid, res=1000000):
    """4. Eigen vector decomposition.

    /examples/iterativeCorrectionEigenvectorExpansion/eigenvectorAnalysis.py

    Reads ``<sraid>_map-res<res/1000>k.hdf5``, filters it, computes the
    first 30 eigenvectors, saves a PDF of the log of
    ``eig_v.T * diag(eig) * eig_v`` (``-eig.pdf``) and writes a
    tab-separated table (``-ic-eig.txt``) with one row per bin:
    ``chr, start, end`` followed by that bin's value in every eigenvector.

    Args:
        hiclib_path: hiclib checkout; the genome is read from
            ``<hiclib_path>/fasta/hg19``.
        sraid: prefix of all input and output file names.
        res: bin size used in the file names and for the bin end column.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    from mirnylib import genome
    from mirnylib import h5dict
    from mirnylib import plotting
    from hiclib import binnedData

    genome_db = genome.Genome(hiclib_path + '/fasta/hg19', readChrms=['#', 'X'])

    # Common stem of every file this step reads or writes.
    stem = sraid + '_map-res%sk' % (res / 1000)

    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(stem + '.hdf5', mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(stem + '.hdf5', 'DataName')

    # Do eigen decomposition
    BD.removeDiagonal()
    BD.removeBySequencedCount(0.5)
    BD.removeCis()
    BD.truncTrans(high=0.0005)
    BD.removePoorRegions(cutoff=1)
    BD.fakeCis()
    BD.removeZeros()
    BD.doEig(numPCs=30, force=True)  # First 30 EIGs
    BD.restoreZeros(value=0)

    eig = BD.eigEigenvalueDict['DataName']
    eig_v = BD.EigDict['DataName']

    # Plot the heatmap directly.
    plotting.plot_matrix(np.log(np.dot(np.dot(eig_v.T, np.diag(eig)), eig_v)))
    plt.savefig(stem + '-eig.pdf')
    plt.clf()

    # The context manager closes the file even if a row fails to format.
    with open(stem + '-ic-eig.txt', "w") as outfile:
        for i in xrange(len(BD.chromosomeIndex)):
            chro = BD.genome.idx2label[BD.chromosomeIndex[i]]
            posi = BD.positionIndex[i]
            outfile.write("chr%s\t%s\t%s" % (chro, posi, posi + res))
            for eigenvector in eig_v:
                outfile.write("\t%s" % eigenvector[i])
            outfile.write("\n")
def step4(hiclib_path, sraid, res=1000000):
    """4. Eigen vector decomposition.

    /examples/iterativeCorrectionEigenvectorExpansion/eigenvectorAnalysis.py

    Reads ``<sraid>_map-res<res/1000>k.hdf5``, filters it, computes the
    first 30 eigenvectors, saves a PDF of the log of
    ``eig_v.T * diag(eig) * eig_v`` (``-eig.pdf``) and writes a
    tab-separated table (``-ic-eig.txt``) with one row per bin:
    ``chr, start, end`` followed by that bin's value in every eigenvector.

    Args:
        hiclib_path: hiclib checkout; the genome is read from
            ``<hiclib_path>/fasta/hg19``.
        sraid: prefix of all input and output file names.
        res: bin size used in the file names and for the bin end column.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    from mirnylib import genome
    from mirnylib import h5dict
    from mirnylib import plotting
    from hiclib import binnedData

    genome_db = genome.Genome(hiclib_path + '/fasta/hg19', readChrms=['#', 'X'])

    # Common stem of every file this step reads or writes.
    stem = sraid + '_map-res%sk' % (res / 1000)

    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(stem + '.hdf5', mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(stem + '.hdf5', 'DataName')

    # Do eigen decomposition
    BD.removeDiagonal()
    BD.removeBySequencedCount(0.5)
    BD.removeCis()
    BD.truncTrans(high=0.0005)
    BD.removePoorRegions(cutoff=1)
    BD.fakeCis()
    BD.removeZeros()
    BD.doEig(numPCs=30, force=True)  # First 30 EIGs
    BD.restoreZeros(value=0)

    eig = BD.eigEigenvalueDict['DataName']
    eig_v = BD.EigDict['DataName']

    # Plot the heatmap directly.
    plotting.plot_matrix(np.log(np.dot(np.dot(eig_v.T, np.diag(eig)), eig_v)))
    plt.savefig(stem + '-eig.pdf')
    plt.clf()

    # The context manager closes the file even if a row fails to format.
    with open(stem + '-ic-eig.txt', "w") as outfile:
        for i in xrange(len(BD.chromosomeIndex)):
            chro = BD.genome.idx2label[BD.chromosomeIndex[i]]
            posi = BD.positionIndex[i]
            outfile.write("chr%s\t%s\t%s" % (chro, posi, posi + res))
            for eigenvector in eig_v:
                outfile.write("\t%s" % eigenvector[i])
            outfile.write("\n")
def calculateTanayCorrelation():
    """Calculate the correlation between datasets, smoothed in a Tanay way.

    Loads the GM HindIII and NcoI 1M heatmaps (hg18), removes poor regions
    and cis contacts, iteratively corrects both maps, smooths every
    chromosome-pair block with a 19x19 kernel ``1 / (1 + |dx| + |dy|)``
    normalised by the equally smoothed validity mask, and prints the
    Spearman correlation over valid trans entries.
    """
    BD = binnedData(1000000, "../../../data/hg18")
    BD.simpleLoad("../../../ErezPaperData/hg18/GM-HindIII-hg18-1M.hm", "HindIII")
    BD.simpleLoad("../../../ErezPaperData/hg18/GM-NcoI-hg18-1M.hm", "NcoI")

    # The smoothing kernel does not depend on the input, so build it once.
    offsets = numpy.arange(-9, 10)
    kernel = 1 / (1. + numpy.abs(offsets[:, None]) + numpy.abs(offsets[None, :]))

    def tanaySmooth(matrix):
        as_double = numpy.array(matrix, dtype="double")
        return scipy.ndimage.filters.convolve(input=as_double, weights=kernel, mode="constant")

    def propagateSmooth(data):
        covered = numpy.sum(data, axis=0) > 0
        pair_mask = covered[:, None] & covered[None, :]
        smoothed = numpy.zeros_like(data, dtype=float)
        chrom_count = BD.genome.chrmCount
        for i in xrange(chrom_count):
            rows = slice(BD.chromosomeStarts[i], BD.chromosomeEnds[i])
            for j in xrange(chrom_count):
                cols = slice(BD.chromosomeStarts[j], BD.chromosomeEnds[j])
                block_mask = pair_mask[rows, cols]
                # Normalise by the smoothed mask so that masked-out bins do
                # not dilute their neighbours.
                block = tanaySmooth(data[rows, cols]) / tanaySmooth(block_mask)
                block[block_mask == 0] = 0
                smoothed[rows, cols] = block
        return smoothed

    BD.removePoorRegions(cutoff=2)
    BD.removeCis()
    BD.iterativeCorrectWithoutSS()

    data1 = BD.dataDict["HindIII"]
    data2 = BD.dataDict["NcoI"]

    # Keep only bins that have signal in both datasets ...
    covered_in_both = (numpy.sum(data1, axis=0) > 0) & (numpy.sum(data2, axis=0) > 0)
    validMask = covered_in_both[:, None] & covered_in_both[None, :]
    # ... and only pairs of bins on different chromosomes.
    transmask = BD.chromosomeIndex[:, None] != BD.chromosomeIndex[None, :]
    cormask = transmask & validMask

    d1 = propagateSmooth(data1)
    d2 = propagateSmooth(data2)
    print(scipy.stats.spearmanr(d1[cormask], d2[cormask]))
def saddlePlot():
    """Plot heatmap values with bins ordered by Eig1GW.

    The "GM-all" 1M heatmap is filtered, iteratively corrected and
    decomposed; rows and columns are then sorted by the first column of
    ``Tanay.EIG["GM-all"]`` (sign-flipped so its first entry is not
    positive), coarse-grained with ``coarsegrain(..., 60)``, divided by the
    mean and shown on a log scale.
    """
    # plt.figure(figsize = (1.5,1.5))
    plt.figure(figsize=(3, 3))

    Tanay = binnedData(1000000)
    Tanay.simpleLoad("../data/GM-all-hg18-1M", "GM-all")
    Tanay.removeDiagonal(1)
    Tanay.removePoorRegions()
    Tanay.removeZeros()
    Tanay.fakeCis()
    Tanay.iterativeCorrectWithoutSS()
    Tanay.doEig()

    first_eig = Tanay.EIG["GM-all"][:, 0]
    if first_eig[0] > 0:
        first_eig = -first_eig

    # Apply the same permutation to rows and then to columns.
    order = numpy.argsort(first_eig)
    reordered = Tanay.dataDict["GM-all"][order, :][:, order]

    toplot = coarsegrain(reordered, 60)
    toplot /= toplot.mean()
    toplot = numpy.log(toplot)

    # Replace the single largest value by the second largest one: zero it
    # out first, then take the maximum of what is left.
    original_shape = toplot.shape
    toplot = toplot.reshape(-1)
    peak = numpy.argmax(toplot)
    toplot[peak] = 0
    toplot[peak] = numpy.max(toplot)
    toplot.shape = original_shape

    # Overwrite the two anti-diagonal corners with their neighbours.
    toplot[0, -1] = toplot[0, -2]
    toplot[-1, 0] = toplot[-2, 0]

    plt.imshow(toplot, vmin=toplot.min(), vmax=toplot.max(), interpolation="nearest")
    cbar = plt.colorbar(orientation="vertical")
    # labels = ["10","100","1000","10000"]
    # cbar.ax.set_xticklabels(labels)
    cbar.ax.set_xlabel("Log(relative contact probability)", fontsize=6)
    for tick_label in cbar.ax.get_xticklabels():
        tick_label.set_fontsize(6)
    cbar.set_ticks([-0.5, 0, 0.5, 1])
    removeBorder()
    mirnylib.plotting.niceShow()
def iterativeFiltering(genome_db, filesuffix):
    """Filter the data at the binned level and perform iterative correction.

    Reads ``<outputDir><experiment><filesuffix>``, applies the filters
    below, exports the corrected map to
    ``<outputDir><experiment>-IC<filesuffix>`` and adds a log-scale plot of
    it to ``pp``.
    """
    experiment = options.experiment
    base_path = options.outputDir + experiment
    raw_path = base_path + filesuffix

    # The resolution is stored inside the dataset itself.
    raw_heatmap = h5dict.h5dict(raw_path, mode='r')
    resolution = int(raw_heatmap['resolution'])

    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(raw_path, experiment)

    BD.removeDiagonal()             # contacts between loci within the same bin
    BD.removeBySequencedCount(0.5)  # bins with less than half of a bin sequenced
    BD.removePoorRegions(cutoff=1)  # 1% of regions with low coverage
    BD.truncTrans(high=0.0005)      # top 0.05% of interchromosomal counts (possibly, PCR blowouts)
    BD.removeZeros()                # empty bins
    BD.iterativeCorrectWithoutSS()

    # Save the iteratively corrected heatmap.
    BD.export(experiment, base_path + '-IC' + filesuffix)

    plt.figure()
    plotting.plot_matrix(np.log(BD.dataDict[experiment]))
    pp.savefig()
def doEigenvector(filename, genome):
    """Return a per-bin track for ``genome``.

    With the special file name ``"GC"`` the concatenated ``GCBin`` arrays
    of the genome at 1 Mb resolution are returned. Otherwise the heatmap in
    ``filename`` is filtered, decomposed with ``doEig(numPCs=2)`` and the
    first entry of ``EigDict`` is returned with removed bins restored as 0.
    """
    if filename == "GC":
        gc_genome = Genome("/home/magus/HiC2011/data/" + genome, readChrms=["#", "X"])
        gc_genome.setResolution(1000000)
        return np.concatenate(gc_genome.GCBin)

    dataset = "bla"
    BD = binnedData.binnedData(
        getResolution(filename),
        "/home/magus/HiC2011/data/" + genome,
        ["#", "X"],
    )
    BD.simpleLoad(filename, dataset)

    BD.removeDiagonal()
    BD.removeBySequencedCount(0.5)
    BD.removeCis()
    BD.truncTrans(high=0.0005)
    BD.removePoorRegions(cutoff=1)
    BD.fakeCis()
    BD.removeZeros()
    BD.doEig(numPCs=2)
    BD.restoreZeros(value=0)

    return BD.EigDict[dataset][0]
def CorrectHeatMap():
    """Filter and iteratively correct the raw heatmap.

    Reads ``<heatmap_filepath>-raw`` and exports the corrected map to
    ``heatmap_filepath``. Uses the module-level ``heatmap_filepath`` and
    ``genome_db``.
    """
    raw_path = heatmap_filepath + '-raw'
    dataset = 'HindIII_GM_1'

    # The resolution is stored inside the dataset itself.
    print("Loading raw heatmap\n")
    raw_heatmap = h5dict.h5dict(raw_path, mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Set resolution for genome
    # genome_db.setResolution(resolution)

    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(raw_path, dataset)

    BD.removeDiagonal()             # contacts between loci within the same bin
    BD.removeBySequencedCount(0.5)  # bins with less than half of a bin sequenced
    BD.removePoorRegions(cutoff=1)  # 1% of regions with low coverage
    BD.truncTrans(high=0.0005)      # top 0.05% of interchromosomal counts (possibly, PCR blowouts)
    BD.iterativeCorrectWithoutSS()

    # Save the iteratively corrected heatmap.
    BD.export(dataset, heatmap_filepath)
# Fragment: validate inputs, load every dataset into one binnedData object
# and apply the standard filters that precede iterative correction.
# NOTE(review): `inDataset` (singular) is checked here while `inDatasets`
# (a mapping of name -> filename) is used below — presumably `inDataset` is
# bound by enclosing code that is not visible here; confirm.
if not os.path.exists(inDataset):
    raise IOError("Raw heatmap file does not exist: {}".format(inDataset))
if not os.path.isdir(genomeFolder):
    raise IOError("Genome folder does not exist")

# When you do this, be sure that readChrms used to save heatmap matches
# readChrms that you define here!
genome = Genome(genomeFolder, readChrms=readChrms)

# Read resolution from one of the datasets
sampleDataset = h5dict(inDatasets.values()[0], mode="r")  # random dataset
resolution = int(sampleDataset["resolution"])

# Define the binnedData object, load data
BD = binnedData(resolution, genome, readChrms)
for name, filename in inDatasets.items():
    BD.simpleLoad(filename, name)

BD.removeDiagonal()

# Remove bins with less than half of a bin sequenced
BD.removeBySequencedCount(0.5)

# Remove 1% of regions with low coverage
BD.removePoorRegions(cutoff=1)

# Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts)
BD.truncTrans(high=0.0005)

# Actually perform iterative correction
source = os.path.join(HiCFolder, sys.argv[1]) # file after calling runHiC binning outfile = sys.argv[2] # params from source lib = h5dict(source, 'r') res = lib['resolution'] gInfo = lib['genomeInformation'] genomeFolder = os.path.join(gInfo['dataFolder'], gInfo['genomeName']) # Load binned data and perform PCA ... genome_db = myGenome(genomeFolder, readChrms=gInfo['chroms'], chrmFileTemplate=gInfo['template'], gapFile=gInfo['gapFile']) BD = binnedData.binnedData(res, genome_db) name = os.path.split(source)[1].split('-')[0] BD.simpleLoad(source, name) BD.doCisPCADomains(3) # Identify compartments ... pcas = BD.PCDict[name].T idx2label = gInfo['idx2label'] for i in idx2label: label = idx2label[i] mask = BD.chromosomeIndex == i tmp = pcas[mask] compartments = compartmentFromPCA(tmp[:, 0], res, label) with open(outfile, 'a') as output: for c in compartments: line = '\t'.join(map(str, c)) + '\n' output.write(line)
# Script fragment: build the input/output paths for the ESC_full dataset,
# load the heatmap twice and initialise the accumulators used further on.
base_folder = '/mnt/storage/home/vsfishman/HiC/data/'
base_filename = 'ESC_full'
heatmap_filepath = base_folder + 'heatmap-res-' + str(
    domain_res / 1000) + 'KB_' + base_filename + '.hdf5'
# NOTE(review): built from exactly the same expression as heatmap_filepath,
# and not referenced again in this fragment — confirm whether a distinct
# raw file was intended.
raw_heatmap_filepath = base_folder + 'heatmap-res-' + str(
    domain_res / 1000) + 'KB_' + base_filename + '.hdf5'
maped_reads_filepath = base_folder + 'mapped_reads_' + base_filename + '.hdf5'
figure_path = base_folder + base_filename + "_" + str(
    domain_res / 1000) + 'kb-Xist.png'
genome_fai_filepath = '../../fasta/' + genome_name + '/' + genome_name + '.fai'

print "Loading file " + heatmap_filepath
BD = binnedData.binnedData(domain_res, genome_db)
BD.simpleLoad(heatmap_filepath, 'HindIII_GM_1')

# BD_raw is loaded from heatmap_filepath as well, so at this point q and
# q_raw come from the same file.
BD_raw = binnedData.binnedData(domain_res, genome_db)
BD_raw.simpleLoad(heatmap_filepath, 'HindIII_GM_1')

q = BD.dataDict['HindIII_GM_1']
q_raw = BD_raw.dataDict['HindIII_GM_1']

# Accumulators and settings consumed by code after this fragment.
X_values = []
Y_values = []
Y_errors = []
binnumber = 100
dist = -1
# Total number of bins in the first 19 chromosomes (indices 0..18).
start = sum(genome_db.chrmLensBin[0:19])
def plotDiagonalCorrelation(resolution, filename1, filename2, experiment1, experiment2, genome, mouse=False, **kwargs):
    """Correlation of diagonal bins - paper figure.

    For diagonals 2..49 of the two heatmaps the function computes
    ``cr(...)[0]`` between matching diagonals three times: on the raw data,
    after ``iterativeCorrectWithoutSS(M=1)`` and after a further
    ``iterativeCorrectWithoutSS(M=20)``. The three curves are printed,
    plotted against genomic separation, saved to ``pp`` and shown.

    Args:
        resolution: bin size of both heatmaps, in base pairs.
        filename1, filename2: heatmap files to compare.
        experiment1, experiment2: names the datasets are loaded under.
        genome: genome passed to ``binnedData``.
        mouse, **kwargs: accepted but not used here.
    """
    global pp
    if options.verbose:
        message = "plotDiagonalCorrelation: res: %d file1: %s file2: %s exp1:%s exp2:%s gen:%s" % (
            resolution,
            filename1,
            filename2,
            experiment1,
            experiment2,
            genome,
        )
        sys.stdout.write(message + "\n")

    S = 50
    x = numpy.arange(2, S)

    Tanay = binnedData(resolution, genome)
    Tanay.simpleLoad(filename1, experiment1)
    Tanay.simpleLoad(filename2, experiment2)
    Tanay.removeDiagonal(1)
    Tanay.removePoorRegions()
    Tanay.removeZeros()

    pairs = [(experiment1, experiment2)]

    def diagonal_correlations():
        """Return, per pair, the correlation of every diagonal offset in x."""
        return [
            [
                cr(numpy.diagonal(Tanay.dataDict[first], i),
                   numpy.diagonal(Tanay.dataDict[second], i))[0]
                for i in x
            ]
            for first, second in pairs
        ]

    cors = diagonal_correlations()            # raw data
    Tanay.iterativeCorrectWithoutSS(M=1)
    cors2 = diagonal_correlations()           # single correction
    Tanay.iterativeCorrectWithoutSS(M=20)
    cors3 = diagonal_correlations()           # iterative correction

    matplotlib.rcParams["font.sans-serif"] = "Arial"
    print("Eigenvectors")
    print(cors)
    print(cors2)
    print(cors3)

    # Convert diagonal offsets (in bins) to megabases for the actual bin
    # size, rather than assuming 200 kb bins.
    separation_mb = x * (resolution / 1e6)
    fs = 8

    plt.figure(figsize=(8, 4))
    for j, pair in enumerate(pairs):
        # Subplot indices are 1-based; style the axes of this subplot, not
        # the ones that existed before it was created.
        ax = plt.subplot(1, len(pairs), j + 1)
        plt.title("%s vs %s" % pair)
        plt.plot(separation_mb, cors3[j], color="#E5A826", label="Iterative")
        plt.plot(separation_mb, cors2[j], color="#28459A", label="Single")
        plt.plot(separation_mb, cors[j], color="#E55726", label="Raw")
        plt.xlabel("Genomic Separation, MB", fontsize=8)
        plt.ylabel("Spearman correlation", fontsize=8)
        legend = plt.legend(prop={"size": 6}, loc=9, handlelength=2)
        legend.draw_frame(False)
        plt.ylim((0, 1))
        removeAxes(shift=0)
        for xlabel_i in ax.get_xticklabels():
            xlabel_i.set_fontsize(fs)
        for xlabel_i in ax.get_yticklabels():
            xlabel_i.set_fontsize(fs)

    # Save before showing: once an interactive window is closed the figure
    # may no longer be the current one.
    pp.savefig()
    plt.show()
from mirnylib.genome import Genome
from hiclib.binnedData import binnedData
from mirnylib.h5dict import h5dict
import numpy as np
import sys
import os

# The genome folder is the first command-line argument; chromosomes 1-5
# are read.
genome = Genome(sys.argv[1], readChrms=["1", "2", "3", "4", "5"])

# High-resolution (by-chromosome) container, filtered and corrected.
# NOTE(review): HiResHiC is not imported by the lines above — presumably it
# is brought into scope elsewhere; confirm.
a = HiResHiC(genome, 1000000, "hiResDict", mode='w')
a.loadData(dictLike="../fragmentHiC/test-1M-byChr.hm")
a.removeDiagonal()
a.removePoorRegions(2)
a.iterativeCorrection(1e-10)

# Genome-wide binned container given the same cutoff and tolerance.
b = binnedData(1000000, genome)
data = {"heatmap": h5dict("../fragmentHiC/test-1M.hm")["heatmap"]}
# Crop the heatmap to the last bin of the last chromosome that was read.
lim = b.genome.chrmEndsBinCont[-1]
data["heatmap"] = data["heatmap"][:lim, :lim]
b.simpleLoad(data, "data")
b.removeDiagonal()
b.removePoorRegions(cutoff=2)
b.iterativeCorrectWithoutSS(tolerance=1e-10)
a.export("testExport")


# NOTE(review): this definition appears to continue beyond the visible
# lines; only the part shown here is reproduced.
def compareData():
    dataHigh = a.getCombinedMatrix()
    dataLow = b.dataDict["data"]
# For every heatmap file, accumulate the contacts found at each cis
# diagonal offset (summed over chromosomes) and turn them into percentages
# of the total.
diags={}
for i in filenames.keys():
    print "Reading file "+i
#    if (i.split('.')[-1]=='hdf5'):
    if True:
        if (resolution==0): #if we do not know resolution
            raw_heatmap = h5dict.h5dict(i, mode='r') #open heatmap
            resolution = int(raw_heatmap['resolution']) #get the resolution
            del raw_heatmap #close heatmap
        if (genome_db==None): #if we have not initialized genome before
            genome_db = genome.Genome("/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/galGal5_all_contigs.filtered/", readChrms=[], chrmFileTemplate="N%s.fa")
        BD = binnedData.binnedData(resolution, genome_db) #now we can initialize heatmap with defined resolution and genome
        BD.simpleLoad(i, 'heatmap')
        number_of_bins=len(BD.dataDict['heatmap'])
        # One slot per diagonal offset, sized for the longest chromosome.
        diags[i]=np.zeros(max(genome_db.chrmLensBin))
        for chr in xrange(genome_db.chrmCount):
            for j in xrange(genome_db.chrmLensBin[chr]):
                # Square cis block of the current chromosome.
                cur_chr_matrix = BD.dataDict['heatmap'][genome_db.chrmStartsBinCont[chr]:genome_db.chrmEndsBinCont[chr],genome_db.chrmStartsBinCont[chr]:genome_db.chrmEndsBinCont[chr]]
                # Sum of the j-th diagonal, doubled.
                diags[i][j] += sum(np.diag(cur_chr_matrix,j))*2
        print np.sum(diags[i])
        # Express every offset as a percentage of the total.
        diags[i] = (diags[i]/np.sum(diags[i]))*100.0
        print (diags[i][0:10])/100.0
        del BD
# NOTE(review): the body of this definition is not visible here.
def contact_freq_total():
def calculateTanayCorrelation(resolution, filename1, filename2, experiment1, experiment2, genome, outfile,
                              mouse=False, **kwargs):
    """Calculate the correlation between two datasets, smoothed in a Tanay way.

    Both datasets are loaded into one ``binnedData`` object, poor regions
    and cis contacts are removed and the maps are iteratively corrected.
    Every chromosome-pair block is then smoothed with a 19x19 kernel
    ``1 / (1 + |dx| + |dy|)`` and normalised by the equally smoothed
    validity mask. The Spearman correlation over valid trans entries is
    written to ``outfile``. ``mouse`` and ``kwargs`` are accepted but not
    used here.
    """
    global pp
    if options.verbose:
        message = "calculateTanayCorrelation: res: %d file1: %s file2: %s exp1:%s exp2:%s gen:%s" % (
            resolution, filename1, filename2, experiment1, experiment2, genome)
        sys.stdout.write(message + "\n")

    BD = binnedData(resolution, genome)
    BD.simpleLoad(filename1, experiment1)
    BD.simpleLoad(filename2, experiment2)

    # The smoothing kernel does not depend on the input, so build it once.
    offsets = numpy.arange(-9, 10)
    kernel = 1 / (1. + numpy.abs(offsets[:, None]) + numpy.abs(offsets[None, :]))

    def tanaySmooth(matrix):
        as_double = numpy.array(matrix, dtype="double")
        return scipy.ndimage.filters.convolve(input=as_double, weights=kernel, mode="constant")

    def propagateSmooth(data):
        covered = numpy.sum(data, axis=0) > 0
        pair_mask = covered[:, None] & covered[None, :]
        smoothed = numpy.zeros_like(data, dtype=float)
        chrom_count = BD.genome.chrmCount
        for i in xrange(chrom_count):
            rows = slice(BD.chromosomeStarts[i], BD.chromosomeEnds[i])
            for j in xrange(chrom_count):
                cols = slice(BD.chromosomeStarts[j], BD.chromosomeEnds[j])
                block_mask = pair_mask[rows, cols]
                # Normalise by the smoothed mask so that masked-out bins do
                # not dilute their neighbours.
                block = tanaySmooth(data[rows, cols]) / tanaySmooth(block_mask)
                block[block_mask == 0] = 0
                smoothed[rows, cols] = block
        return smoothed

    BD.removePoorRegions(cutoff=2)
    BD.removeCis()
    BD.iterativeCorrectWithoutSS()

    data1 = BD.dataDict[experiment1]
    data2 = BD.dataDict[experiment2]

    # Keep only bins that have signal in both datasets ...
    covered_in_both = (numpy.sum(data1, axis=0) > 0) & (numpy.sum(data2, axis=0) > 0)
    validMask = covered_in_both[:, None] & covered_in_both[None, :]
    # ... and only pairs of bins on different chromosomes.
    transmask = BD.chromosomeIndex[:, None] != BD.chromosomeIndex[None, :]
    cormask = transmask & validMask

    d1 = propagateSmooth(data1)
    d2 = propagateSmooth(data2)
    scorr, pvalue = scipy.stats.spearmanr(d1[cormask], d2[cormask])
    outfile.write("Spearman corrleation %s %s %.4f %.4f" % (filename1, filename2, scorr, pvalue))
#IMPORTANT: use iter-corrected heatmaps here. Otherwise, take care about adjustment of total reads number when calculating mask_hugeDifference heatmap_filepath1=sys.argv[1] heatmap_filepath2=sys.argv[2] #out_heatmap_filepath2 = base_folder+'heatmap-res-'+str(domain_res/1000)+'KB_'+base_filename2+'_compressed_as_'+base_filename1+'.hdf5' #figure_path = base_folder+'heatmap-res-'+str(domain_res/1000)+'KB_'+base_filename2+'_compressed_as_'+base_filename1+'.png' print "Loading file "+heatmap_filepath1 raw_heatmap = h5dict.h5dict(heatmap_filepath1, mode='r') res = int(raw_heatmap['resolution']) print "resolution defined by heatmap: ",res BD1 = binnedData.binnedData(res, genome_db1) BD1.simpleLoad(heatmap_filepath1, 'heatmap') print "Loading file "+heatmap_filepath2 BD2 = binnedData.binnedData(res, genome_db2) BD2.simpleLoad(heatmap_filepath2, 'heatmap') q1=BD1.dataDict['heatmap'] q2=BD2.dataDict['heatmap'] #-----DEBUG------ #print "Plotting contact matrix" #plotting.plot_matrix(np.log(q2)) #plt.subplots_adjust(bottom=0.15) #print "Saving figure "+figure_path+'tmp.png'
def filter_bychr_heatmap(hm_file):
    """Filter and iteratively correct a heatmap, saving a picture per stage.

    Three PNGs of the log of the top-left corner of the heatmap (at most
    10000 x 10000 bins) are written next to ``hm_file``: ``stage1.png``
    right after loading, ``stage2.png`` after filtering and ``stage3.png``
    after iterative correction. The corrected heatmap is exported to
    ``<hm_file>.IC.hdf5``. Uses the module-level ``genome_db``.

    Args:
        hm_file: path of the heatmap; its name must encode the resolution.

    Raises:
        ValueError: if no resolution can be extracted from the file name.
    """
    resolution = extractResolutionFromFileName(hm_file)
    if resolution is None:
        # A bare ``raise`` has no active exception to re-raise here.
        raise ValueError("Cannot extract resolution from file name: %s" % hm_file)

    from hiclib import binnedData
    # Create a object, load the data.
    print("creating an object")
    hmap = binnedData.binnedData(resolution, genome_db)
    print("loading data")
    hmap.simpleLoad(hm_file, "heatmap")

    print("saving pict of heatmap")
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    from mirnylib import plotting

    maxlen = min(10000, len(hmap.dataDict["heatmap"]))

    def save_stage(suffix, dpi):
        """Plot the log of the heatmap corner and save it as hm_file + suffix."""
        corner = hmap.dataDict["heatmap"][0:maxlen, 0:maxlen]
        figure_path = hm_file + suffix
        print("saving  %s" % (figure_path,))
        plotting.plot_matrix(np.log(corner))
        plt.subplots_adjust(bottom=0.15)
        # savefig opens the path itself; no separate file handle is needed.
        plt.savefig(figure_path, dpi=dpi)
        plt.clf()

    save_stage("stage1.png", 600)

    # Remove the contacts between loci located within the same bin +/- 1 bin.
    hmap.removeDiagonal(m=1)
    # new filter: omit all bins with less than 0.5 coverage by sequenced
    # bases (i.e. bases present in the genome)
    hmap.removeBySequencedCount()
    # remove .5% bins with the lowest number of records (i.e. non-zero
    # entrees in the matrix). This filter was updated to remove bins which
    # have zero contacts and one PCR blowout. Those bins would have many
    # reads, but all reads will be with one or few other bins.
    hmap.removePoorRegions(cutoff=0.5, coverage=True)
    # remove PCR blowouts from trans data
    hmap.truncTrans()

    save_stage("stage2.png", 200)

    hmap.iterativeCorrectWithoutSS(force=True)  # do iterative correction

    save_stage("stage3.png", 600)

    # Save the iteratively corrected heatmap.
    hmap.export("heatmap", hm_file + ".IC.hdf5", False)
def plotDiagonalCorrelation():
    """Correlation of diagonal bins - paper figure.

    For diagonals 2..49 of the GM-HindIII, GM-NcoI and TCC heatmaps at
    200 kb resolution the function computes ``cr(...)[0]`` between matching
    diagonals of each dataset pair three times: on the raw data, after
    ``iterativeCorrectWithoutSS(M=1)`` and after a further
    ``iterativeCorrectWithoutSS(M=20)``. The curves are printed and plotted
    in one subplot per pair.
    """
    S = 50
    x = numpy.arange(2, S)

    Tanay = binnedData(200000, myGenome)
    Tanay.simpleLoad(GM200k, "GM-HindIII")
    Tanay.simpleLoad(GM200kNcoI, "GM-NcoI")
    Tanay.simpleLoad(tcc200k, "TCC")
    Tanay.removeDiagonal(1)
    Tanay.removePoorRegions()
    Tanay.removeZeros()

    pairs = [("GM-HindIII", "GM-NcoI"), ("GM-HindIII", "TCC"), ("GM-NcoI", "TCC")]

    def diagonal_correlations():
        """Return, per pair, the correlation of every diagonal offset in x."""
        return [
            [
                cr(numpy.diagonal(Tanay.dataDict[first], i),
                   numpy.diagonal(Tanay.dataDict[second], i))[0]
                for i in x
            ]
            for first, second in pairs
        ]

    cors = diagonal_correlations()            # raw data
    Tanay.iterativeCorrectWithoutSS(M=1)
    cors2 = diagonal_correlations()           # single correction
    Tanay.iterativeCorrectWithoutSS(M=20)
    cors3 = diagonal_correlations()           # iterative correction

    matplotlib.rcParams['font.sans-serif'] = 'Arial'
    # plt.figure(figsize = (2.3,1.8))
    print(cors)
    print(cors2)
    print(cors3)

    fs = 8
    plt.figure(figsize=(10, 3))
    for j, pair in enumerate(pairs):
        # Subplot indices are 1-based; style the axes of this subplot, not
        # the ones that existed before it was created.
        ax = plt.subplot(1, len(pairs), j + 1)
        plt.title("%s vs %s" % pair)
        # With 200 kb bins, five bins make one megabase.
        plt.plot(x / 5., cors3[j], color="#E5A826", label="Iterative")
        plt.plot(x / 5., cors2[j], color="#28459A", label="Single")
        plt.plot(x / 5., cors[j], color="#E55726", label="Raw")
        plt.xlabel("Genomic Separation, MB", fontsize=8)
        plt.ylabel("Spearman correlation", fontsize=8)
        legend = plt.legend(prop={"size": 6}, loc=9, handlelength=2)
        legend.draw_frame(False)
        plt.ylim((0, 1))
        removeAxes(shift=0)
        for xlabel_i in ax.get_xticklabels():
            xlabel_i.set_fontsize(fs)
        for xlabel_i in ax.get_yticklabels():
            xlabel_i.set_fontsize(fs)
    plt.show()
def plot_Contact_drop_depending_on_distance_to_border(
        domains,
        distance=1000000,
        bands_binned=(2, 4, 6, 8, 10),
        colors=("red", "green", "blue", "black", "yellow")):
    """Plot mean contact value versus offset from domain anchors.

    For each band width ``b`` in ``bands_binned`` and each bin offset ``d``
    in ``[-distance/res, +distance/res]`` the heatmap value
    ``data[s, s + b]`` with ``s = anchor + d - b // 2`` is averaged over all
    domains, separately for three anchors: the domain start ("left"), the
    domain end ("right") and the domain midpoint ("center"). Positions that
    would fall outside the chromosome are skipped. One PNG per anchor type
    is written next to the heatmap file.

    Parameters:
        domains      -- iterable of mappings with "chrm", "start" and "end"
                        keys (coordinates in bp; "chrm" is looked up in
                        genome_db.label2idx).
        distance     -- maximal offset from the anchor, in bp.
        bands_binned -- band widths, in bins.
        colors       -- one plot color per entry of bands_binned.

    Relies on module-level names: bstrap, hmap, genome_db, domains_file,
    h5dict, binnedData, np, plt and datetime. Returns None.
    """
    if bstrap:
        print("This function is not designed for bstrap mode")
        print("Skipping function")
        return

    print("Contact drop depending on distance to border")
    raw_heatmap = h5dict.h5dict(hmap, mode='r')
    res = int(raw_heatmap['resolution'])
    print("resolution defined by heatmap:  %s" % res)
    BD = binnedData.binnedData(res, genome_db)
    print("%s  loading hmap" % datetime.datetime.now())
    BD.simpleLoad(hmap, 'heatmap')
    data = BD.dataDict["heatmap"]

    positions = ("left", "right", "center")
    max_shift = int(distance // res)

    # Everything that depends only on the domain is computed once instead of
    # once per (band, offset) combination.
    anchors = []
    for domain in domains:
        domain_length = domain["end"] - domain["start"]
        assert domain_length > 0
        chrm = genome_db.label2idx[domain["chrm"]]
        anchors.append({
            "chrm_len": genome_db.chrmLensBin[chrm],
            # first genome-wide bin index of this chromosome
            "offset": sum(genome_db.chrmLensBin[0:chrm]),
            "left": int(round(domain["start"] / float(res))),
            "right": int(round(domain["end"] / float(res))),
            "center": int(round(
                (domain["start"] + domain_length / 2.) / float(res))),
        })

    # result[band][shift][position] -> mean contact value (NaN if no domain
    # contributed a valid position).
    result = {}
    for band_binned in bands_binned:
        half_band = band_binned // 2
        result[band_binned] = {}
        for shift in range(-max_shift, max_shift + 1):
            averages = {}
            for pos in positions:
                values = []
                for anchor in anchors:
                    start = anchor[pos] + shift - half_band
                    # both ends of the band must stay inside the chromosome
                    if start >= 0 and start + band_binned < anchor["chrm_len"]:
                        start += anchor["offset"]
                        values.append(data[start, start + band_binned])
                averages[pos] = np.average(values) if values else np.nan
            result[band_binned][shift] = averages

    print("%s  Saving pictures" % datetime.datetime.now())
    for pos in positions:
        for ind, band_binned in enumerate(bands_binned):
            shifts = sorted(result[band_binned])
            X = [shift * res for shift in shifts]
            Y = [result[band_binned][shift][pos] for shift in shifts]
            plt.plot(X, Y, label="band=" + str(band_binned * res),
                     color=colors[ind], marker="o")
        # positional limits work across matplotlib versions (the ymin/ymax
        # keyword spelling is not accepted by newer releases)
        plt.ylim(0, 250)
        plt.legend(fontsize="xx-small")
        plt.savefig(hmap + "_" + domains_file.split("/")[-1] +
                    ".contact_drop_on_domains_border_" + pos + ".png",
                    dpi=300)
        plt.clf()
    print("%s  Done" % datetime.datetime.now())
fnames = ["fname1","fname2"] names = ["dataset 1", "dataset 2"] exportnames = ["fname1_corrected","fname2_corrected"] resolution = 500000 genFolder = "/folder/to/the/genome/files/and/gap.txt/file/according/to/the/mirnylib.genome/class" #for one file it would be fnames = ["myfile.hm"] resolution = 500000 names = ["whatever"] exportnames = ["filename_corrected"] genFolder = "genomeFolder" a = binnedData.binnedData(resolution,genFolder) #folder should be openable by mirnylib.genome for name,fname,exportname in zip(names,fnames,exportnames): a.simpleLoad(fname, name) a.removeDiagonal() #we never ever use diagonal a.removeBySequencedCount() # new filter: omit all bins with less than 0.5 coverage by sequenced bases (i.e. bases present in the genome) a.removePoorRegions(cutoff = 0.5, coverage=True) # remove .5% bins with the lowest number of records (i.e. non-zero entrees in the matrix) # This filter was updated to remove bins which have zero contacts and one PCR blowout. Those bins would have many reads, but all reads will be with one or few other bins. a.removePoorRegions(cutoff = 0.5, coverage=False) # standart filter. Cutoff reduced to 0.5 from 2. a.truncTrans() # remove PCR blowouts from trans data a.iterativeCorrectWithoutSS() #do iterative correction for name, exportname in names, exportnames:
def plotDiagonalCorrelation(resolution, filename1, filename2, experiment1,
                            experiment2, genome, mouse=False, **kwargs):
    """Correlation of diagonal bins - paper figure.

    Loads two heatmaps at the given resolution and correlates matching
    heatmap diagonals (offsets 2..49) of the two experiments at three
    stages: raw data, after one pass of iterative correction (M=1) and
    after twenty passes (M=20). The three curves are plotted against
    genomic separation, the figure is stored through the module-level
    ``pp`` object and then shown.

    Parameters:
        resolution   -- bin size in bp, passed to binnedData and used to
                        scale the x axis.
        filename1/2  -- heatmap files to load.
        experiment1/2 -- names under which the heatmaps are stored.
        genome       -- genome argument forwarded to binnedData.
        mouse, **kwargs -- accepted for call compatibility; unused here.

    Relies on module-level names: options, binnedData, cr, removeAxes, pp.
    """
    global pp
    if options.verbose:
        print("plotDiagonalCorrelation: res: %d file1: %s file2: %s exp1:%s exp2:%s gen:%s" % (
            resolution, filename1, filename2, experiment1, experiment2,
            genome))

    S = 50
    x = numpy.arange(2, S)
    Tanay = binnedData(resolution, genome)
    Tanay.simpleLoad(filename1, experiment1)
    Tanay.simpleLoad(filename2, experiment2)
    Tanay.removeDiagonal(1)
    Tanay.removePoorRegions()
    Tanay.removeZeros()
    pairs = [(experiment1, experiment2)]

    def diagonal_correlations():
        # One list per dataset pair; each holds cr(...)[0] for every
        # diagonal offset in x, using the current state of Tanay.dataDict.
        return [[cr(numpy.diagonal(Tanay.dataDict[first], offset),
                    numpy.diagonal(Tanay.dataDict[second], offset))[0]
                 for offset in x]
                for first, second in pairs]

    cors = diagonal_correlations()          # raw
    Tanay.iterativeCorrectWithoutSS(M=1)
    cors2 = diagonal_correlations()         # single correction pass
    Tanay.iterativeCorrectWithoutSS(M=20)
    cors3 = diagonal_correlations()         # iterated correction

    matplotlib.rcParams['font.sans-serif'] = 'Arial'

    print("Eigenvectors")
    print(cors)
    print(cors2)
    print(cors3)
    plt.figure(figsize=(8, 4))
    # Diagonal offset in bins -> genomic separation in megabases. The old
    # hard-coded "x / 5." was only correct for 200 kb bins.
    separation_mb = x * resolution / 1e6
    fs = 8
    for j, pair in enumerate(pairs):
        # Subplot indices are 1-based; index 0 is rejected by matplotlib.
        ax = plt.subplot(1, len(pairs), j + 1)
        plt.title("%s vs %s" % pair)
        plt.plot(separation_mb, cors3[j], color="#E5A826", label="Iterative")
        plt.plot(separation_mb, cors2[j], color="#28459A", label="Single")
        plt.plot(separation_mb, cors[j], color="#E55726", label="Raw")
        plt.xlabel("Genomic Separation, MB", fontsize=8)
        plt.ylabel("Spearman correlation", fontsize=8)
        legend = plt.legend(prop={"size": 6}, loc=9, handlelength=2)
        legend.draw_frame(False)
        plt.ylim((0, 1))
        # Resize tick labels of the subplot that was just drawn (previously
        # a stale axes grabbed before the subplots existed was modified).
        for tick_label in ax.get_xticklabels() + ax.get_yticklabels():
            tick_label.set_fontsize(fs)
        removeAxes(shift=0)

    # Save before showing: once a blocking show() returns the figure is gone
    # and a later savefig() would store an empty page.
    # NOTE(review): pp is assumed to be a PdfPages-like object - confirm.
    pp.savefig()
    plt.show()
#IMPORTANT: use iter-corrected heatmaps here. Otherwise, take care about adjustment of total reads number when calculating mask_hugeDifference heatmap_filepath1 = base_folder + 'heatmap-res-' + str( domain_res / 1000) + 'KB_' + base_filename1 + '.hdf5' heatmap_filepath2 = base_folder + 'heatmap-res-' + str( domain_res / 1000) + 'KB_' + base_filename2 + '.hdf5' out_heatmap_filepath2 = base_folder + 'heatmap-res-' + str( domain_res / 1000 ) + 'KB_' + base_filename2 + '_compressed_as_' + base_filename1 + '.hdf5' figure_path = base_folder + 'heatmap-res-' + str( domain_res / 1000 ) + 'KB_' + base_filename2 + '_compressed_as_' + base_filename1 + '.png' print "Loading file " + heatmap_filepath1 BD1 = binnedData.binnedData(domain_res, genome_db1) BD1.simpleLoad(heatmap_filepath1, 'HindIII_GM_1') print "Loading file " + heatmap_filepath2 BD2 = binnedData.binnedData(domain_res, genome_db2) BD2.simpleLoad(heatmap_filepath2, 'HindIII_GM_1') q1 = BD1.dataDict['HindIII_GM_1'] q2 = BD2.dataDict['HindIII_GM_1'] #-----DEBUG------ #print "Plotting contact matrix" #plotting.plot_matrix(np.log(q2)) #plt.subplots_adjust(bottom=0.15) #print "Saving figure "+figure_path+'tmp.png' #f = open(figure_path+'tmp.png', "wb")
from hiclib import binnedData fnames = ["fname1", "fname2"] names = ["dataset 1", "dataset 2"] exportnames = ["fname1_corrected", "fname2_corrected"] resolution = 500000 genFolder = "/folder/to/the/genome/files/and/gap.txt/file/according/to/the/mirnylib.genome/class" #for one file it would be fnames = ["myfile.hm"] resolution = 500000 names = ["whatever"] exportnames = ["filename_corrected"] genFolder = "genomeFolder" a = binnedData.binnedData( resolution, genFolder) #folder should be openable by mirnylib.genome for name, fname, exportname in zip(names, fnames, exportnames): a.simpleLoad(fname, name) a.removeDiagonal() #we never ever use diagonal a.removeBySequencedCount( ) # new filter: omit all bins with less than 0.5 coverage by sequenced bases (i.e. bases present in the genome) a.removePoorRegions( cutoff=0.5, coverage=True ) # remove .5% bins with the lowest number of records (i.e. non-zero entrees in the matrix) # This filter was updated to remove bins which have zero contacts and one PCR blowout. Those bins would have many reads, but all reads will be with one or few other bins. a.removePoorRegions(
# Genome object built from the chromosome-level GalGal5 FASTA folder; files
# are matched with the "%s.fna" template.
genome_db_chrmLevel = genome.Genome(
    "/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/",
    readChrms=[],
    chrmFileTemplate="%s.fna")

# Input heatmap and the path of the eigenvector output (heatmap path + '.eig').
hm_file = "/mnt/storage/home/vsfishman/HiC/tutorial_Fishman/chick/mapped-GalGal5filtered/GalGal5filteredChrmLevel/ChEF-all-HindIII-100k.hm"
f_out_path = hm_file + '.eig'
NumEigenvectors = 1  # number of eigenvectors to compute

# Read resolution from one of the datasets
resolution = extractResolutionFromFileName(hm_file)

# Define the binnedData object, load data
BD = binnedData(resolution, genome_db_chrmLevel)
BD.simpleLoad(hm_file, 'heatmap')

BD.removeDiagonal()

# Remove bins with less than half of a bin sequenced
BD.removeBySequencedCount(0.5)

# We'll do iterative correction and Eigenvector expansion on trans data only!
# We want to remove cis, because later we want to remove poor regions in trans
BD.removeCis()

# Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts)
# Do this before removing poor regions, because single blowouts may give
# lots of contacts to a region which does not have many contacts otherwise.
BD.truncTrans(high=0.0005)
def get_by_chr_E1(genome_db, resolution):
    """Compute per-chromosome first eigenvectors (E1) of the raw heatmap.

    The raw heatmap (module-level ``heatmap_filepath`` with a trailing
    ".IC" stripped, if present) is loaded into the module-level ``BD_raw``
    and filtered. For every chromosome the cis block is extracted and two
    eigenvectors are computed on the bins with non-zero coverage:

    * "Classic" -- first eigenvector of the cis block itself;
    * "OE"      -- first eigenvector of the observed-over-expected matrix.

    Each eigenvector is sign-flipped if its Spearman correlation with the
    GC track is negative, and stored as a chromosome-length array holding
    NaN at bins that were excluded. A chromosome is skipped (no entry in
    "Classic"/"OE") when five or fewer bins remain. "genome_wide_Classic"
    holds the matching slice of column 'f2' read from ``raw + ".eig"``.

    Parameters:
        genome_db  -- genome object providing chrmCount, chrmStartsBinCont
                      and chrmEndsBinCont.
        resolution -- bin size passed to binnedData.

    Returns:
        dict with keys "OE", "Classic" and "genome_wide_Classic", each
        mapping chromosome index -> array.

    Relies on module-level names: heatmap_filepath, binnedData, EIG,
    observedOverExpected, spearmanr and np.
    """
    global BD_raw

    def gc_oriented_e1(matrix, gc_values):
        # First eigenvector of matrix, oriented to correlate positively
        # with the GC content of the same bins.
        eigenvectors, _eigenvalues = EIG(matrix, numPCs=1)
        e1 = eigenvectors[0]
        if spearmanr(e1, gc_values)[0] < 0:
            e1 = -e1
        return e1

    def spread_over_bins(values, keep):
        # Chromosome-length array: values at kept bins, NaN elsewhere.
        track = np.empty(shape=(len(keep), )) * np.nan
        track[keep] = values
        return track

    if heatmap_filepath.endswith(".IC"):
        raw = heatmap_filepath[:-3]
    else:
        raw = heatmap_filepath
    print("Using raw heatmap  " + raw)

    BD_raw = binnedData.binnedData(resolution, genome_db)
    BD_raw.simpleLoad(raw, 'heatmap')
    BD_raw.removeDiagonal()

    # Remove bins with less than half of a bin sequenced
    BD_raw.removeBySequencedCount(0.5)

    # We'll do iterative correction and Eigenvector expansion on trans data only!
    # We want to remove cis, because later we want to remove poor regions in trans
    BD_raw.removeCis()

    # Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts)
    # Do this before removing poor regions, because single blowouts may give
    # lots of contacts to a region which does not have many contacts otherwise.
    BD_raw.truncTrans(high=0.0005)

    # Remove 1% of regions with low coverage
    BD_raw.removePoorRegions(cutoff=1)

    # Fake cis counts. Data gets iteratively corrected during this process...
    BD_raw.fakeCis()

    # Remove bins with zero counts for eigenvector analysis --> This will be
    # done for each chromosome in the for loop
    # BD.removeZeros()

    # Perform eigenvector expansion.
    result = {"OE": {}, "Classic": {}, "genome_wide_Classic": {}}
    genom_wide_E1 = np.genfromtxt(raw + ".eig", dtype=None)['f2']
    for chrom in range(genome_db.chrmCount):
        st = genome_db.chrmStartsBinCont[chrom]
        end = genome_db.chrmEndsBinCont[chrom]
        gc = BD_raw.trackDict["GC"][st:end]
        cur = BD_raw.dataDict['heatmap'][st:end, st:end]
        mask = np.sum(cur, axis=0) > 0
        if mask.sum() > 5:
            cur = cur[mask][:, mask]
            result["Classic"][chrom] = spread_over_bins(
                gc_oriented_e1(cur, gc[mask]), mask)

            oe = observedOverExpected(cur)
            oe_keep = np.sum(oe, axis=0) > 0
            if oe_keep.sum() > 5:
                oe = oe[oe_keep][:, oe_keep]
                # oe_keep indexes the already-filtered matrix; translate it
                # back to chromosome bin coordinates so that the GC values
                # and the output array stay aligned with the Classic track.
                oe_mask = mask.copy()
                oe_mask[mask] = oe_keep
                result["OE"][chrom] = spread_over_bins(
                    gc_oriented_e1(oe, gc[oe_mask]), oe_mask)
        result["genome_wide_Classic"][chrom] = genom_wide_E1[st:end]
    return result
########################### #1. Parse contig_names_to_id_file LACHES_index_converter = {} with open(contig_names_to_id_file) as f: for line in f: line = line.strip().split() LACHES_index_converter[line[1]] = line[0] out_file = open(out_file, "w") out_file.write(header_string) raw_heatmap = h5dict.h5dict(basefolder + filename, mode='r') resolution = int(raw_heatmap['resolution']) BD = binnedData.binnedData(resolution, genome_db) BD.simpleLoad(basefolder + filename, 'HindIII') q = BD.dataDict['HindIII'] interchr_contacts = np.zeros(shape=(genome_db.chrmCount, genome_db.chrmCount)) ############################ #genome_db.chrmCount=100 ############################ zero_number_of_contacts, nonzero_number_of_contacts = [], [] for chr1 in xrange(genome_db.chrmCount): for chr2 in xrange(chr1 + 1, genome_db.chrmCount): Ncontacts = q[ genome_db.chrmStartsBinCont[chr1]:genome_db.chrmEndsBinCont[chr1], genome_db.chrmStartsBinCont[chr2]:genome_db.chrmEndsBinCont[chr2]]
def step3(hiclib_path, sraid, res=1000000):
    '''
    3. Filter and iteratively correct heatmaps.
    http://mirnylab.bitbucket.org/hiclib/tutorial/03_heatmap_processing.html

    Reads "<sraid>_map-res<res/1000>k.hdf5" from the working directory and
    writes, with the same prefix:
      * ".pdf"             -- log-scale picture of the raw heatmap
      * "-ic.hdf5"         -- iteratively corrected heatmap
      * "-ic.pdf"          -- log-scale picture of the corrected heatmap
      * "-ic-bias.txt"     -- one line per bin: chromosome, bin start,
                              bin start + res, bias (tab separated)

    Parameters:
        hiclib_path -- folder containing "fasta/hg19".
        sraid       -- dataset identifier used as the file name prefix.
        res         -- heatmap resolution in bp used in the file names.
    '''
    import matplotlib.pyplot as plt
    import numpy as np
    from mirnylib import genome
    from mirnylib import h5dict
    from mirnylib import plotting
    from hiclib import binnedData

    # Common prefix of all input/output files. Floor division keeps the
    # resolution label an integer number of kb.
    prefix = sraid + '_map-res%sk' % (res // 1000)

    genome_db = genome.Genome(hiclib_path + '/fasta/hg19',
                              readChrms=['#', 'X'])

    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(prefix + '.hdf5', mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(prefix + '.hdf5', 'DataName')

    # Plot the heatmap directly.
    plotting.plot_matrix(np.log(BD.dataDict['DataName']))
    plt.savefig(prefix + '.pdf')
    plt.clf()

    # Remove the contacts between loci located within the same bin.
    BD.removeDiagonal()

    # Remove bins with less than half of a bin sequenced.
    BD.removeBySequencedCount(0.5)

    # Remove 1% of regions with low coverage.
    BD.removePoorRegions(cutoff=1)

    # Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts).
    BD.truncTrans(high=0.0005)

    # Perform iterative correction.
    BD.iterativeCorrectWithoutSS()

    # Save the iteratively corrected heatmap.
    BD.export('DataName', prefix + '-ic.hdf5')

    # Plot the heatmap directly.
    plotting.plot_matrix(np.log(BD.dataDict['DataName']))
    plt.savefig(prefix + '-ic.pdf')
    plt.clf()

    # Save Bias. The context manager closes the file even if a write fails.
    with open(prefix + '-ic-bias.txt', 'w') as outfile:
        per_bin = zip(BD.chromosomeIndex, BD.positionIndex,
                      BD.biasDict['DataName'])
        for chrom_idx, posi, bias in per_bin:
            chro = BD.genome.idx2label[chrom_idx]
            outfile.write("chr%s\t%s\t%s\t%s\n" %
                          (chro, posi, posi + res, bias))