def buildAllHeatmap(self, resolution, countDiagonalReads='Once'):
    """
    Build a symmetric all-by-all heatmap of read counts.

    Every read is assigned to a pair of genome-wide bins
    (``chrmStartsBinCont[chrm] + mid // resolution`` for each side), the
    pairs are counted with ``np.bincount`` and the resulting matrix is
    symmetrized in place.

    Parameters
    ----------
    resolution : int
        Bin size; passed to ``self.genome.setResolution``.
    countDiagonalReads : "once" or "twice"
        How many times to count reads in the diagonal bin
        (case-insensitive).

    Returns
    -------
    counts : 2D integer array of shape (numBins, numBins)

    Raises
    ------
    ValueError
        If ``countDiagonalReads`` is not "once"/"twice" (checked before any
        work is done), or if a read maps beyond the last bin of the genome.
        ``ValueError`` is a ``StandardError`` subclass on Python 2, so
        callers catching the old exception type keep working.
    """
    diagonalMode = countDiagonalReads.lower()
    if diagonalMode not in ("once", "twice"):
        raise ValueError("Bad value for countDiagonalReads")

    # 8 bytes per record + heatmap
    self.genome.setResolution(resolution)
    numBins = self.genome.numBins

    # label = flat index (bin1 * numBins + bin2) of the cell hit by each read
    label = self.genome.chrmStartsBinCont[self.chrms1]
    label = np.asarray(label, dtype="int64")
    # explicit floor division keeps the labels integral on Python 2 and 3
    label += (self.mids1 // resolution).astype(np.int64)
    label *= numBins
    label += self.genome.chrmStartsBinCont[self.chrms2]
    label += (self.mids2 // resolution).astype(np.int64)

    counts = np.bincount(label, minlength=numBins ** 2)
    if len(counts) > numBins ** 2:
        raise ValueError("\nHeatMap exceed length of the genome!")
    counts = counts.reshape(numBins, numBins)

    # In-place symmetrization (avoids allocating a second numBins x numBins
    # array); the diagonal ends up doubled.
    for i in range(len(counts)):
        counts[i, i:] += counts[i:, i]
        counts[i:, i] = counts[i, i:]

    if diagonalMode == "once":
        # undo the doubling of the diagonal introduced by the symmetrization
        fillDiagonal(counts, np.diag(counts) // 2)
    return counts
def buildAllHeatmap(self, resolution):
    """
    Build a symmetric all-by-all heatmap, processing the reads in chunks.

    Reads are fetched in chunks of 30,000,000 records through
    ``self._getChunks`` / ``self._getVector``; each chunk is binned with
    ``np.bincount`` and accumulated into one heatmap, which is then
    symmetrized in place with diagonal reads counted once.

    Parameters
    ----------
    resolution : int
        Bin size; passed to ``self.genome.setResolution``.

    Returns
    -------
    heatmap : 2D integer array of shape (numBins, numBins)
        All zeros when there are no chunks to process.

    Raises
    ------
    ValueError
        If a read maps beyond the last bin of the genome.
    """
    # These do not change between chunks, so compute them once.
    self.genome.setResolution(resolution)
    numBins = self.genome.numBins
    chrmStarts = self.genome.chrmStartsBinCont

    heatmap = None
    for start, end in self._getChunks(30000000):
        # 8 bytes per record + heatmap
        label = chrmStarts[self._getVector("chrms1", start, end)]
        label = np.asarray(label, dtype="int64")
        label += (self._getVector("mids1", start, end)
                  // resolution).astype(np.int64)
        label *= numBins
        label += chrmStarts[self._getVector("chrms2", start, end)]
        label += (self._getVector("mids2", start, end)
                  // resolution).astype(np.int64)

        counts = np.bincount(label, minlength=numBins ** 2)
        if len(counts) > numBins ** 2:
            raise ValueError("\nHeatMap exceed length of the genome!")
        counts = counts.reshape(numBins, numBins)

        # The first chunk becomes the accumulator itself, so that a
        # single-chunk dataset never holds two full heatmaps in memory.
        if heatmap is None:
            heatmap = counts
        else:
            heatmap += counts

    if heatmap is None:
        # no chunks at all: an empty dataset yields an empty heatmap
        heatmap = np.zeros((numBins, numBins), dtype=np.intp)

    # In-place symmetrization; the diagonal ends up doubled.
    for i in range(len(heatmap)):
        heatmap[i, i:] += heatmap[i:, i]
        heatmap[i:, i] = heatmap[i, i:]

    # count diagonal reads once
    diag = np.diag(heatmap)
    fillDiagonal(heatmap, diag / 2)
    return heatmap
def setMatrilocScaling(inMatriloc, alpha=0):
    """
    Return a rescaled, symmetrized copy of a square matrix.

    The scaling ``Pc`` is obtained from
    ``getLogBinnedScaling(copy, isCircular=True)``. Each diagonal with
    offset ``i >= 0`` of the copy is divided by ``Pc[i]`` and by
    ``i ** (-alpha)``. The result is the upper triangle of the rescaled
    copy plus its transpose, so the main diagonal is counted twice. The
    input matrix is not modified.

    NOTE(review): at ``i == 0`` the factor is ``0 ** (-alpha)``, which is
    only well defined for ``alpha == 0`` - confirm callers never pass
    another value.
    """
    scaled = inMatriloc.copy()
    size = len(scaled)
    # the second return value (bin mids) is not used here
    scaling, _ = getLogBinnedScaling(scaled, isCircular=True)

    for offset in range(size):
        rescaled = (np.diagonal(scaled, offset) / scaling[offset]
                    / (offset ** (-alpha)))
        fillDiagonal(scaled, rescaled, offset)

    upper = np.triu(scaled)
    return upper + upper.T
def saveByChromosomeHeatmap(self, filename, resolution, gInfo,
                            includeTrans=False):
    """
    Save chromosome-by-chromosome heatmaps to an h5dict.

    For every chromosome the reads whose first side lies on it are located
    with ``h5dictBinarySearch`` over the "chrms1"/"cuts1" datasets
    (presumably the stored reads are sorted by those two columns - verify
    against the code that writes them), binned at ``resolution`` and
    counted with ``np.bincount``. Each map is stored under the key
    ``"<chrom> <chrom2>"``. Cis maps are symmetrized and their diagonal is
    halved so that diagonal reads are counted once.

    Parameters
    ----------
    filename : str
        Filename of the h5dict with the output.
    resolution : int
        Resolution (bin size) of the saved heatmaps; also stored under the
        'resolution' key.
    gInfo : object
        Stored as-is under the 'genomeInformation' key.
    includeTrans : bool, optional
        Also build maps between ``chrom`` and every later chromosome
        (default: False). Interpreted by truthiness.
    """
    self.genome.setResolution(resolution)
    mydict = h5dict(filename)
    chrmCount = self.genome.chrmCount
    chrmLensBin = self.genome.chrmLensBin

    # The dataset handles do not depend on the chromosome: fetch them once.
    c1 = self.h5dict.get_dataset("chrms1")
    p1 = self.h5dict.get_dataset("cuts1")

    for chrom in range(chrmCount):
        low = h5dictBinarySearch(c1, p1, (chrom, -1), "left")
        high = h5dictBinarySearch(c1, p1, (chrom, 999999999), "right")

        chr1 = self._getVector("chrms1", low, high)
        chr2 = self._getVector("chrms2", low, high)
        pos1 = np.array(self._getVector("mids1", low, high) // resolution,
                        dtype=np.int32)
        pos2 = np.array(self._getVector("mids2", low, high) // resolution,
                        dtype=np.int32)

        # sanity check: the binary search selected reads of this chromosome
        assert (chr1 == chrom).all()

        # sort by partner chromosome so each partner is a contiguous slice
        args = np.argsort(chr2)
        chr2 = chr2[args]
        pos1 = pos1[args]
        pos2 = pos2[args]

        # only the cis map unless trans maps were requested
        partners = range(chrom, chrmCount) if includeTrans else (chrom,)
        for chrom2 in partners:
            start = np.searchsorted(chr2, chrom2, "left")
            end = np.searchsorted(chr2, chrom2, "right")
            cur1 = pos1[start:end]
            cur2 = pos2[start:end]

            # flat index bin1 * (bins on chrom2) + bin2, in int64 to avoid
            # overflowing the int32 positions
            label = np.array(cur1, "int64")
            label *= chrmLensBin[chrom2]
            label += cur2
            maxLabel = chrmLensBin[chrom] * chrmLensBin[chrom2]
            counts = np.bincount(label, minlength=maxLabel)
            mymap = counts.reshape((chrmLensBin[chrom], -1))

            if chrom == chrom2:
                mymap = mymap + mymap.T
                # the symmetrization doubled the diagonal; count it once
                fillDiagonal(mymap, np.diag(mymap).copy() / 2)
            mydict["%d %d" % (chrom, chrom2)] = mymap

    mydict['resolution'] = resolution
    mydict['genomeInformation'] = gInfo
def saveAllDatasets():
    """
    An example which saves the heatmap in different colormaps.
    It was used to choose the colormap out of the ones we created.

    For every (name, colormap) pair and every dataset, the corrected
    heatmap is loaded from ``data/dumped/<dataset>-10k_overlap.hm_corrected``,
    its main and first off-diagonals are overwritten with values derived
    from the mean of the second diagonal, and the image is saved as a PDF.

    Relies on the module-level names ``names`` and ``datasets`` (not
    defined in this function).
    """
    if not os.path.exists("savedHeatmaps"):
        os.mkdir("savedHeatmaps")
    # NOTE(review): the directory above is created, but the figures are
    # written to the absolute path used in plt.savefig below - confirm
    # which location is intended.

    # A missing comma used to fuse the last two names into "bluesacidblues".
    heatmaps = ["jet", "fall", "blues", "acidblues"]
    tickPositions = [0, 100, 200, 300, 400]
    tickLabels = ["0", "1Mb", "2Mb", "3Mb", "4Mb"]

    # list(...) keeps the reversed slice working when zip returns an iterator
    for name, heatmap in list(zip(names, heatmaps))[::-1]:
        for dataset in datasets:
            hm = "data/dumped/%s-10k_overlap.hm_corrected" % dataset
            plt.figure()
            data = np.loadtxt(hm)

            # The second diagonal is not touched by the fills below, so its
            # mean can be computed once.
            secondDiagMean = np.mean(np.diag(data, 2))
            fillDiagonal(data, secondDiagMean * 1.1, 1)
            fillDiagonal(data, secondDiagMean * 1.1, -1)
            fillDiagonal(data, secondDiagMean * 1.2, 0)

            plt.imshow(data, origin="lower", cmap=heatmap,
                       interpolation="none", vmin=0, vmax=0.035)
            plt.xticks(tickPositions, tickLabels)
            plt.yticks(tickPositions, tickLabels)
            plt.colorbar(orientation="vertical", ticks=[0, 0.01, 0.02, 0.03])

            ax = plt.gca()
            tickLines = ax.get_xticklines() + ax.get_yticklines()
            for i, line in enumerate(tickLines):
                if i % 2 == 1:  # odd indices
                    line.set_visible(False)

            plt.savefig(
                "/home/magus/Dropbox/Caulobacter-chromosome/heatmapsAllFromMax/%s_%s.pdf"
                % (dataset, name))
            # release the figure; otherwise every saved plot stays in memory
            plt.close()
def saveAllDatasets():
    """
    An example which saves the heatmap in different colormaps.
    It was used to choose the colormap out of the ones we created.

    For every (name, colormap) pair and every dataset, the corrected
    heatmap is loaded from ``data/dumped/<dataset>-10k_overlap.hm_corrected``,
    its main and first off-diagonals are overwritten with values derived
    from the mean of the second diagonal, and the image is saved as a PDF.

    Relies on the module-level names ``names`` and ``datasets`` (not
    defined in this function).
    """
    if not os.path.exists("savedHeatmaps"):
        os.mkdir("savedHeatmaps")
    # NOTE(review): the directory above is created, but the figures are
    # written to the absolute path used in plt.savefig below - confirm
    # which location is intended.

    # A missing comma used to fuse the last two names into "bluesacidblues".
    heatmaps = ["jet", "fall", "blues", "acidblues"]
    tickPositions = [0, 100, 200, 300, 400]
    tickLabels = ["0", "1Mb", "2Mb", "3Mb", "4Mb"]

    # list(...) keeps the reversed slice working when zip returns an iterator
    for name, heatmap in list(zip(names, heatmaps))[::-1]:
        for dataset in datasets:
            hm = "data/dumped/%s-10k_overlap.hm_corrected" % dataset
            plt.figure()
            data = np.loadtxt(hm)

            # The second diagonal is not touched by the fills below, so its
            # mean can be computed once.
            secondDiagMean = np.mean(np.diag(data, 2))
            fillDiagonal(data, secondDiagMean * 1.1, 1)
            fillDiagonal(data, secondDiagMean * 1.1, -1)
            fillDiagonal(data, secondDiagMean * 1.2, 0)

            plt.imshow(data, origin="lower", cmap=heatmap,
                       interpolation="none", vmin=0, vmax=0.035)
            plt.xticks(tickPositions, tickLabels)
            plt.yticks(tickPositions, tickLabels)
            plt.colorbar(orientation="vertical", ticks=[0, 0.01, 0.02, 0.03])

            ax = plt.gca()
            tickLines = ax.get_xticklines() + ax.get_yticklines()
            for i, line in enumerate(tickLines):
                if i % 2 == 1:  # odd indices
                    line.set_visible(False)

            plt.savefig(
                "/home/magus/Dropbox/Caulobacter-chromosome/heatmapsAllFromMax/%s_%s.pdf"
                % (dataset, name))
            # release the figure; otherwise every saved plot stays in memory
            plt.close()
def saveByChromosomeHeatmap(self, filename, resolution = 40000,
                            includeTrans = False,
                            countDiagonalReads = "Once"):
    """
    Saves chromosome by chromosome heatmaps to h5dict.

    This method is not as memory demanding as saving the all x all heatmap.

    Keys of the h5dict are of the format ["1 1"], where chromosomes are
    zero-based, and there is one space between the numbers. The resolution
    is stored under the 'resolution' key.

    Parameters
    ----------
    filename : str
        Filename of the h5dict with the output
    resolution : int
        Resolution to save heatmaps
    includeTrans : bool, optional
        Build inter-chromosomal heatmaps (default: False)
    countDiagonalReads : "once" or "twice"
        How many times to count reads in the diagonal bin

    Raises
    ------
    ValueError
        If countDiagonalReads is neither "once" nor "twice"
        (case-insensitive).
    """
    if countDiagonalReads.lower() not in ["once", "twice"]:
        raise ValueError("Bad value for countDiagonalReads")
    self.genome.setResolution(resolution)

    # bin index of each read side, as int32
    pos1 = self.evaluate(
        "a = np.array(mids1 / {res}, dtype = 'int32')".format(res=resolution),
        "mids1")
    pos2 = self.evaluate(
        "a = np.array(mids2 / {res}, dtype = 'int32')".format(res=resolution),
        "mids2")
    chr1 = self.chrms1
    chr2 = self.chrms2
    # 13 bytes per read up to now, 16 total

    mydict = h5dict(filename)
    # the flag is compared against True / False exactly as given
    transRequested = includeTrans == True
    transDisabled = includeTrans == False

    for chrom in xrange(self.genome.chrmCount):
        firstOnChrom = (chr1 == chrom)
        secondOnChrom = (chr2 == chrom)
        if transRequested:
            # boolean "+" acts as OR: either side on this chromosome
            selected = firstOnChrom + secondOnChrom
        else:
            # boolean "*" acts as AND: both sides on this chromosome
            selected = firstOnChrom * secondOnChrom

        # chromosomes and positions of the selected reads (copies)
        c1 = chr1[selected]
        c2 = chr2[selected]
        p1 = pos1[selected]
        p2 = pos2[selected]

        if transRequested:
            # swap sides where only the second side is on `chrom`, so that
            # the other chromosome always ends up in c2
            swap = (c2 == chrom) * (c1 != chrom)
            newC1, newC2 = c2[swap].copy(), c1[swap].copy()
            newP1, newP2 = p2[swap].copy(), p1[swap].copy()
            c1[swap] = newC1
            c2[swap] = newC2
            p1[swap] = newP1
            p2[swap] = newP2
        del c1  # c1 is not needed any more

        # sort by partner chromosome so each partner is a contiguous slice
        order = np.argsort(c2)
        c2 = c2[order]
        p1 = p1[order]
        p2 = p2[order]

        for chrom2 in xrange(chrom, self.genome.chrmCount):
            if transDisabled and (chrom2 != chrom):
                continue
            first = np.searchsorted(c2, chrom2, "left")
            last = np.searchsorted(c2, chrom2, "right")
            bins1 = p1[first:last]
            bins2 = p2[first:last]

            # flat index: bin1 * (bins on chrom2) + bin2
            label = np.asarray(bins1, "int64")
            label *= self.genome.chrmLensBin[chrom2]
            label += bins2
            maxLabel = (self.genome.chrmLensBin[chrom]
                        * self.genome.chrmLensBin[chrom2])
            counts = np.bincount(label, minlength = maxLabel)
            assert len(counts) == maxLabel
            mymap = counts.reshape((self.genome.chrmLensBin[chrom], -1))

            if chrom == chrom2:
                mymap = mymap + mymap.T
                if countDiagonalReads.lower() == "once":
                    # the symmetrization doubled the diagonal
                    fillDiagonal(mymap, np.diag(mymap).copy() / 2)
            mydict["%d %d" % (chrom, chrom2)] = mymap

    mydict['resolution'] = resolution
    return
def saveHiResHeatmapWithOverlaps(self, filename, resolution=10000,
                                 countDiagonalReads="Once", maxBinSpawn=10,
                                 chromosomes="all"):
    """
    Creates within-chromosome heatmaps at very high resolution,
    assigning each fragment to all the bins it overlaps with,
    proportional to the area of overlaps.

    Each heatmap is stored in the h5dict ``filename`` under the key
    "<chrom> <chrom>"; the resolution is stored under 'resolution'.
    Chromosomes without any cis read are skipped.

    Parameters
    ----------
    filename : str
        Filename of the h5dict with the output.
    resolution : int or str
        Resolution of a heatmap.
    countDiagonalReads : "once" or "twice"
        How many times to count reads in the diagonal bin
    maxBinSpawn : int, optional, not more than 10
        Discard read if it spawns more than maxBinSpawn bins
    chromosomes : "all" or iterable of int, optional
        Chromosomes to process (default: all chromosomes of the genome).

    Raises
    ------
    ValueError
        If countDiagonalReads is neither "once" nor "twice"
        (case-insensitive). This is checked before any work is done.
    """
    # Fail fast: this used to be detected only after the first heatmap had
    # been fully computed.
    diagonalMode = countDiagonalReads.lower()
    if diagonalMode not in ("once", "twice"):
        raise ValueError("Bad value for countDiagonalReads")

    import gc
    from scipy import weave

    tosave = h5dict(filename)
    self.genome.setResolution(resolution)
    if chromosomes == "all":
        chromosomes = range(self.genome.chrmCount)

    # NOTE(review): the per-read scratch vectors below hold 100 entries and
    # heatmap indices are not range-checked - presumably fragments never
    # extend past the chromosome end and maxBinSpawn stays small; confirm.
    code = """
    double vector1[100];
    double vector2[100];

    for (int readNum = 0;  readNum < N; readNum++)
    {
        for (int i=0; i<10; i++)
        {
            vector1[i] = 0;
            vector2[i] = 0;
        }

        double l1 = low1[readNum];
        double l2 = low2[readNum];
        double h1 = high1[readNum];
        double h2 = high2[readNum];

        if ((h1 - l1) > maxBinSpawn) continue;
        if ((h2 - l2) > maxBinSpawn) continue;

        int binNum1 = ceil(h1) - floor(l1);
        int binNum2 = ceil(h2) - floor(l2);
        double binLen1 = h1 - l1;
        double binLen2 = h2 - l2;

        int b1 = floor(l1);
        int b2 = floor(l2);

        if (binNum1 == 1) vector1[0] = 1.;
        else
        {
            vector1[0] = (ceil(l1 + 0.00001) - l1) / binLen1;
            for (int t = 1; t< binNum1 - 1; t++)
            {vector1[t] = 1. / binLen1;}
            vector1[binNum1 - 1] = (h1 - floor(h1)) / binLen1;
        }

        if (binNum2 == 1) vector2[0] = 1.;
        else
        {
            vector2[0] = (ceil(l2 + 0.0001) - l2) / binLen2;
            for (int t = 1; t< binNum2 - 1; t++)
            {vector2[t] = 1. / binLen2;}
            vector2[binNum2 - 1] = (h2 - floor(h2)) / binLen2;
        }

        for (int i = 0; i< binNum1; i++)
        {
            for (int j = 0; j < binNum2; j++)
            {
                heatmap[(b1 + i) * heatmapSize + b2 + j] += vector1[i] * vector2[j];
            }
        }
    }
    """

    for chrom in chromosomes:
        mask = (self.chrms1 == chrom) * (self.chrms2 == chrom)
        if mask.sum() == 0:
            continue

        # Fragment extents in (fractional) bin units. The names low1, high1,
        # low2, high2, N, heatmap, heatmapSize and maxBinSpawn are looked up
        # by weave.inline by name and must not be renamed.
        low1 = (self.mids1[mask] - self.fraglens1[mask] / 2) / float(resolution)
        high1 = (self.mids1[mask] + self.fraglens1[mask] / 2) / float(resolution)
        low2 = (self.mids2[mask] - self.fraglens2[mask] / 2) / float(resolution)
        high2 = (self.mids2[mask] + self.fraglens2[mask] / 2) / float(resolution)
        del mask

        N = len(low1)
        heatmapSize = int(self.genome.chrmLensBin[chrom])
        heatmap = np.zeros((heatmapSize, heatmapSize),
                           dtype="float64", order="C")

        weave.inline(code,
                     ['low1', "high1", "low2", "high2", "N", "heatmap",
                      "maxBinSpawn", "heatmapSize"],
                     extra_compile_args=['-march=native -O3 '],
                     support_code=r"""
                     #include <stdio.h>
                     #include <math.h>""")
        del high1, low1, high2, low2

        # In-place symmetrization; the diagonal ends up doubled.
        for i in range(len(heatmap)):
            heatmap[i, i:] += heatmap[i:, i]
            heatmap[i:, i] = heatmap[i, i:]

        if diagonalMode == "once":
            diag = np.diag(heatmap).copy()
            fillDiagonal(heatmap, diag / 2)
            del diag

        tosave["{0} {0}".format(chrom)] = heatmap
        tosave.flush()
        del heatmap
        weave.inline("")  # to release all buffers of weave.inline
        gc.collect()

    tosave['resolution'] = resolution
def cis_eig(A, k=3, robust=True, gc=None, classic=False):
    """
    Compute compartment eigenvector on a cis matrix

    Parameters
    ----------
    A : 2D array
        balanced cis contact matrix; non-finite entries are treated as 0
        (the input itself is not modified)
    k : int
        number of eigenvectors to compute; default = 3
    robust : bool
        Clip at the 99.9th percentile and overwrite the main and first
        off-diagonals with twice the mean of the second diagonal
    gc : 1D array, optional
        GC content per bin for choosing and orienting the primary compartment
        eigenvector; not performed if no array is provided
    classic : bool
        Do it old-school: iteratively correct the observed/expected matrix
        and let ``numutils.EIG`` use its default mean handling

    Returns
    -------
    eigenvalues, eigenvectors
        Always in this order. When the matrix is too small (at most 5 bins,
        or at most 5 bins with a positive column sum) or the classic
        correction yields non-finite values, both are filled with NaN:
        ``k`` eigenvalues and ``k`` eigenvectors of length ``A.shape[0]``.
    """
    A = np.array(A)
    A[~np.isfinite(A)] = 0
    n_bins = A.shape[0]

    def _nan_result():
        # placeholder with the same (eigenvalues, eigenvectors) layout as
        # the regular result
        return (
            np.array([np.nan for _ in range(k)]),
            np.array([np.ones(n_bins) * np.nan for _ in range(k)]),
        )

    # bins with at least some signal
    mask = A.sum(axis=0) > 0

    if n_bins <= 5 or mask.sum() <= 5:
        return _nan_result()

    if robust:
        A = np.clip(A, 0, np.percentile(A, 99.9))
        fill_value = np.mean(np.diag(A, 2) * 2)
        for d in [-1, 0, 1]:
            numutils.fillDiagonal(A, fill_value, d)
        # filling the diagonals may have put signal into empty bins
        A[~mask, :] = 0
        A[:, ~mask] = 0

    OE = numutils.observedOverExpected(A[mask, :][:, mask])

    if robust:
        OE = np.clip(OE, 0, np.percentile(OE, 99.9))

    if classic:
        OE = numutils.iterativeCorrection(OE)[0]
        if (~np.isfinite(OE)).sum() > 0:
            # This path used to return (eigenvectors, eigenvalues), i.e. in
            # the opposite order to every other return of this function.
            return _nan_result()
        # mean-centered (subtract mean)
        eigvecs_compressed, eigvals = numutils.EIG(OE, k)
    else:
        eigvecs_compressed, eigvals = numutils.EIG(
            (OE - 1.0), k, subtractMean=False, divideByMean=False)

    # Restore full eigs: masked-out bins stay NaN
    eigvecs = []
    for i in range(k):
        v = np.ones(mask.shape[0]) * np.nan
        v[mask] = eigvecs_compressed[i]
        eigvecs.append(v)
    eigvecs = np.array(eigvecs)

    # Orient and reorder
    eigvals, eigvecs = _orient_eigs(eigvals, eigvecs, gc)

    return eigvals, eigvecs
def Generate_one_chromosome_file(chrNumb):
    """
    Write restriction-fragment contact files for one chromosome.

    Loads the fragment-level dataset for ``chrNumb`` (it is created and
    filtered from ``maped_reads_filepath`` when its hdf5 file does not
    exist yet), keeps only reads with both sides on ``chrNumb``, counts
    contacts between HindIII restriction fragments and writes two
    tab-separated text files under ``base_out_folder + "fitHiC/i_files/"``:

    * ``<...>_chr<N>.contacts.zip``: one line per fragment pair (i < j)
      with a non-zero count: chr, mid of i, chr, mid of j, count.
    * ``<...>_chr<N>.fragments.zip``: one line per fragment that appears in
      the contacts file: chr, 0, mid, total count of its row, 1.

    Despite the ".zip" extension both files are written as plain text.

    Relies on the module-level names ``base_out_folder``, ``base_filename``,
    ``genome_db`` and ``maped_reads_filepath``.

    Parameters
    ----------
    chrNumb : int
        Index of the chromosome to process.

    Raises
    ------
    ValueError
        If the per-chromosome fragment indices are inconsistent (different
        lengths, negative, or beyond the number of restriction sites).
        These checks used to ``raise`` plain strings, which is itself a
        TypeError.
    """
    def report(*parts):
        # Same text as the original print statements: the items joined by
        # single spaces. Works on Python 2 and Python 3.
        print(" ".join(str(part) for part in parts))

    o_file = base_out_folder + "fitHiC/i_files/" + base_filename + ".fithic"
    fragment_dataset_filename = (
        base_out_folder + "fitHiC/i_files/" + 'fragment_dataset_'
        + base_filename + '_chr' + str(chrNumb) + '.hdf5')

    if not os.path.isfile(fragment_dataset_filename):
        fragments = fragmentHiC.HiCdataset(
            filename=fragment_dataset_filename, genome=genome_db,
            maximumMoleculeLength=500, mode='w')
        fragments.parseInputData(dictLike=maped_reads_filepath, removeSS=True)
        fragments.filterRsiteStart(offset=5)
        fragments.filterDuplicates()
        fragments.filterLarge()
        fragments.filterExtreme(cutH=0.005, cutL=0)
    else:
        fragments = fragmentHiC.HiCdataset(
            filename=fragment_dataset_filename, genome=genome_db,
            maximumMoleculeLength=500, mode='a')

    report("Filtering, leaving only chr ", genome_db.idx2label[chrNumb])
    # leave only reads with both sides on chrNumb
    fragments.maskFilter((fragments.chrms1 == chrNumb))
    fragments.maskFilter((fragments.chrms2 == chrNumb))

    report("Seting RE")
    # Setting info about restriction enzyme, calculating absolute indexes
    fragments.setRfragAbsIdxs('HindIII')

    numBins = len(fragments.genome.rsites[chrNumb])
    report("Total numBins (RSites) on chr ", genome_db.idx2label[chrNumb],
           " = ", numBins)

    # fragment indices relative to the start of this chromosome
    chromOffset = fragments.genome.chrmStartsRfragCont[chrNumb]
    rfragAbsIdxs1 = fragments.rfragAbsIdxs1 - chromOffset
    rfragAbsIdxs2 = fragments.rfragAbsIdxs2 - chromOffset
    report("Total number of fragments = ", len(rfragAbsIdxs1))

    if len(rfragAbsIdxs1) != len(rfragAbsIdxs2):
        report("rfragAbsIdxs1=", rfragAbsIdxs1)
        report("rfragAbsIdxs2=", rfragAbsIdxs2)
        report("len(rfragAbsIdxs1)=", len(rfragAbsIdxs1))
        report("len(rfragAbsIdxs2)=", len(rfragAbsIdxs2))
        raise ValueError("FRAGMENT INDEXING ERROR 1!!!")

    lowest1, lowest2 = np.min(rfragAbsIdxs1), np.min(rfragAbsIdxs2)
    if lowest1 < 0 or lowest2 < 0:
        report("min(rfragAbsIdxs1)=", lowest1)
        report("min(rfragAbsIdxs2)=", lowest2)
        raise ValueError("FRAGMENT INDEXING ERROR 2!!!")

    highest1, highest2 = np.max(rfragAbsIdxs1), np.max(rfragAbsIdxs2)
    if highest1 > numBins - 1 or highest2 > numBins - 1:
        report("max (rfragAbsIdxs1)=", highest1)
        report("max (rfragAbsIdxs2)=", highest2)
        report("numBins=", numBins)
        raise ValueError("FRAGMENT INDEXING ERROR 3!!!")
    report("FRAGMENT INDEXING - passed")

    # Creating label array: flat index frag1 * numBins + frag2
    label = np.array(rfragAbsIdxs1, dtype='int64')
    label *= numBins
    label += rfragAbsIdxs2

    # Creating count array
    counts = np.bincount(label, minlength=numBins ** 2)
    counts = counts.reshape(numBins, numBins)

    # Counting: in-place symmetrization
    for i in range(len(counts)):
        counts[i, i:] += counts[i:, i]
        counts[i:, i] = counts[i, i:]

    # Contacts of a fragment with itself are dropped, not halved.
    fillDiagonal(counts, 0)

    chrom = str(chrNumb)
    mids = fragments.genome.rfragMids[chrNumb]
    # which RSites have to be described in the .fragments file later
    BinsToDescribe = np.zeros(numBins, dtype=bool)

    contacts_path = o_file + "_chr" + chrom + ".contacts.zip"
    with open(contacts_path, "w") as f_out:
        report("Writing file ", contacts_path)
        for i in range(numBins - 1):
            # partners j > i with a non-zero count, in increasing order
            partners = np.flatnonzero(counts[i, i + 1:]) + (i + 1)
            if len(partners) == 0:
                continue
            mid_i = str(mids[i])
            f_out.writelines(
                "\t".join((chrom, mid_i, chrom, str(mids[j]),
                           str(counts[i, j]))) + "\n"
                for j in partners)
            BinsToDescribe[i] = True
            BinsToDescribe[partners] = True

    fragments_path = o_file + "_chr" + chrom + ".fragments.zip"
    with open(fragments_path, "w") as f_out:
        report("Writing file ", fragments_path)
        for ind in np.flatnonzero(BinsToDescribe):
            f_out.write("\t".join((chrom, "0", str(mids[ind]),
                                   str(counts[ind].sum()), "1")) + "\n")