def prepare_for_PCA(self, data, boundaries=None, iterations=3):
    """Run ``substitute_intra_by_inter`` followed by ``completeIC``.

    Args:
        data: heatmap handed to ``self.substitute_intra_by_inter``.
        boundaries: second argument of ``self.substitute_intra_by_inter``;
            ``self.boundaries`` is used when this is ``None``.
        iterations: number of substitute-then-correct passes; must be >= 1.

    Returns:
        The value returned by ``completeIC`` in the last pass.

    Raises:
        ValueError: if ``iterations`` is smaller than 1. Without this check
            the function reached ``return`` with no result bound and failed
            with an opaque ``UnboundLocalError``.
    """
    if iterations < 1:
        raise ValueError(
            "iterations must be at least 1, got {0!r}".format(iterations))
    if boundaries is None:
        boundaries = self.boundaries
    from mirnylib.numutils import completeIC

    # NOTE(review): every pass starts again from the original ``data``; the
    # corrected matrix is not fed into the next pass. Unless
    # ``substitute_intra_by_inter`` modifies ``data`` in place or is
    # randomized, extra iterations repeat the same work - confirm intent.
    for _ in range(iterations):
        newdata = self.substitute_intra_by_inter(data, boundaries)
        newdata = completeIC(newdata)
    return newdata
def iterativeCorrection(self, outname):
    """Write a ``completeIC``-corrected copy of every cis heatmap.

    For each key in ``self.cisKeys`` the matching entry of ``self.rawdata``
    is passed through ``completeIC`` (bias vector not requested) and stored
    under the same key in the ``h5dict`` opened at ``outname``. The
    ``"resolution"`` entry is then set from ``self.resolution``.

    Args:
        outname: location handed to ``h5dict`` for the output container.
    """
    output = h5dict(outname)
    for cis_key in self.cisKeys:
        output[cis_key] = completeIC(self.rawdata[cis_key], returnBias=False)
    output["resolution"] = self.resolution
def prepare_for_PCA(self, data, boundaries=None, iterations=3):
    """Run ``substitute_intra_by_inter`` followed by ``completeIC``.

    Args:
        data: heatmap handed to ``self.substitute_intra_by_inter``.
        boundaries: second argument of ``self.substitute_intra_by_inter``;
            ``self.boundaries`` is used when this is ``None``.
        iterations: number of substitute-then-correct passes; must be >= 1.

    Returns:
        The value returned by ``completeIC`` in the last pass.

    Raises:
        ValueError: if ``iterations`` is smaller than 1. Without this check
            the function reached ``return`` with no result bound and failed
            with an opaque ``UnboundLocalError``.
    """
    if iterations < 1:
        raise ValueError(
            "iterations must be at least 1, got {0!r}".format(iterations))
    if boundaries is None:
        boundaries = self.boundaries
    from mirnylib.numutils import completeIC

    # NOTE(review): every pass starts again from the original ``data``; the
    # corrected matrix is not fed into the next pass. Unless
    # ``substitute_intra_by_inter`` modifies ``data`` in place or is
    # randomized, extra iterations repeat the same work - confirm intent.
    for _ in range(iterations):
        newdata = self.substitute_intra_by_inter(data, boundaries)
        newdata = completeIC(newdata)
    return newdata
def showCmap():
    """Shows Hi-C data together with the simulated data.

    Hi-C data created by hiclib is needed for that, but you can replace
    the line mydict=h5dict()... and the following line with your own data
    loading code.

    Every pickled contact map found in the ``cmaps`` directory whose size
    matches the rescaled Hi-C map is normalized, its below-diagonal part is
    overwritten with the Hi-C values, and the log of the combined map is
    saved as PNG and PDF under ``heatmaps`` and then shown.
    """
    low = 60000
    high = 75000
    low20 = low // 10
    high20 = high // 10

    # here Hi-C data is loaded for display purposes only..... replace it
    # with your own code if your data is in a different format
    mydict = h5dict("/home/magus/HiC2011/Erez2014/hg19/GM12878_inSitu-all-combined-10k_HighRes.byChr", 'r')
    hicdata = mydict.get_dataset("13 13")[low20:high20, low20:high20]

    hicdata = completeIC(hicdata)
    newshape = (1000 * (high - low)) // (600 * 5)
    print(hicdata.shape, newshape)
    hicdata = zoomArray(hicdata, (newshape, newshape))
    # clip extreme pixels, then scale so that the mean row sum is 1
    hicdata = np.clip(hicdata, 0, np.percentile(hicdata, 99.99))
    hicdata /= np.mean(np.sum(hicdata, axis=1))

    for fname in os.listdir("cmaps"):
        # NOTE: unpickling can execute arbitrary code; only keep trusted
        # files in ``cmaps``. The ``with`` block closes the file handle,
        # which the previous ``pickle.load(open(...))`` never did.
        with open(os.path.join("cmaps", fname), 'rb') as handle:
            cmap = pickle.load(handle)
        arr = cmap
        if arr.shape[0] != hicdata.shape[0]:
            # size differs from the rescaled Hi-C map: cannot be combined
            continue
        print(arr.shape)
        arr = arr / np.mean(np.sum(arr, axis=1))

        # entries with row index > column index are taken from Hi-C
        ran = np.arange(len(arr))
        mask = ran[:, None] > ran[None, :]
        arr[mask] = hicdata[mask]

        logarr = np.log(arr + 0.0001)
        # noinspection PyTypeChecker
        plt.imshow(logarr,
                   vmax=np.percentile(logarr, 99.99),
                   vmin=np.percentile(logarr, 10),
                   extent=[low, high, high, low],
                   interpolation="none")
        plt.savefig(os.path.join("heatmaps", fname + ".png"))
        plt.savefig(os.path.join("heatmaps", fname + ".pdf"))
        plt.show()
        plt.clf()
def doSaddleError(filename, eig, gen, correct=False):
    """Accumulate a 5x5 saddle matrix and 100 bootstrap replicates of it.

    Args:
        filename: path given to ``h5dict``; its ``"heatmap"`` entry is used
            and ``getResolution(filename)`` sets the genome resolution.
        eig: either the string ``"GC"`` (replaced by the concatenation of
            ``gen.GCBin``) or a per-bin array that is sliced with the same
            bin ranges as the heatmap.
        gen: genome folder name appended to the hard-coded data directory.
        correct: when true, the heatmap goes through ``completeIC`` first.

    Returns:
        ``(saddle, permutted)``: ``saddle`` is a 5x5 array in which cell
        ``(i, j)`` holds the sum over chromosomes of the mean
        observed-over-expected value between quintile ``i`` and quintile
        ``j`` of ``eig``; ``permutted`` is a list of 100 arrays built the
        same way from means resampled with replacement.
    """
    gen = Genome("/home/magus/HiC2011/data/" + gen, readChrms=["#", "X"])
    data = h5dict(filename, 'r')["heatmap"]
    if correct:
        data = completeIC(data)
    gen.setResolution(getResolution(filename))

    # Compare against "GC" only for strings: ``array == "GC"`` is an
    # elementwise comparison whose result cannot be used in ``if``.
    if isinstance(eig, str) and eig == "GC":
        eig = np.concatenate(gen.GCBin)

    saddle = np.zeros((5, 5), dtype=float)
    permutted = [np.zeros((5, 5), dtype=float) for _ in range(100)]

    for chrom in range(gen.chrmCount):
        st = gen.chrmStartsBinCont[chrom]
        end = gen.chrmEndsBinCont[chrom]
        cur = observedOverExpected(data[st:end, st:end])

        # keep only bins whose column sum is positive
        mask = np.sum(cur, axis=0) > 0
        cur = cur[mask]
        cur = cur[:, mask]
        GC = eig[st:end][mask]
        if len(GC) <= 5:
            continue

        # One boolean mask per quintile; they depend only on GC, so they are
        # built once per chromosome. Both bounds are strict, so values equal
        # to a percentile edge fall into no quintile.
        quintiles = []
        for q in range(5):
            lower, upper = np.percentile(GC, [20 * q, 20 * q + 20])
            quintiles.append((GC > lower) * (GC < upper))

        for i in range(5):
            for j in range(5):
                addition = cur[np.ix_(quintiles[i], quintiles[j])]
                addition = np.reshape(addition, (-1))
                for k in range(100):
                    resampled = np.random.choice(addition, len(addition),
                                                 replace=True)
                    permutted[k][i, j] += resampled.mean()
                saddle[i, j] += addition.mean()
    return saddle, permutted
def run():
    """Command-line entry point: call peaks per chromosome and write them.

    Parses the arguments with ``getargs``, configures the root logger
    (console at INFO, rotating file at DEBUG), loads the archive at
    ``args.path`` with ``np.load``, and for every selected chromosome runs
    ``completeIC`` on the symmetrized matrix, fits an isotonic regression
    to the mean of each corrected diagonal, and passes everything to
    ``pcaller``. One tab-separated line per reported pixel is written to
    ``<args.output>.peaks.txt``.

    Nothing is done when the first command is ``-h`` or ``--help``.
    """
    # Parse Arguments
    args, commands = getargs()
    # Skip all set-up work when only the help text was requested
    if commands[0] in ['-h', '--help']:
        return

    ## Root Logger Configuration
    logger = logging.getLogger()
    # Logger Level
    logger.setLevel(10)
    console = logging.StreamHandler()
    filehandler = logging.handlers.RotatingFileHandler(args.logFile,
                                                       maxBytes=100000,
                                                       backupCount=5)
    # Set level for Handlers
    console.setLevel('INFO')
    filehandler.setLevel('DEBUG')
    # Customizing Formatter
    formatter = logging.Formatter(
        fmt='%(name)-14s %(levelname)-7s @ %(asctime)s: %(message)s',
        datefmt='%m/%d/%y %H:%M:%S')
    console.setFormatter(formatter)
    filehandler.setFormatter(formatter)
    # Add Handlers
    logger.addHandler(console)
    logger.addHandler(filehandler)

    ## Logging for argument setting
    arglist = [
        '# ARGUMENT LIST:',
        '# output file prefix = %s' % args.output,
        '# HiC Data Path = %s' % args.path,
        '# chromosomes = %s' % args.chroms,
        '# data resolution = %s' % args.resolution,
        '# Peak window width = %s' % args.pw,
        '# Donut size = %s' % args.ww,
        '# Maximum donut size = %s' % args.maxww,
        '# Significant Level = %s' % args.siglevel,
        '# Genomic distance range = %s' % [args.ww * args.resolution,
                                           args.maxapart]
    ]
    argtxt = '\n'.join(arglist)
    logger.info('\n' + argtxt)

    # Package Dependencies
    from mirnylib.numutils import completeIC

    logger.info('Locating Hi-C data ...')
    # NOTE(review): each entry is unwrapped below as a pickled sparse
    # matrix; recent NumPy releases presumably need ``allow_pickle=True``
    # here - confirm against the NumPy version in use.
    Lib = np.load(args.path)

    logger.info('Calling Peaks ...')
    head = '\t'.join([
        'chromLabel', 'loc_1', 'loc_2', 'IF', 'D-Enrichment', 'D-pvalue',
        'D-qvalue', 'LL-Enrichment', 'LL-pvalue', 'LL-qvalue'
    ]) + '\n'
    lineFormat = '%s\t%d\t%d\t%.4g\t%.4g\t%.4g\t%.4g\t%.4g\t%.4g\t%.4g\n'

    # Text mode: everything written below is ``str``, which a binary-mode
    # file rejects on Python 3. The ``with`` block flushes and closes the
    # file even when a chromosome fails part-way through.
    with open('.'.join([args.output, 'peaks', 'txt']), 'w') as OF:
        OF.write(head)

        for key in Lib.files:
            # all chromosomes when none were requested; '#' selects every
            # numerically named chromosome; otherwise match by name
            selected = ((not args.chroms) or
                        (key.isdigit() and '#' in args.chroms) or
                        (key in args.chroms))
            if not selected:
                continue

            logger.info('Chromosome %s ...', key)
            sparseH = Lib[key].reshape(1)[0]
            triuH = sparseH.toarray()
            H = triuH + triuH.T - np.diag(
                triuH.diagonal())  # Symmetric Matrix
            del sparseH, triuH  # Release Memory

            logger.info('Perform ICE ...')
            cHeatMap, biases = completeIC(H, returnBias=True)
            logger.info('Done!')

            logger.info('Customize Sparse Matrix ...')
            chromLen = H.shape[0]
            num = args.maxapart // args.resolution + args.maxww + 1
            # raw diagonals 0 .. num-1 as a banded sparse matrix
            Diags = [np.diagonal(H, i) for i in np.arange(num)]
            M = sparse.diags(Diags, np.arange(num), format='csr')

            # corrected diagonals from offset ww onward, plus their means
            x = np.arange(args.ww, num)
            y = []
            cDiags = []
            for i in x:
                diag = np.diagonal(cHeatMap, i)
                y.append(diag.mean())
                cDiags.append(diag)
            cM = sparse.diags(cDiags, x, format='csr')

            IR = isotonic.IsotonicRegression(increasing='auto')
            IR.fit(x, y)

            del H, cHeatMap  # dense matrices are no longer needed

            Donuts, LL = pcaller(M, cM, biases, IR, chromLen, Diags, cDiags,
                                 num, pw=args.pw, ww=args.ww,
                                 sig=args.siglevel, maxww=args.maxww,
                                 maxapart=args.maxapart,
                                 res=args.resolution)

            for i in Donuts:
                # key tuple of the pixel + Donuts values + LL values
                # without their first element
                contents = (key, ) + i + Donuts[i] + LL[i][1:]
                OF.write(lineFormat % contents)

    logger.info('Done!')
#!/usr/bin/python
"""Zero low-coverage rows/columns of a matrix, then apply completeIC.

Command-line arguments:
    1. input matrix (text file readable by ``numpy.loadtxt``)
    2. cutoff, given as a fraction between 0 and 1 of the rows
    3. output file for the corrected matrix
"""
import sys

import numpy as np
from mirnylib.numutils import completeIC

inp_file = sys.argv[1]
percent_cutoff = float(sys.argv[2])
out_file = sys.argv[3]

# The cutoff is turned into a position in the sorted row means, so only
# fractions in [0, 1] make sense; a negative value would silently index
# from the end of the array.
if not 0.0 <= percent_cutoff <= 1.0:
    sys.exit("percent_cutoff must be a fraction between 0 and 1")

# read matrix
inp_matrix = np.loadtxt(inp_file)

# filter matrix by row mean
m = np.mean(inp_matrix, axis=1)
# clamp so that a fraction of 1.0 selects the last element instead of
# indexing one past the end
rank = min(int(round(len(m) * percent_cutoff)), len(m) - 1)
cutoff = np.sort(m)[rank]
i = np.where(m < cutoff)
inp_matrix[i, :] = 0
inp_matrix[:, i] = 0

# IC
out_matrix = completeIC(inp_matrix)
# '%.3e' writes every value in scientific notation with three decimals
np.savetxt(out_file, out_matrix, fmt='%.3e')
def showCmapNew():
    """Saves a bunch of heatmaps at high resolutions.

    Loads the Hi-C map, rescales and normalizes it, then for the listed
    contact-map pickle draws the pivoted simulated map stacked on the
    pivoted Hi-C map with vertical lines from the two arrays returned by
    ``getForwBacv`` underneath, and shows the figure.

    NOTE(review): the loop body ends with an unconditional ``continue``
    after ``plt.show()``, so the zoomed ``savefig`` section below it never
    runs. It is kept as found; confirm whether that is intended.
    """
    plt.figure(figsize=(8, 8))
    low = 60000
    high = 75000
    low20 = low // 10
    high20 = high // 10
    mydict = h5dict("/home/magus/HiC2011/Erez2014/hg19/GM12878_inSitu-all-combined-10k_HighRes.byChr", 'r')
    hicdata = mydict.get_dataset("13 13")[low20:high20, low20:high20]
    hicdata = completeIC(hicdata)
    resolutionMon = 5
    newshape = (1000 * (high - low)) // (600 * resolutionMon)
    print(hicdata.shape, newshape)
    hicdata = zoomArray(hicdata, (newshape, newshape))
    # clip extreme pixels, then scale so that the mean row sum is 1
    hicdata = np.clip(hicdata, 0, np.percentile(hicdata, 99.99))
    hicdata /= np.mean(np.sum(hicdata, axis=1))

    #for fname in os.listdir("cmaps"):
    for fname in ["cmapflagshipLifetime300Mu3_r=8.pkl"]:
        if ("r=8" not in fname) or ("Lifetime" not in fname):
            print("not going", fname)
            continue
        # the number between "Mu" and "_r=" in the file name
        try:
            mu = float(fname.split("_r=")[0].split("Mu")[1])
        except (IndexError, ValueError):
            # no "Mu" part, or it is not a number: skip this file
            continue
        forw, rev = getForwBacv(mu)

        # NOTE: unpickling can execute arbitrary code; only load trusted
        # files. The ``with`` block closes the handle after loading.
        with open(os.path.join("cmaps", fname), 'rb') as handle:
            cmap = pickle.load(handle)
        arr = cmap
        if arr.shape[0] != hicdata.shape[0]:
            continue
        arr = arr / np.mean(np.sum(arr, axis=1))
        # NOTE(review): in-place scaling; it would compound if the list
        # above ever held more than one matching file.
        hicdata *= 1.5
        diags = 1000
        print(arr.shape)

        ax = plt.subplot(211)
        # simulated map (flipped, times 3) stacked on top of the Hi-C map
        turned = pivotHeatmap(arr, diags)[::-1] * 3
        turned2 = pivotHeatmap(hicdata, diags)
        turned = np.concatenate([turned, turned2], axis=0)
        myextent = [low, high,
                    -(high - low) * diags / len(arr),
                    (high - low) * diags / len(arr)]
        plt.imshow(np.log(turned + 0.0001), aspect=0.5, cmap="fall",
                   vmax=-4, vmin=-8, extent=myextent, interpolation="none")

        plt.subplot(413, sharex=ax)
        xaxis = np.arange(len(forw) // 20) * 12 + 60000
        forwcg = coarsegrain(forw, 20)
        revcg = coarsegrain(rev, 20)
        plt.vlines(xaxis[forwcg > 0], 0, forwcg[forwcg > 0], color="blue")
        plt.vlines(xaxis[revcg > 0], 0, revcg[revcg > 0], color="green")
        plt.xlim([60000, 75000])
        plt.title(fname)
        plt.legend()
        plt.show()
        continue

        # ---- unreachable because of the ``continue`` above ----
        for st in range(60000, 75000, 1000):
            for size in [2000, 3000, 5000]:
                end = st + size
                if end > 75000:
                    continue
                plt.xlim([st, end])
                plt.savefig(os.path.join("heatmaps", "{0}_st={1}_end={2}_r=2.png".format(fname, st, end)))
                plt.savefig(os.path.join("heatmaps", "{0}_st={1}_end={2}_r=2.pdf".format(fname, st, end)))
        plt.clf()
    plt.show()
def displayHeatmap():
    """Simulate a contact map in parallel and plot it next to Hi-C data.

    Workers started through ``fmap`` add counts into a shared ``N`` x ``N``
    array. That array is then coarse-grained, clipped and normalized, its
    below-diagonal part is replaced by the rescaled Hi-C map, and the log
    of the result is drawn. Relies on the module-level names ``N``,
    ``low`` and ``high``.
    """
    plt.figure(figsize=(5, 5))

    # counts shared between the worker processes, viewed as an N x N array
    shared_arr = mp.Array(ctypes.c_double, N**2)
    arr = tonumpyarray(shared_arr)
    arr.shape = (N, N)

    def doSim(i):
        # Worker: run model ``i`` and add its SMC pair counts to the
        # flat shared buffer.
        flat_counts = tonumpyarray(shared_arr)
        model = initModel(i)
        upper = 10000
        snapshots = []
        for _ in range(np.random.randint(upper // 2, upper)):
            model.steps(150)
            snapshots.append(model.getSMCs())
        pairs = np.concatenate(snapshots, axis=1)
        # flatten each (row, column) pair into one index of the N*N buffer
        position, counts = np.unique(pairs[0] * N + pairs[1],
                                     return_counts=True)
        with shared_arr.get_lock():
            flat_counts[position] += counts
        print("Finished!")
        return None

    setExceptionHook()

    # Hi-C reference map, rescaled to ``newshape`` and normalized
    first_bin, last_bin = low // 10, high // 10
    source = h5dict(
        "/home/magus/HiC2011/Erez2014/hg19/GM12878_inSitu-all-combined-10k_HighRes.byChr",
        'r')
    hicdata = completeIC(
        source.get_dataset("13 13")[first_bin:last_bin, first_bin:last_bin])
    newshape = (1000 * (high - low)) // (600 * 20)
    print(hicdata.shape, newshape)
    hicdata = zoomArray(hicdata, (newshape, newshape))
    hicdata = np.clip(hicdata, 0, np.percentile(hicdata, 99.99))
    hicdata /= np.mean(np.sum(hicdata, axis=1))

    # number of threads to use. On a 20-core machine I use 20.
    fmap(doSim, range(30), n=20)

    arr = coarsegrain(arr, 20)
    arr = np.clip(arr, 0, np.percentile(arr, 99.9))
    arr /= np.mean(np.sum(arr, axis=1))

    # entries with row index > column index are taken from Hi-C
    idx = np.arange(len(arr))
    below_diagonal = idx[None, :] < idx[:, None]
    arr[below_diagonal] = hicdata[below_diagonal]

    logarr = np.log(arr + 0.0001)
    plt.imshow(logarr,
               vmax=np.percentile(logarr, 99.9),
               extent=[low, high, high, low],
               interpolation="none")
    nicePlot()
#!/usr/bin/python
"""Zero low-coverage rows/columns of a matrix, then apply completeIC.

Command-line arguments:
    1. input matrix (text file readable by ``numpy.loadtxt``)
    2. cutoff, given as a fraction between 0 and 1 of the rows
    3. output file for the corrected matrix
"""
import sys

import numpy as np
from mirnylib.numutils import completeIC

inp_file = sys.argv[1]
percent_cutoff = float(sys.argv[2])
out_file = sys.argv[3]

# The cutoff is turned into a position in the sorted row means, so only
# fractions in [0, 1] make sense; a negative value would silently index
# from the end of the array.
if not 0.0 <= percent_cutoff <= 1.0:
    sys.exit("percent_cutoff must be a fraction between 0 and 1")

# read matrix
inp_matrix = np.loadtxt(inp_file)

# filter matrix by row mean
m = np.mean(inp_matrix, axis=1)
# ``len(m) * percent_cutoff`` is a float, which NumPy does not accept as
# an index: round it to an integer and clamp it so that a fraction of 1.0
# selects the last element instead of indexing one past the end
rank = min(int(round(len(m) * percent_cutoff)), len(m) - 1)
cutoff = np.sort(m)[rank]
i = np.where(m < cutoff)
inp_matrix[i, :] = 0
inp_matrix[:, i] = 0

# IC
out_matrix = completeIC(inp_matrix)
# '%.3e' writes every value in scientific notation with three decimals
np.savetxt(out_file, out_matrix, fmt='%.3e')
def run():
    """Command-line entry point: call peaks per chromosome and write them.

    Parses the arguments with ``getargs``, configures the root logger
    (console at INFO, rotating file at DEBUG), loads the archive at
    ``args.path`` with ``np.load``, and for every selected chromosome runs
    ``completeIC`` on the symmetrized matrix, fits an isotonic regression
    to the mean of each corrected diagonal, and passes everything to
    ``pcaller``. One tab-separated line per reported pixel is written to
    ``<args.output>.peaks.txt``.

    Nothing is done when the first command is ``-h`` or ``--help``.
    """
    # Parse Arguments
    args, commands = getargs()
    # Skip all set-up work when only the help text was requested
    if commands[0] in ['-h', '--help']:
        return

    ## Root Logger Configuration
    logger = logging.getLogger()
    # Logger Level
    logger.setLevel(10)
    console = logging.StreamHandler()
    filehandler = logging.handlers.RotatingFileHandler(args.logFile,
                                                       maxBytes=100000,
                                                       backupCount=5)
    # Set level for Handlers
    console.setLevel('INFO')
    filehandler.setLevel('DEBUG')
    # Customizing Formatter
    formatter = logging.Formatter(
        fmt='%(name)-14s %(levelname)-7s @ %(asctime)s: %(message)s',
        datefmt='%m/%d/%y %H:%M:%S')
    console.setFormatter(formatter)
    filehandler.setFormatter(formatter)
    # Add Handlers
    logger.addHandler(console)
    logger.addHandler(filehandler)

    ## Logging for argument setting
    arglist = ['# ARGUMENT LIST:',
               '# output file prefix = %s' % args.output,
               '# HiC Data Path = %s' % args.path,
               '# chromosomes = %s' % args.chroms,
               '# data resolution = %s' % args.resolution,
               '# Peak window width = %s' % args.pw,
               '# Donut size = %s' % args.ww,
               '# Maximum donut size = %s' % args.maxww,
               '# Significant Level = %s' % args.siglevel,
               '# Genomic distance range = %s' % [args.ww * args.resolution,
                                                  args.maxapart]
               ]
    argtxt = '\n'.join(arglist)
    logger.info('\n' + argtxt)

    # Package Dependencies
    from mirnylib.numutils import completeIC

    logger.info('Locating Hi-C data ...')
    # NOTE(review): each entry is unwrapped below as a pickled sparse
    # matrix; recent NumPy releases presumably need ``allow_pickle=True``
    # here - confirm against the NumPy version in use.
    Lib = np.load(args.path)

    logger.info('Calling Peaks ...')
    head = '\t'.join(['chromLabel', 'loc_1', 'loc_2', 'IF',
                      'Fold-Enrichment', 'pvalue', 'qvalue']) + '\n'

    # Text mode: everything written below is ``str``, which a binary-mode
    # file rejects on Python 3. The ``with`` block flushes and closes the
    # file even when a chromosome fails part-way through.
    with open('.'.join([args.output, 'peaks', 'txt']), 'w') as OF:
        OF.write(head)

        for key in Lib.files:
            # all chromosomes when none were requested; '#' selects every
            # numerically named chromosome; otherwise match by name
            selected = ((not args.chroms) or
                        (key.isdigit() and '#' in args.chroms) or
                        (key in args.chroms))
            if not selected:
                continue

            logger.info('Chromosome %s ...', key)
            sparseH = Lib[key].reshape(1)[0]
            triuH = sparseH.toarray()
            H = triuH + triuH.T - np.diag(triuH.diagonal())  # Symmetric Matrix
            del sparseH, triuH  # Release Memory

            logger.info('Perform ICE ...')
            cHeatMap, biases = completeIC(H, returnBias=True)
            logger.info('Done!')

            logger.info('Customize Sparse Matrix ...')
            chromLen = H.shape[0]
            num = args.maxapart // args.resolution + args.maxww + 1
            # raw diagonals 0 .. num-1 as a banded sparse matrix
            Diags = [np.diagonal(H, i) for i in np.arange(num)]
            M = sparse.diags(Diags, np.arange(num), format='csr')

            # corrected diagonals from offset ww onward, plus their means
            x = np.arange(args.ww, num)
            y = []
            cDiags = []
            for i in x:
                diag = np.diagonal(cHeatMap, i)
                y.append(diag.mean())
                cDiags.append(diag)
            cM = sparse.diags(cDiags, x, format='csr')

            IR = isotonic.IsotonicRegression(increasing='auto')
            IR.fit(x, y)

            del H, cHeatMap  # dense matrices are no longer needed

            xpos, ypos, Ovalues, Fold, pvalues, qvalues = pcaller(
                M, cM, biases, IR, chromLen, Diags, cDiags, num,
                pw=args.pw, ww=args.ww, sig=args.siglevel,
                maxww=args.maxww, maxapart=args.maxapart,
                res=args.resolution)

            # ``range`` exists on Python 2 and 3; ``xrange`` only on 2.
            # Bin indices are multiplied by the resolution before writing.
            for i in range(xpos.size):
                line = '%s\t%d\t%d\t%.4g\t%.4g\t%.4g\t%.4g\n' % (
                    key, xpos[i] * args.resolution,
                    ypos[i] * args.resolution,
                    Ovalues[i], Fold[i], pvalues[i], qvalues[i])
                OF.write(line)

    logger.info('Done!')
# Example: run Iterative Correction on every cis heatmap of a Hi-C dataset
# that was saved by chromosome.
filename = "MyFolder/byChromosomeHiCDataset"  #this only works with Hi-C dataset saved by chromosome
dataset = h5dict(filename, 'r')  #open in the "read" mode

#a bit of a weird way to find chromosome number
keys = dataset.keys()
#extract keys of the type "a a": exactly two identical tokens. Requiring two
#tokens keeps single-word entries (for example a "resolution" key) from being
#counted as chromosomes.
cisKeys = [i for i in keys
           if len(i.split()) == 2 and len(set(i.split())) == 1]
numChromosomes = len(cisKeys)

for chromosome in range(numChromosomes):
    chromosomeHeatmap = dataset["{0} {0}".format(chromosome)]  #extracting cis heatmap

    #This line executes proper Iterative Correction, which accounts for regions with low coverage
    #It only works for cis (symmetric) heatmaps.
    correctedHeatmap, bias = completeIC(chromosomeHeatmap, returnBias=True)

    #if you want to see log-heatmap, uncomment below
    #plt.imshow(np.log(correctedHeatmap))
    #plt.show()

    """ Your code here """

    #Below is example of how to save data to txt
    #np.savetxt("corrected_chr{0}.txt.gz".format(chromosome),correctedHeatmap)
    #example of saving chromosomes