def fakeData(filename, combinedFilename, results):
    """
    This is a method to create reshuffled data
    :param filename: filename of the data to create reshuffled data from
    :param combinedFilename: filename of combined data to sample reshuffled contacts from
    :param results: h5dict or dict-like to put reshuffled data into
    :return: updated h5dict or dict-like
    """
    h1 = h5dict(filename, 'r')
    h2 = h5dict(combinedFilename, 'r')
    for key in h1.keys():
        if hasattr(h1[key], "shape") and len(h1[key].shape) == 2:
            data1 = h1[key]
            dss = np.sum(data1, axis=1)
            ds = dss.sum()
            del data1
            data2 = h2[key]
            coverage = np.sum(data2, axis=1)
            marginals = 0.5 * dss / (coverage + 0.00001)
            if (~np.isfinite(marginals)).sum() > 0:
                print("bad marginals")
            marginals[~np.isfinite(marginals)] = 0
            tocreate = data2 * marginals[:, None]
            del data2
            newdata = np.random.poisson(tocreate)
            del tocreate
            result = newdata + newdata.T
            print(key, ds, result.sum())
            del newdata
            results[key] = result
    return results
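# A minimal usage sketch for fakeData above. The filenames here are
# hypothetical; the two inputs are assumed to be h5dicts holding matching 2D
# per-chromosome heatmaps, which is what fakeData iterates over.
import numpy as np
from mirnylib.h5dict import h5dict

reshuffled = h5dict("reshuffled.hm")  # output h5dict, created on disk
fakeData("replicate1.hm", "combined.hm", reshuffled)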
def convertFile(filename, folder, gz=True):
    if not os.path.exists(filename):
        print(("Filename does not exist", filename))
        raise IOError("File not found: %s" % filename)
    if os.path.isfile(folder):
        raise IOError("Supplied folder is a file!")
    if not os.path.exists(folder):
        os.mkdir(folder)
    mydict = h5dict(filename, 'r')
    for i in list(mydict.keys()):
        data = mydict[i]
        savefile = os.path.join(folder, i)
        if issubclass(type(data), numpy.ndarray):
            print(("saving numpy array", i, "to", savefile))
            if len(data.shape) > 0:
                if gz:
                    savefile = savefile + ".gz"
                if len(data.shape) == 2:
                    matrixToGzippedFile(data, savefile)
                else:
                    numpy.savetxt(savefile, data)
                continue
        if type(data) == str:
            datarepr = data
        else:
            datarepr = repr(data)
        print(("saving data", i, "to", savefile))
        with open(savefile, 'w') as f:
            f.write(datarepr)
def iterativeFiltering(genome_db, fragments):
    '''
    Filter the data at the binned level and perform the iterative correction.
    '''
    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(options.outputDir + 'heatmap-res-1M.hdf5', mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(options.outputDir + 'heatmap-res-1M.hdf5', options.experiment)

    # Remove the contacts between loci located within the same bin.
    BD.removeDiagonal()
    # Remove bins with less than half of a bin sequenced.
    BD.removeBySequencedCount(0.5)
    # Remove 1% of regions with low coverage.
    BD.removePoorRegions(cutoff=1)
    # Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts).
    BD.truncTrans(high=0.0005)
    # Perform iterative correction.
    BD.iterativeCorrectWithoutSS()

    # Save the iteratively corrected heatmap.
    BD.export(options.experiment, options.outputDir + 'IC-heatmap-res-1M.hdf5')

    plotting.plot_matrix(np.log(BD.dataDict[options.experiment]))
def getChromosomesMatrix(CHROMOSOME_HDF5, PLOT_CHROMOSOME):
    # Read hdf5
    f = h5dict.h5dict(CHROMOSOME_HDF5, mode='r')
    genomeIdxToLabel = f['genomeIdxToLabel']
    chromosomeStarts = f['chromosomeStarts']
    binNumber = f['binNumber']

    # Get grid location and chromosome labels
    chrmIdx = []
    grids = [0]
    chrmLabels = []
    for i in range(len(genomeIdxToLabel)):
        if genomeIdxToLabel[i] in PLOT_CHROMOSOME:
            chrmIdx.append(i)
            chrmLabels.append(genomeIdxToLabel[i])
            if i == len(genomeIdxToLabel) - 1:
                size = binNumber - chromosomeStarts[i]
            else:
                size = chromosomeStarts[i + 1] - chromosomeStarts[i]
            grids.append(grids[-1] + size)

    # Get chromosome-wide matrix
    slices = []
    for i in chrmIdx:
        m = []
        for j in chrmIdx:
            key = str(i) + ' ' + str(j)
            m.append(f[key])
        slices.append(np.concatenate(m, axis=1))
    matrix = np.concatenate(slices, axis=0)

    # Return
    return matrix, grids, chrmLabels
def convertFile(filename, outFilename):
    if not os.path.exists(filename):
        raise IOError("File not found: %s" % filename)
    outDict = h5py.File(outFilename, mode='w')
    mydict = h5dict(filename, 'r')
    selectedKeys = ['chrms1', 'chrms2', 'cuts1', 'cuts2',
                    'rfragIdxs1', 'rfragIdxs2',
                    'strands1', 'strands2', 'rsites1', 'rsites2']
    # for i in list(mydict.keys()):
    for i in list(selectedKeys):
        keyData = mydict[i]
        if issubclass(type(keyData), numpy.ndarray):
            print(("saving numpy array", i, "to", outFilename))
            if issubclass(type(keyData[0]), numpy.bool_):
                binaryStrands = numpy.zeros(len(keyData), dtype=numpy.int8)
                indices = numpy.where(keyData == True)
                binaryStrands[indices[0]] = 1
                outDict.create_dataset(i, data=binaryStrands)
            else:
                outDict.create_dataset(i, data=keyData)
            continue
        txtSavefile = i + '.txt'
        if type(keyData) == str:
            datarepr = keyData
        else:
            datarepr = repr(keyData)
        print(("saving data", i, "to", txtSavefile))
        with open(txtSavefile, 'w') as f:
            f.write(datarepr)
    outDict.close()
def process():
    global options
    global args
    global pp

    if (options.verbose):
        print >> sys.stdout, "*** START processing"

    fig = plt.figure()
    pp = PdfPages(options.outputDir + options.experiment + '.pdf')

    logging.basicConfig(level=logging.DEBUG)

    if (options.verbose):
        print >> sys.stdout, "** Create directories"
    if not os.path.exists(options.tmpDir):
        os.mkdir(options.tmpDir)
    if not os.path.exists(options.outputDir):
        os.mkdir(options.outputDir)

    if (options.verbose):
        print >> sys.stdout, "** Create data objects"
    mapped_reads = h5dict.h5dict(options.outputDir + options.experiment + '-mapped_reads.hdf5')
    genome_db = genome.Genome(options.genome, gapFile=options.gapFile,
                              readChrms=['#', 'X', 'Y'])
    genome_db.setEnzyme(options.enzyme)

    bams = []
    if (options.inputFormat != 'bam'):
        bams = mapFiles()
    else:
        bams = args[0:]

    if (options.verbose):
        print >> sys.stdout, "** Collect mapped reads"
    collectMappedReads(bams[0], bams[1], mapped_reads, genome_db)

    if (options.verbose):
        print >> sys.stdout, "** Filter fragments"
    filterFragments(genome_db)

    if (options.verbose):
        print >> sys.stdout, "** Iterative filtering of fragments"
    iterativeFiltering(genome_db, '-1M.hdf5')

    # plotting
    correctedScalingPlot(1000000, options.outputDir + options.experiment + '-1M.hdf5',
                         options.experiment, genome_db)
    doArmPlot(1000000, options.outputDir + options.experiment + '-1M.hdf5',
              options.experiment, genome_db)

    if (options.verbose):
        print >> sys.stdout, "*** FINISHED processing"

    pp.close()
def diamondScore(dataset, size=10):
    """
    Extract a so-called "diamond score" - inspired by Suzana Hadjur's talks;
    see Sevil Sofueva, EMBO 2013 - Supp Figure 11
    (but this is a bit different from Supp Figure 11!)
    """
    heatmap = 1. * h5dict(hm(dataset))["heatmap"]
    # Fill gaps (zero rows/columns) with the neighboring row/column.
    for _ in range(1):
        zeros = np.sum(heatmap, axis=0) == 0
        zeros = np.nonzero(zeros)[0]
        heatmap[zeros] = heatmap[zeros - 1]
        heatmap[:, zeros] = heatmap[:, zeros - 1]
    mirnylib.numutils.fillDiagonal(heatmap, 0, 0)
    mirnylib.numutils.fillDiagonal(heatmap, 0, 1)
    mirnylib.numutils.fillDiagonal(heatmap, 0, -1)
    heatmap = trunc(heatmap, low=0, high=0.0001)
    heatmap = ultracorrect(heatmap)
    diag2value = np.mean(np.diagonal(heatmap, 2))
    mirnylib.numutils.fillDiagonal(heatmap, 1.5 * diag2value, 0)
    mirnylib.numutils.fillDiagonal(heatmap, 1.2 * diag2value, 1)
    mirnylib.numutils.fillDiagonal(heatmap, 1.2 * diag2value, -1)
    heatmap /= np.mean(np.sum(heatmap, axis=0))
    # Tile 3x3 copies of the heatmap because the chromosome is circular.
    tiledHeatmap = np.hstack([heatmap, heatmap, heatmap])
    tiledHeatmap = np.vstack([tiledHeatmap, tiledHeatmap, tiledHeatmap])
    setExceptionHook()
    start = len(heatmap)
    end = 2 * len(heatmap)
    ratios = []
    for mon in xrange(start, end):
        diamond = tiledHeatmap[mon:mon + size, mon:mon - size:-1]
        inds = (np.arange(len(diamond))[:, None] +
                np.arange(len(diamond))[None, :]) < len(diamond)
        ratios.append(diamond[inds].sum())
    # High-pass filter: subtract a smoothed baseline from the raw scores.
    return np.array(ratios) - gaussian_filter(ratios, 30)
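# Toy illustration (not part of the pipeline) of the diamond geometry summed
# by diamondScore above: at each position mon it takes a size-by-size block
# with the downstream axis reversed and keeps only the triangle touching the
# diagonal (i + j < size). All values below are made up.
import numpy as np

M = np.arange(64, dtype=float).reshape(8, 8)
size, mon = 3, 4
diamond = M[mon:mon + size, mon:mon - size:-1]
inds = (np.arange(len(diamond))[:, None] +
        np.arange(len(diamond))[None, :]) < len(diamond)
print(diamond[inds].sum())  # un-normalized diamond score at position mon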
def loadData(self, dictLike, keyFunction=lambda x: ("%d %d" % x),
             mode="All", cisProtocol=h5dictMatrix,
             transProtocol=h5dictSparseMatrix):
    """
    Parameters
    ----------
    dictLike : dictionary-like structure or str
        Either a by-chromosome h5dict generated by fragmentHiC, or any
        dictionary-like object with by-chromosome heatmaps
    keyFunction : function tuple -> string
        Function to convert chromosome pairs (chr1, chr2) to a key in
        dictLike. Default: "chr1 chr2"
    mode : "all" or "cis"
        Whether or not to use trans chromosome maps
    cisProtocol : class similar to defaultMatrix (see below)
    transProtocol : class similar to defaultMatrix (see below)

    cisProtocol and transProtocol should implement all functions currently
    defined in the defaultMatrix protocol. If inherited from defaultMatrix,
    they should implement proper get and set functions. A protocol cannot
    store the matrix itself in memory, and should forget it after any
    function call.
    """
    if mode.lower() not in ["cis", "all"]:
        raise ValueError("Mode can be only 'cis' or 'all'")

    if type(dictLike) == str:
        try:
            dictLike = h5dict(dictLike, 'r')
        except:
            raise ValueError("Cannot open h5dict at filename %s" % dictLike)

    for myKey in self.cisKeys:
        try:
            data = dictLike[keyFunction(myKey)]
        except KeyError:
            raise KeyError("Key {0} not found in h5dict".format(keyFunction(myKey)))
        self.data[myKey] = cisProtocol(data, dictToSave=self._h5dict, key=myKey)

    if mode.lower() == "all":
        for myKey in self.transKeys:
            try:
                data = dictLike[keyFunction(myKey)]
            except KeyError:
                raise KeyError("Key {0} not found in h5dict".format(keyFunction(myKey)))
            self.data[myKey] = transProtocol(data, dictToSave=self._h5dict, key=myKey)

    self._checkConsistency()
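# Usage sketch for loadData above. The genome path is hypothetical; the
# heatmap filename follows the by-chromosome test files used elsewhere in
# this collection, whose keys ("0 0", "0 1", ...) match the default keyFunction.
from mirnylib.genome import Genome
from hiclib.highResBinnedData import HiResHiC

genome_db = Genome("../fasta/hg19", readChrms=["#", "X"])
hic = HiResHiC(genome_db, 1000000)  # default in-memory storage
hic.loadData("../fragmentHiC/test-1M-byChr.hm", mode="cis")  # cis maps only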
def step1(hiclib_path,  # the path of the hiclib folder on the machine
          dataset='Kalhor2012NB',
          sraid='SRR071231',
          readlen=40):  # each read has length 40
    '''
    1. Map reads to the genome
    http://mirnylab.bitbucket.org/hiclib/tutorial/01_iterative_mapping.html
    '''
    # Adopted from the hiclib tutorial
    import os
    import logging
    from hiclib import mapping
    from mirnylib import h5dict, genome

    logging.basicConfig(level=logging.DEBUG)

    # A. Map the reads iteratively.
    mapping.iterative_mapping(
        bowtie_path=hiclib_path + '/bin/bowtie2/bowtie2',
        bowtie_index_path=hiclib_path + '/bin/bowtie2/index/hg19',
        fastq_path='../data/SRA/' + dataset + '/' + sraid + '/' + sraid + '.sra',
        out_sam_path='../data/SRA/' + sraid + '_1.bam',
        min_seq_len=25,
        len_step=5,
        seq_start=0,
        seq_end=readlen,
        nthreads=12,  # on intel corei7 CPUs 4 threads are as fast as
                      # 8, but leave some room for your other applications
        # max_reads_per_chunk=10000000,  # optional, on low-memory machines
        temp_dir='../data/SRA/',  # optional, keep temporary files here
        bowtie_flags='--very-sensitive',
        bash_reader=hiclib_path + '/bin/sra/bin/fastq-dump -Z')

    mapping.iterative_mapping(
        bowtie_path=hiclib_path + '/bin/bowtie2/bowtie2',
        bowtie_index_path=hiclib_path + '/bin/bowtie2/index/hg19',
        fastq_path='../data/SRA/' + dataset + '/' + sraid + '/' + sraid + '.sra',
        out_sam_path='../data/SRA/' + sraid + '_2.bam',
        min_seq_len=25,
        len_step=5,
        seq_start=readlen,
        seq_end=2 * readlen,
        nthreads=12,
        # max_reads_per_chunk=10000000,
        temp_dir='../data/SRA/',
        bowtie_flags='--very-sensitive',
        bash_reader=hiclib_path + '/bin/sra/bin/fastq-dump -Z')

    # B. Parse the mapped sequences into a Python data structure,
    #    assign the ultra-sonic fragments to restriction fragments.
    mapped_reads = h5dict.h5dict(sraid + '_mapped_reads.hdf5')  # in the local folder
    genome_db = genome.Genome(hiclib_path + '/fasta/hg19', readChrms=['#', 'X'])
    mapping.parse_sam(
        sam_basename1='../data/SRA/' + sraid + '_1.bam',
        sam_basename2='../data/SRA/' + sraid + '_2.bam',
        out_dict=mapped_reads,
        genome_db=genome_db,
        enzyme_name='HindIII')
def _sortData(self):
    if not hasattr(self, "dataSorted"):
        tmpfil = self.make_tempfile()
        mydict = h5dict(tmpfil, 'w')
        data = mydict.add_empty_dataset("sortedData", (self.N,), mydtype)
        tmp = mydict.add_empty_dataset("trash", (self.N,), mydtype)
        code = dedent("""
        a = np.empty(len(chrms1), dtype = mydtype)
        mask = (chrms1 > chrms2) | ( (chrms1 == chrms2) & (cuts1 > cuts2))
        chrms2[mask],chrms1[mask] = chrms1[mask].copy(), chrms2[mask].copy()
        cuts1[mask],cuts2[mask] = cuts2[mask].copy(), cuts1[mask].copy()
        strands1[mask],strands2[mask] = strands2[mask].copy(),strands1[mask].copy()
        a["chrms1"] = chrms1
        a["pos1"] = cuts1
        a["chrms2"] = chrms2
        a["pos2"] = cuts2
        a["strands1"] = strands1
        a["strands2"] = strands2
        """)
        self.evaluate(expression=code,
                      internalVariables=["chrms1", "chrms2", "cuts1", "cuts2",
                                         "strands1", "strands2"],
                      constants={"np": np, "mydtype": mydtype},
                      outVariable=("a", data))
        externalMergeSort(data, tmp, sorter=mydtypeSorter,
                          searchsorted=searchsorted,
                          chunkSize=max(150000000, self.chunksize))
        sdata = mydict.get_dataset("sortedData")
        c1 = self.h5dict.get_dataset("chrms1")
        c2 = self.h5dict.get_dataset("chrms2")
        p1 = self.h5dict.get_dataset("cuts1")
        p2 = self.h5dict.get_dataset("cuts2")
        s1 = self.h5dict.get_dataset("strands1")
        s2 = self.h5dict.get_dataset("strands2")
        for start, end in self._getChunks():
            data = sdata[start:end]
            c1[start:end] = data["chrms1"]
            c2[start:end] = data["chrms2"]
            p1[start:end] = data["pos1"]
            p2[start:end] = data["pos2"]
            s1[start:end] = data["strands1"]
            s2[start:end] = data["strands2"]
        self.dataSorted = True
        del mydict
        os.remove(tmpfil)
        gc.collect()
def step3(hiclib_path, sraid, res=1000000):
    '''
    3. Filter and iteratively correct heatmaps.
    http://mirnylab.bitbucket.org/hiclib/tutorial/03_heatmap_processing.html
    '''
    import matplotlib.pyplot as plt
    import numpy as np
    from mirnylib import genome
    from mirnylib import h5dict
    from mirnylib import plotting
    from hiclib import binnedData

    genome_db = genome.Genome(hiclib_path + '/fasta/hg19', readChrms=['#', 'X'])

    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(sraid + '_map-res%sk.hdf5' % (res / 1000), mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(sraid + '_map-res%sk.hdf5' % (res / 1000), 'DataName')

    # Plot the heatmap directly.
    plotting.plot_matrix(np.log(BD.dataDict['DataName']))
    plt.savefig(sraid + '_map-res%sk.pdf' % (res / 1000))
    plt.clf()

    # Remove the contacts between loci located within the same bin.
    BD.removeDiagonal()
    # Remove bins with less than half of a bin sequenced.
    BD.removeBySequencedCount(0.5)
    # Remove 1% of regions with low coverage.
    BD.removePoorRegions(cutoff=1)
    # Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts).
    BD.truncTrans(high=0.0005)
    # Perform iterative correction.
    BD.iterativeCorrectWithoutSS()

    # Save the iteratively corrected heatmap.
    BD.export('DataName', sraid + '_map-res%sk-ic.hdf5' % (res / 1000))

    # Plot the heatmap directly.
    plotting.plot_matrix(np.log(BD.dataDict['DataName']))
    plt.savefig(sraid + '_map-res%sk-ic.pdf' % (res / 1000))
    plt.clf()

    # Save the per-bin biases.
    outfile = open(sraid + "_map-res%sk-ic-bias.txt" % (res / 1000), "w")
    for i in xrange(len(BD.chromosomeIndex)):
        chro = BD.genome.idx2label[BD.chromosomeIndex[i]]
        posi = BD.positionIndex[i]
        outfile.write("chr%s\t%s\t%s" % (chro, posi, posi + res))
        outfile.write("\t%s" % BD.biasDict['DataName'][i])
        outfile.write("\n")
    outfile.close()
def readDict(CHROMOSOME_HDF5):
    # Read file
    f = h5dict.h5dict(CHROMOSOME_HDF5, mode='r')
    resolution = f['resolution']
    genomeIdxToLabel = f['genomeIdxToLabel']
    # Return
    return f, resolution, genomeIdxToLabel
def iterativeCorrection(self, outname):
    mydict = h5dict(outname)
    for key in self.cisKeys:
        bychr = self.rawdata[key]
        corrected = completeIC(bychr, returnBias=False)
        mydict[key] = corrected
    mydict["resolution"] = self.resolution
def get_chromosomes(hm_file, genome_db, resolution, chrNumb=None):
    if extractResolutionFromFileName(hm_file) != resolution:
        print "WARNING! Provided resolution ", resolution, "does not match ", extractResolutionFromFileName(hm_file), "extracted from file name ", hm_file
    if "hiRes.hm" in hm_file:
        hmtype = "HiRes"
    elif "bychr.hm" in hm_file:
        hmtype = "bychr"
    else:
        print "Warning: cannot resolve type of data from filename"
        try:
            print "Warning: trying hires hic"
            raw_heatmap = h5dict.h5dict(hm_file, mode='r')  # open heatmap
            if "0 0" in raw_heatmap.keys():
                hmtype = "HiRes"
            else:
                print "HiRes hic failed! Assuming bychr type"
                hmtype = "bychr"
        except:
            print "HiRes hic failed! Assuming bychr type"
            hmtype = "bychr"

    if hmtype == "HiRes":
        from hiclib import highResBinnedData
        # Create an object, load the data.
        print "creating an object"
        hmap = highResBinnedData.HiResHiC(genome_db, resolution)
        print "loading data"
        hmap.loadData(hm_file, mode="cis")
        print "Data loaded"
        if chrNumb != None:
            return hmap.data[(chrNumb, chrNumb)].getData()
        # cisKeys are tuples (N, N) where N is 0..Number_of_chrms-1
        return [hmap.data[(i, i)].getData()
                for i in xrange(genome_db.chrmCount)]
    elif hmtype == "bychr":
        from hiclib import binnedData
        print "creating an object"
        hmap = binnedData.binnedData(resolution, genome_db)
        print "loading data"
        hmap.simpleLoad(hm_file, "heatmap")
        data = hmap.dataDict["heatmap"]
        assert len(data) == genome_db.numBins
        print "Data loaded"
        if chrNumb != None:
            return data[genome_db.chrmStartsBinCont[chrNumb]:genome_db.chrmEndsBinCont[chrNumb],
                        genome_db.chrmStartsBinCont[chrNumb]:genome_db.chrmEndsBinCont[chrNumb]]
        return [data[genome_db.chrmStartsBinCont[i]:genome_db.chrmEndsBinCont[i],
                     genome_db.chrmStartsBinCont[i]:genome_db.chrmEndsBinCont[i]]
                for i in xrange(genome_db.chrmCount)]
    else:
        raise ValueError("Cannot recognize heatmap format from file name")
def fractionCis20kb(filename):
    hd = h5dict(filename, 'r')
    c1 = hd["chrms1"]
    c2 = hd["chrms2"]
    p1 = hd["cuts1"]
    p2 = hd["cuts2"]
    mask = c1 == c2
    cis = mask.sum()
    more20kb = (np.abs(p1[mask] - p2[mask]) > 20000).sum()
    return float(more20kb) / cis  # float to avoid integer division
def doOne(inData, saveSams=True):
    file1, file2, outfile = inData
    print("Mapping {0} and {1} into {2}".format(*inData))

    for onefile in file1, file2:
        a = gzip.open(onefile, 'r')
        a.readline()
        length = len(a.readline()) - 1
        if length < 10:
            raise ValueError("Length of your sequence is {0}. Something is wrong".format(length))
        minlen, step = calculateStep(length - seqSkipStart, minMapLen)

        mapping.iterative_mapping(
            bowtie_path=bowtiePath,
            bowtie_index_path=bowtieIndex,
            fastq_path=onefile,
            out_sam_path=os.path.join(samFolder, os.path.split(onefile)[1] + ".sam"),
            seq_start=seqSkipStart,
            min_seq_len=minlen,  # for bacteria the minimal mappable length is 15 bp,
                                 # so start with something slightly longer
            len_step=step,  # and go with the usual step
            nthreads=threads,  # on intel corei7 CPUs 4 threads are as fast as
                               # 8, but leave some room for your other applications
            # max_reads_per_chunk=10000000,  # optional, on low-memory machines
            temp_dir=tmpDir,
            bowtie_flags=bowtieFlags,
        )
    os.remove(file1)
    os.remove(file2)

    # Second step. Parse the mapped sequences into a Python data structure,
    # assign the ultra-sonic fragments to restriction fragments.
    mapped_reads = h5dict.h5dict(outfile)
    sf1, sf2 = [os.path.join(samFolder, os.path.split(onefile)[1] + ".sam")
                for onefile in [file1, file2]]
    mapping.parse_sam(sam_basename1=sf1, sam_basename2=sf2,
                      out_dict=mapped_reads, genome_db=genome_db,
                      save_seqs=False, maxReads=int(chunkSize * 1.6), IDLen=50)

    for i in os.listdir(samFolder):
        if ((os.path.split(file1)[1] in i) or (os.path.split(file2)[1] in i)) and not saveSams:
            print("deleting", i)
            os.remove(os.path.join(samFolder, i))
def export(self, filename, mode='cis'):
    mydict = h5dict(filename)
    if mode == 'cis':
        for i in self.cisKeys:
            data = self.data[i].getData()
            mydict["%d %d" % i] = data
    else:
        for i in self.allKeys:
            data = self.data[i].getData()
            mydict["%d %d" % i] = data
    mydict["resolution"] = self.resolution
def getNumCisReads(self):
    hd = h5dict(self.refined, 'r')
    mylen = len(hd.get_dataset("strands1"))
    chunks = range(0, mylen, 200000000) + [mylen]
    chunks = zip(chunks[:-1], chunks[1:])
    c1 = hd.get_dataset("chrms1")
    c2 = hd.get_dataset("chrms2")
    totsum = 0
    for st, end in chunks:
        totsum += np.sum(c1[st:end] == c2[st:end])
    return totsum
def parse_bams(chromosome_names, cell_line, path, genome_version, enzyme):
    if not os.path.exists(path + 'maps/' + cell_line):
        os.mkdir(path + 'maps/' + cell_line)
    for chrm_list in chromosome_names:
        if len(chrm_list) > 1:
            mapped_reads = h5dict.h5dict(path + 'maps/' + cell_line + '/mapped_reads_full.hdf5')
        else:
            mapped_reads = h5dict.h5dict(path + 'maps/' + cell_line + '/mapped_reads_' + chrm_list[0] + '.hdf5')
        genome_db = genome.Genome('/home/magnitov/data/genomes/' + genome_version,
                                  gapFile='gap.txt', readChrms=chrm_list,
                                  forceOrder=True)
        mapping.parse_sam(
            sam_basename1=path + 'bam/' + cell_line + '/' + cell_line + '_R1.bam',
            sam_basename2=path + 'bam/' + cell_line + '/' + cell_line + '_R2.bam',
            out_dict=mapped_reads,
            genome_db=genome_db,
            enzyme_name=enzyme)
def export(self, name, outFilename):
    if name not in self.dataDict:
        raise ValueError("No data {name}".format(name=name))
    toexport = {}
    toexport["heatmap"] = self.dataDict[name]
    toexport["resolution"] = self.resolution
    toexport["chromosomeStarts"] = self.chromosomeStarts
    myh5dict = h5dict(outFilename, mode="w")
    myh5dict.update(toexport)
def saveByChromosomeHeatmap(self, filename, resolution, gInfo, includeTrans=False):
    self.genome.setResolution(resolution)
    mydict = h5dict(filename)

    for chrom in range(self.genome.chrmCount):
        c1 = self.h5dict.get_dataset("chrms1")
        p1 = self.h5dict.get_dataset("cuts1")
        low = h5dictBinarySearch(c1, p1, (chrom, -1), "left")
        high = h5dictBinarySearch(c1, p1, (chrom, 999999999), "right")

        chr1 = self._getVector("chrms1", low, high)
        chr2 = self._getVector("chrms2", low, high)
        pos1 = np.array(self._getVector("mids1", low, high) // resolution,
                        dtype=np.int32)
        pos2 = np.array(self._getVector("mids2", low, high) // resolution,
                        dtype=np.int32)

        assert (chr1 == chrom).all()  # make sure the binary search worked

        args = np.argsort(chr2)
        chr2 = chr2[args]
        pos1 = pos1[args]
        pos2 = pos2[args]

        for chrom2 in range(chrom, self.genome.chrmCount):
            if (includeTrans == False) and (chrom2 != chrom):
                continue
            start = np.searchsorted(chr2, chrom2, "left")
            end = np.searchsorted(chr2, chrom2, "right")
            cur1 = pos1[start:end]
            cur2 = pos2[start:end]
            label = np.array(cur1, "int64")
            label *= self.genome.chrmLensBin[chrom2]
            label += cur2
            maxLabel = self.genome.chrmLensBin[chrom] * \
                self.genome.chrmLensBin[chrom2]
            counts = np.bincount(label, minlength=maxLabel)
            mymap = counts.reshape((self.genome.chrmLensBin[chrom], -1))
            if chrom == chrom2:
                mymap = mymap + mymap.T
                fillDiagonal(mymap, np.diag(mymap).copy() / 2)
            mydict["%d %d" % (chrom, chrom2)] = mymap

    mydict['resolution'] = resolution
    mydict['genomeInformation'] = gInfo
    return
def step4(hiclib_path, sraid, res=1000000):
    '''
    4. Eigenvector decomposition
    /examples/iterativeCorrectionEigenvectorExpansion/eigenvectorAnalysis.py
    '''
    import matplotlib.pyplot as plt
    import numpy as np
    from mirnylib import genome
    from mirnylib import h5dict
    from mirnylib import plotting
    from hiclib import binnedData

    genome_db = genome.Genome(hiclib_path + '/fasta/hg19', readChrms=['#', 'X'])

    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(sraid + '_map-res%sk.hdf5' % (res / 1000), mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(sraid + '_map-res%sk.hdf5' % (res / 1000), 'DataName')

    # Do the eigen decomposition
    BD.removeDiagonal()
    BD.removeBySequencedCount(0.5)
    BD.removeCis()
    BD.truncTrans(high=0.0005)
    BD.removePoorRegions(cutoff=1)
    BD.fakeCis()
    BD.removeZeros()
    BD.doEig(numPCs=30, force=True)  # first 30 eigenvectors
    BD.restoreZeros(value=0)

    eig = BD.eigEigenvalueDict['DataName']
    eig_v = BD.EigDict['DataName']

    # Plot the heatmap directly.
    plotting.plot_matrix(np.log(np.dot(np.dot(eig_v.T, np.diag(eig)), eig_v)))
    plt.savefig(sraid + '_map-res%sk-eig.pdf' % (res / 1000))
    plt.clf()

    outfile = open(sraid + "_map-res%sk-ic-eig.txt" % (res / 1000), "w")
    for i in xrange(len(BD.chromosomeIndex)):
        chro = BD.genome.idx2label[BD.chromosomeIndex[i]]
        posi = BD.positionIndex[i]
        outfile.write("chr%s\t%s\t%s" % (chro, posi, posi + res))
        for eigenvector in eig_v:
            outfile.write("\t%s" % eigenvector[i])
        outfile.write("\n")
    outfile.close()
def process():
    global options
    global args

    if (options.verbose):
        print >> sys.stdout, "*** START processing"

    fig = plt.gcf()

    logging.basicConfig(level=logging.DEBUG)

    if (options.verbose):
        print >> sys.stdout, "** Create directories"
    if not os.path.exists(options.tmpDir):
        os.mkdir(options.tmpDir)
    if not os.path.exists(options.outputDir):
        os.mkdir(options.outputDir)

    if (options.verbose):
        print >> sys.stdout, "** Create data objects"
    mapped_reads = h5dict.h5dict(options.outputDir + 'mapped_reads.hdf5')
    genome_db = genome.Genome(options.genome, gapFile=options.gapFile,
                              chrmFileTemplate='%s.fa',)

    # bams = []
    # if (options.inputFormat != 'bam'):
    #     bams = mapFiles()
    # else:
    #     bams = args[0:]

    if (options.verbose):
        print >> sys.stdout, "** Collect mapped reads"
    # collectMappedReads(bams[0], bams[1], mapped_reads, genome_db)

    if (options.verbose):
        print >> sys.stdout, "** Filter fragments"
    fragments = filterFragments(genome_db)

    if (options.verbose):
        print >> sys.stdout, "** Iterative filtering of fragments"
    iterativeFiltering(genome_db, fragments)

    if (options.verbose):
        print >> sys.stdout, "*** FINISHED processing"

    fig.savefig(options.outputDir + options.experiment + '.pdf')
def getGenomeMatrix(GENOME_HDF5):
    # Read hdf5
    f = h5dict.h5dict(GENOME_HDF5, mode='r')
    matrix = f['heatmap']
    chromosomeStarts = f['chromosomeStarts']
    binNumber = f['binNumber']
    # Get grid location and chromosome labels
    grids = list(chromosomeStarts) + [binNumber]
    genomeIdxToLabel = f['genomeIdxToLabel']
    chrmLabels = genomeIdxToLabel.values()
    # Return
    return matrix, grids, chrmLabels
def get3CProfile(CHROMOSOME_HDF5, ANCHOR_CHROMOSOME, ANCHOR,
                 REGION_CHROMOSOME, REGION_START, REGION_END):
    # Read hdf5
    f = h5dict.h5dict(CHROMOSOME_HDF5, mode='r')
    genomeIdxToLabel = f['genomeIdxToLabel']
    chromosomeStarts = f['chromosomeStarts']
    binNumber = f['binNumber']

    # Get chromosome information
    for i in range(len(genomeIdxToLabel)):
        chrmLabel = genomeIdxToLabel[i]
        if chrmLabel == ANCHOR_CHROMOSOME:
            anchor_chrmIdx = i
            anchor_chrmLen = _getChrmLen(i, chromosomeStarts, binNumber)
        if chrmLabel == REGION_CHROMOSOME:
            region_chrmIdx = i
            region_chrmLen = _getChrmLen(i, chromosomeStarts, binNumber)

    # Convert coordinates to bin numbers
    resolution = f['resolution']
    anchorBin = ANCHOR / resolution
    if REGION_START != None:
        regionStartBin = REGION_START / resolution
    else:
        regionStartBin = 0
    if REGION_END != None:
        regionEndBin = REGION_END / resolution + 1
    else:
        regionEndBin = region_chrmLen

    # Check bin numbers
    if anchorBin > anchor_chrmLen:
        print '[Error] Anchor coordinate (%s) exceeds chromosome length (%s).' % (ANCHOR, anchor_chrmLen * resolution)
        sys.exit(1)
    if regionEndBin < regionStartBin:
        print '[Error] Region start (%s) is larger than region end (%s).' % (regionStartBin * resolution, regionEndBin * resolution)
        sys.exit(1)
    if regionEndBin > region_chrmLen:
        print '[Error] Region (%s-%s) exceeds chromosome length (%s).' % (regionStartBin * resolution, regionEndBin * resolution, region_chrmLen * resolution)
        sys.exit(1)

    # Get matrix
    key = str(anchor_chrmIdx) + ' ' + str(region_chrmIdx)
    matrix = f[key]
    matrix = matrix[anchorBin, regionStartBin:regionEndBin]

    # Name output figure
    FIGURE = (CHROMOSOME_HDF5.split('/')[-1][:-5]
              + '_anchor_chr' + ANCHOR_CHROMOSOME
              + '_' + str(anchorBin * resolution)
              + '-' + str((anchorBin + 1) * resolution - 1)
              + '_region_chr' + REGION_CHROMOSOME
              + '_' + str(regionStartBin * resolution)
              + '-' + str(regionEndBin * resolution - 1))

    # Return
    return matrix, anchorBin, regionStartBin, regionEndBin, resolution, FIGURE
def merge(self, filenames):
    h5dicts = [h5dict(i, mode='r') for i in filenames]

    if all(["metadata" in i for i in h5dicts]):
        metadatas = [mydict["metadata"] for mydict in h5dicts]
        # print metadatas
        newMetadata = metadatas.pop()
        for oldData in metadatas:
            for key, value in oldData.items():
                if (key in newMetadata):
                    newMetadata[key] += value
                else:
                    log.warning('The key %s cannot be found in some files', key)
        self.metadata = newMetadata
        self.h5dict["metadata"] = self.metadata

    for name in self.vectors.keys():
        res = []
        IfIn = [(name in mydict.keys()) for mydict in h5dicts]
        if not all(IfIn):
            continue
        for mydict in h5dicts:
            res.append(mydict[name])
        res = np.concatenate(res)
        self.N = len(res)
        self.DSnum = self.N
        self._setData(name, res)
        self.h5dict.flush()
        time.sleep(0.2)  # allow buffers to flush

    Types = ['LeftType', 'RightType', 'InnerType', 'OuterType']
    check = all([(i in j) for i in Types for j in h5dicts])
    if check:
        LeftType = np.zeros(50, dtype=int)
        RightType = np.zeros(50, dtype=int)
        InnerType = np.zeros(50, dtype=int)
        OuterType = np.zeros(50, dtype=int)
        for mydict in h5dicts:
            LeftType += mydict['LeftType']
            RightType += mydict['RightType']
            InnerType += mydict['InnerType']
            OuterType += mydict['OuterType']
        self.h5dict['LeftType'] = LeftType
        self.h5dict['RightType'] = RightType
        self.h5dict['InnerType'] = InnerType
        self.h5dict['OuterType'] = OuterType
def showCmap():
    """Shows Hi-C data together with the simulated data.

    Hi-C data created by hiclib is needed for that, but you can replace the
    line mydict = h5dict(...) and the following line with your own data
    loading code.
    """
    low = 60000
    high = 75000
    lowMon = low * 1000 // 600
    highMon = high * 1000 // 600
    low20 = low // 10
    high20 = high // 10

    # Hi-C data is loaded here for display purposes only... replace this with
    # your own code if your data is in a different format.
    mydict = h5dict("/home/magus/HiC2011/Erez2014/hg19/GM12878_inSitu-all-combined-10k_HighRes.byChr", 'r')
    hicdata = mydict.get_dataset("13 13")[low20:high20, low20:high20]
    hicdata = completeIC(hicdata)
    curshape = hicdata.shape
    newshape = (1000 * (high - low)) // (600 * 5)
    print(hicdata.shape, newshape)
    hicdata = zoomArray(hicdata, (newshape, newshape))
    hicdata = np.clip(hicdata, 0, np.percentile(hicdata, 99.99))
    hicdata /= np.mean(np.sum(hicdata, axis=1))
    # hicdata = hm / np.mean(np.sum(hm, axis=1))

    for fname in os.listdir("cmaps"):
        cmap = pickle.load(open(os.path.join("cmaps", fname), 'rb'))
        # arr = coarsegrain(cmap, 2)
        arr = cmap
        if arr.shape[0] != hicdata.shape[0]:
            continue
        print(arr.shape)
        arr = arr / np.mean(np.sum(arr, axis=1))
        ran = np.arange(len(arr))
        mask = ran[:, None] > ran[None, :]
        arr[mask] = hicdata[mask]
        logarr = np.log(arr + 0.0001)
        # noinspection PyTypeChecker
        plt.imshow(logarr, vmax=np.percentile(logarr, 99.99),
                   vmin=np.percentile(logarr, 10),
                   extent=[low, high, high, low], interpolation="none")
        plt.savefig(os.path.join("heatmaps", fname + ".png"))
        plt.savefig(os.path.join("heatmaps", fname + ".pdf"))
        plt.show()
        plt.clf()
def extractResolutionFromFileName(fname):
    try:
        raw_heatmap = h5dict.h5dict(fname, mode='r')  # open heatmap
        resolution = int(raw_heatmap['resolution'])  # get the resolution
        del raw_heatmap  # close heatmap
        return resolution
    except:
        try:
            if "/" in fname:
                fname = fname.split("/")[-1]
            res = fname.split("res")[-1].split("k")[0]
            res = int(res) * 1000
            return res
        except:
            print "Warning! Unable to resolve resolution from file name"
            return None
def __init__(self, filename, genome, maximumMoleculeLength=500,
             inMemory=False, mode="a"):
    self.vectors = {
        # chromosomes for each read
        "chrms1": "int8", "chrms2": "int8",
        # midpoint of a fragment, determined as "(start+end)/2"
        "mids1": "int32", "mids2": "int32",
        # fragment lengths
        "fraglens1": "int32", "fraglens2": "int32",
        # distance between fragments. If -1, different chromosomes.
        # If -2, different arms.
        "distances": "int32",
        # IDs of fragments: fragIDmult * chromosome + location
        "fragids1": "int64", "fragids2": "int64",
        # distance to the restriction site
        "dists1": "int32", "dists2": "int32",
        # precise location of the cut-site
        "cuts1": "int32", "cuts2": "int32",
        "strands1": "bool", "strands2": "bool",
    }
    self.metadata = {}

    # -------Initialization of the genome and parameters-----
    self.mode = mode
    self.genome = genome
    self.chromosomeCount = self.genome.chrmCount
    self.fragIDmult = self.genome.fragIDmult  # used for building heatmaps

    self.maximumMoleculeLength = maximumMoleculeLength

    self.filename = os.path.abspath(os.path.expanduser(filename))  # file to save the data
    self.chunksize = 5000000  # chunk size for h5dict operations, external sorting, etc.
    self.inMemory = inMemory

    self.h5dict = h5dict(self.filename, mode=mode, in_memory=inMemory)

    if 'chrms1' in self.h5dict.keys():
        chrms1 = self.chrms1
        self.DSnum = self.N = len(chrms1)
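# Sketch of the fragment-ID convention documented in the vectors dict above:
# fragids = fragIDmult * chromosome + fragment midpoint, so IDs sort by
# chromosome first. The multiplier below is a made-up stand-in; the real
# value comes from the Genome object.
import numpy as np

fragIDmult = 10 ** 9
chrms = np.array([0, 1], dtype="int64")
mids = np.array([150000, 42000], dtype="int64")
print(mids + chrms * fragIDmult)  # [150000 1000042000]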
def getCoverage(filename):
    c = cooler.Cooler(filename)
    if "mm9" in filename:
        gen = "mm9"
    else:
        gen = "hg19"
    mygen = genomDict[gen]
    myd = h5dict(filename, 'r')
    coverages = []
    for mychr in range(mygen.chrmCount):
        data = c.matrix(sparse=True, balance=False).fetch(mygen.idx2label[mychr])
        coverage = np.sum(data, axis=1)
        coverages.append(np.array(coverage)[:, 0])
        assert len(coverages[-1]) == data.shape[0]
    return coverages
def __init__(self, genome, resolution, storageFile="inMemory", mode="w"):
    """
    Initializes the high-resolution Hi-C data storage.

    Parameters
    ----------
    genome : folder or Genome object
        Matching Genome object or folder to load it from
    resolution : int
        Resolution (number of bp per bin)
    storageFile : str (optional)
        File to store the h5dict. File will be created.
        By default stores in memory.
    mode : "w", "w-", "r+" or "a", optional
        Access mode to h5dict (see h5dict manual)
    """
    inMemory = (storageFile == "inMemory")

    self._h5dict = h5dict(storageFile, mode=mode, in_memory=inMemory)

    if type(genome) == str:
        genome = Genome(genome, readChrms=["#", "X"])
    assert isinstance(genome, Genome)

    self.genome = genome
    self.resolution = resolution
    self.genome.setResolution(resolution)

    if self.genome.numBins < 7000:
        print "Total number of bins in the genome is just %d" % self.genome.numBins
        warnings.warn("For low-resolution analysis use binnedData, as it "
                      "provides more analysis tools")

    M = self.genome.chrmCount
    self.cisKeys = [(i, i) for i in xrange(M)]
    self.transKeys = [(i, j) for i in range(M) for j in range(M) if j > i]
    self.allKeys = self.cisKeys + self.transKeys

    self.data = {}
    self._initChromosomes()
def map_reads(first_fq, second_fq, outfile, nice):
    # set the niceness of this sub-process:
    os.nice(nice)

    first_sam = first_fq.split(".fastq.gz")[0] + ".sam"
    second_sam = second_fq.split(".fastq.gz")[0] + ".sam"

    # map the first fastq file -> sam file
    length = check_len(first_fq)
    min_len, step_size = calculate_step(length - seq_skip_start, min_map_len)
    mapping.iterative_mapping(
        bowtie_path=bowtie_path,
        bowtie_index_path=bowtie_index,
        fastq_path=first_fq,
        out_sam_path=os.path.join(args.samdir, first_sam),
        min_seq_len=min_len,
        len_step=step_size,
        seq_start=seq_skip_start,
        nthreads=threads,
        bowtie_flags=bowtie_flags)

    # map the second fastq file -> sam file
    length = check_len(second_fq)
    min_len, step_size = calculate_step(length - seq_skip_start, min_map_len)
    mapping.iterative_mapping(
        bowtie_path=bowtie_path,
        bowtie_index_path=bowtie_index,
        fastq_path=second_fq,
        out_sam_path=os.path.join(args.samdir, second_sam),
        min_seq_len=min_len,
        len_step=step_size,
        seq_start=seq_skip_start,
        nthreads=threads,
        bowtie_flags=bowtie_flags)

    # parse the mapped sequences into the hdf5 dict structure and assign the
    # ultra-sonic (sheared) fragments to restriction fragments
    out_dict = os.path.join(args.samdir, outfile)
    mapped_reads = h5dict.h5dict(out_dict)
    sf1, sf2 = [os.path.join(args.samdir, first_sam),
                os.path.join(args.samdir, second_sam)]
    mapping.parse_sam(sam_basename1=sf1,
                      sam_basename2=sf2,
                      out_dict=mapped_reads,
                      genome_db=genome_db,
                      save_seqs=False,
                      maxReads=10000000,
                      IDLen=50,
                      enzyme_name='HindIII')
def saveHeatmap(self, filename, resolution, gInfo):
    try:
        os.remove(filename)
    except:
        pass
    tosave = h5dict(path=filename, mode='w')
    heatmap = self.buildAllHeatmap(resolution)
    tosave['heatmap'] = heatmap
    del heatmap
    chromosomeStarts = np.array(self.genome.chrmStartsBinCont)
    tosave['resolution'] = resolution
    tosave['chromosomeStarts'] = chromosomeStarts
    tosave['genomeInformation'] = gInfo
def doSaddleError(filename, eig, gen, correct=False):
    gen = Genome("/home/magus/HiC2011/data/" + gen, readChrms=["#", "X"])
    data = h5dict(filename, 'r')["heatmap"]
    if correct:
        data = completeIC(data)
    gen.setResolution(getResolution(filename))
    if eig == "GC":
        eig = np.concatenate(gen.GCBin)
    saddles = []
    permutted = []
    saddle = np.zeros((5, 5), dtype=float)
    for i in range(100):
        permutted.append(np.zeros((5, 5), dtype=float))
    for chrom in range(gen.chrmCount):
        st = gen.chrmStartsBinCont[chrom]
        end = gen.chrmEndsBinCont[chrom]
        cur = data[st:end, st:end]
        cur = observedOverExpected(cur)
        mask = np.sum(cur, axis=0) > 0
        cur = cur[mask]
        cur = cur[:, mask]
        GC = eig[st:end]
        GC = GC[mask]
        if len(GC) > 5:
            for i in range(5):
                for j in range(5):
                    G1, G2 = np.percentile(GC, [20 * i, 20 * i + 20])
                    mask1 = (GC > G1) * (GC < G2)
                    G1, G2 = np.percentile(GC, [20 * j, 20 * j + 20])
                    mask2 = (GC > G1) * (GC < G2)
                    addition = cur[np.ix_(mask1, mask2)]
                    addition = np.reshape(addition, (-1))
                    for k in range(100):
                        resampled = np.random.choice(addition, len(addition),
                                                     replace=True)
                        permutted[k][i, j] += resampled.mean()
                    saddle[i, j] += addition.mean()
    return saddle, permutted
def iterativeFiltering(genome_db, filesuffix):
    '''
    Filter the data at the binned level and perform the iterative correction.
    '''
    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(options.outputDir + options.experiment + filesuffix, mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(options.outputDir + options.experiment + filesuffix, options.experiment)

    # Remove the contacts between loci located within the same bin.
    BD.removeDiagonal()
    # Remove bins with less than half of a bin sequenced.
    BD.removeBySequencedCount(0.5)
    # Remove 1% of regions with low coverage.
    BD.removePoorRegions(cutoff=1)
    # Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts).
    BD.truncTrans(high=0.0005)
    # Remove empty bins.
    BD.removeZeros()
    # Perform iterative correction.
    BD.iterativeCorrectWithoutSS()

    # Save the iteratively corrected heatmap.
    BD.export(options.experiment,
              options.outputDir + options.experiment + '-IC' + filesuffix)

    plt.figure()
    plotting.plot_matrix(np.log(BD.dataDict[options.experiment]))
    pp.savefig()
def directionalityRatio(dataset, size=20):
    heatmap = 1. * h5dict(hm(dataset))["heatmap"]  # extract heatmap
    # Fill in the gaps in the heatmap. Not really needed, as heatmaps are
    # built with overlaps, so they have no gaps.
    for _ in range(1):
        zeros = np.sum(heatmap, axis=0) == 0
        zeros = np.nonzero(zeros)[0]
        heatmap[zeros] = heatmap[zeros - 1]
        heatmap[:, zeros] = heatmap[:, zeros - 1]
    # Following the regular IC protocol (see 033_....py)
    mirnylib.numutils.fillDiagonal(heatmap, 0, 0)
    mirnylib.numutils.fillDiagonal(heatmap, 0, 1)
    mirnylib.numutils.fillDiagonal(heatmap, 0, -1)
    heatmap = trunc(heatmap, low=0, high=0.0001)
    heatmap = ultracorrect(heatmap)
    diag2value = np.mean(np.diagonal(heatmap, 2))
    mirnylib.numutils.fillDiagonal(heatmap, 1.5 * diag2value, 0)
    mirnylib.numutils.fillDiagonal(heatmap, 1.2 * diag2value, 1)
    mirnylib.numutils.fillDiagonal(heatmap, 1.2 * diag2value, -1)
    heatmap /= np.mean(np.sum(heatmap, axis=0))
    # Put 9 copies of the heatmap in a huge square - Caulobacter is a ring,
    # and this is a cheap-and-dirty way to account for that.
    tiledHeatmap = np.hstack([heatmap, heatmap, heatmap])
    tiledHeatmap = np.vstack([tiledHeatmap, tiledHeatmap, tiledHeatmap])
    setExceptionHook()  # debug only
    start = len(heatmap)
    end = 2 * len(heatmap)
    ratios = []
    for mon in xrange(start, end):  # going through the central square
        upstream = tiledHeatmap[mon, mon:mon + size].sum()
        downstream = tiledHeatmap[mon - size:mon, mon].sum()
        ratios.append(upstream / (upstream + downstream))  # upstream/downstream ratio
    return ratios
def saveHeatmap(self, filename, resolution, countDiagonalReads='Once'):
    try:
        os.remove(filename)
    except:
        pass
    tosave = h5dict(path=filename, mode='w')
    heatmap = self.buildAllHeatmap(resolution, countDiagonalReads)
    tosave['heatmap'] = heatmap
    del heatmap
    chromosomeStarts = np.array(self.genome.chrmStartsBinCont)
    numBins = self.genome.numBins
    tosave['resolution'] = resolution
    tosave['genomeBinNum'] = numBins
    tosave['genomeIdxToLabel'] = self.genome.idx2label
    tosave['chromosomeStarts'] = chromosomeStarts
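# Reading back a heatmap written by saveHeatmap above (hypothetical filename);
# the keys mirror exactly what the function stores.
from mirnylib.h5dict import h5dict

hd = h5dict("heatmap-res-1M.hdf5", 'r')
print(hd['resolution'], hd['genomeBinNum'])
matrix = hd['heatmap']  # genomeBinNum x genomeBinNum matrix of counts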
def getNumReads(self):
    hd = h5dict(self.refined, 'r')
    return len(hd.get_dataset("strands1"))
    temp_dir='tmp',  # optional, keep temporary files here
    bowtie_flags='--very-sensitive')

mapping.iterative_mapping(
    bowtie_path=bowtiePath,
    bowtie_index_path=bowtieIndex,
    fastq_path=file2,
    out_sam_path='sams/%s_2.bam' % expName,
    min_seq_len=10,
    len_step=3,
    seq_start=0,
    seq_end=40,
    nthreads=4,  # on intel corei7 CPUs 4 threads are as fast as
                 # 8, but leave some room for your other applications
    # max_reads_per_chunk=10000000,  # optional, on low-memory machines
    temp_dir='tmp',  # optional, keep temporary files here
    bowtie_flags='--very-sensitive')

# B. Parse the mapped sequences into a Python data structure,
#    assign the ultra-sonic fragments to restriction fragments.
mapped_reads = h5dict.h5dict('caul/%s' % expName)
genome_db = genome.Genome('../data/caul', chrmFileTemplate="%s.fa", readChrms=[])

mapping.parse_sam(
    sam_basename1='sams/%s_1.bam' % expName,
    sam_basename2='sams/%s_2.bam' % expName,
    out_dict=mapped_reads,
    genome_db=genome_db,
    enzyme_name='BglII')
def export(self, filename):
    mydict = h5dict(filename)
    for i in self.allKeys:
        data = self.data[i].getData()
        mydict["%d %d" % i] = data
    mydict["resolution"] = self.resolution
def parseInputData(self, dictLike, **kwargs):
    import numexpr
    if not os.path.exists(dictLike):
        raise IOError('File not found: %s' % dictLike)
    dictLike = h5dict(dictLike, 'r')
    self.chrms1 = dictLike['chrms1']
    self.chrms2 = dictLike['chrms2']
    self.cuts1 = dictLike['cuts1']
    self.cuts2 = dictLike['cuts2']
    self.strands1 = dictLike['strands1']
    self.strands2 = dictLike['strands2']
    self.dists1 = np.abs(dictLike['rsites1'] - self.cuts1)
    self.dists2 = np.abs(dictLike['rsites2'] - self.cuts2)
    self.mids1 = (dictLike['uprsites1'] + dictLike['downrsites1']) / 2
    self.mids2 = (dictLike['uprsites2'] + dictLike['downrsites2']) / 2
    self.fraglens1 = np.abs(dictLike['uprsites1'] - dictLike['downrsites1'])
    self.fraglens2 = np.abs(dictLike['uprsites2'] - dictLike['downrsites2'])
    self.fragids1 = self.mids1 + np.array(self.chrms1, dtype='int64') * self.fragIDmult
    self.fragids2 = self.mids2 + np.array(self.chrms2, dtype='int64') * self.fragIDmult

    distances = np.abs(self.mids1 - self.mids2)
    distances[self.chrms1 != self.chrms2] = -1
    self.distances = distances  # distances between restriction fragments
    del distances

    self.N = len(self.chrms1)

    try:
        dictLike['misc']['genome']['idx2label']
        self.updateGenome(self.genome,
                          oldGenome=dictLike["misc"]["genome"]["idx2label"])
    except KeyError:
        assumedGenome = Genome(self.genome.genomePath)
        self.updateGenome(self.genome, oldGenome=assumedGenome)

    # Discard dangling ends and self-circles
    DSmask = (self.chrms1 >= 0) * (self.chrms2 >= 0)
    self.metadata['100_NormalPairs'] = DSmask.sum()

    sameFragMask = self.evaluate("a = (fragids1 == fragids2)",
                                 ["fragids1", "fragids2"]) * DSmask

    cutDifs = self.cuts2[sameFragMask] > self.cuts1[sameFragMask]
    s1 = self.strands1[sameFragMask]
    s2 = self.strands2[sameFragMask]
    SSDE = (s1 != s2)  # opposite strands: self-circle or dangling end
    SS = SSDE * (cutDifs == s2)  # reads facing outward: self-ligation
    SS_N = SS.sum()
    SSDE_N = SSDE.sum()
    sameFrag_N = sameFragMask.sum()
    self.metadata['120_SameFragmentReads'] = sameFrag_N
    self.metadata['122_SelfLigationReads'] = SS_N
    self.metadata['124_DanglingReads'] = SSDE_N - SS_N
    self.metadata['126_UnknownMechanism'] = sameFrag_N - SSDE_N

    mask = DSmask * (~sameFragMask)
    del DSmask, sameFragMask
    noSameFrag = mask.sum()

    # distance between sites facing each other
    dist = self.evaluate("a = numexpr.evaluate('- cuts1 * (2 * strands1 -1) - "
                         "cuts2 * (2 * strands2 - 1)')",
                         ["cuts1", "cuts2", "strands1", "strands2"],
                         constants={"numexpr": numexpr})

    readsMolecules = self.evaluate(
        "a = numexpr.evaluate('(chrms1 == chrms2) & (strands1 != strands2) & (dist >= 0) &"
        " (dist <= maximumMoleculeLength)')",
        internalVariables=["chrms1", "chrms2", "strands1", "strands2"],
        externalVariables={"dist": dist},
        constants={"maximumMoleculeLength": self.maximumMoleculeLength,
                   "numexpr": numexpr})

    mask *= (readsMolecules == False)
    extraDE = mask.sum()
    self.metadata['210_ExtraDanglingReads'] = -extraDE + noSameFrag
    if mask.sum() == 0:
        raise Exception('No reads left after filtering. Please check the input data')
    del dist, readsMolecules
    self.maskFilter(mask)
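# Sketch of the "dist" expression used by parseInputData above: for read1 on
# the forward strand (True) and read2 on the reverse strand (False) it
# reduces to cuts2 - cuts1, the length of the putative sequenced molecule;
# same-chromosome pairs with 0 <= dist <= maximumMoleculeLength are then
# flagged as extra dangling ends. The coordinates below are made up.
import numpy as np

cuts1, strands1 = np.array([1000]), np.array([True])   # forward read at 1000
cuts2, strands2 = np.array([1350]), np.array([False])  # reverse read at 1350
dist = -cuts1 * (2 * strands1 - 1) - cuts2 * (2 * strands2 - 1)
print(dist)  # [350] == cuts2 - cuts1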
readChrms = ["#", # read all numbered chromosomes "X"] # add X chromosome for inDataset in inDatasets.values(): if not os.path.exists(inDataset): raise IOError("Raw heatmap file does not exist: {}".format(inDataset)) if not os.path.isdir(genomeFolder): raise IOError("Genome folder does not exist") # When you do this, be sure that readChrms used to save heatmap matches # readChrms that you define here! genome = Genome(genomeFolder, readChrms=readChrms) # Read resolution from one of the datasets sampleDataset = h5dict(inDatasets.values()[0], mode="r") # random dataset resolution = int(sampleDataset["resolution"]) # Define the binnedData object, load data BD = binnedData(resolution, genome, readChrms) for name, filename in inDatasets.items(): BD.simpleLoad(filename, name) BD.removeDiagonal() # Remove bins with less than half of a bin sequenced BD.removeBySequencedCount(0.5) # Remove 1% of regions with low coverage BD.removePoorRegions(cutoff=1)
parser.add_argument("-d", "--datafile",
                    help="the dataset file to be output (default name is datasets.tsv, same dir as runs file)")
args = parser.parse_args()

# open and parse the runs file
runs_file = open(os.path.join(args.basedir, args.runsfile), "r")
runs = [run.split() for run in runs_file.readlines() if not run.startswith("#")]
runs_file.close()

# process each record in the runs file, write out to the datasets file
datasets_file = open(os.path.join(args.basedir, args.datafile), "w")
# print header for the datasets file
datasets_file.write("# The file has the following structure:\n")
datasets_file.write("# Filename\tExperiment\tReplicate\tGenome\tRestrictionEnzyme\n")
for run in runs:
    input_dir, experiment, replicate, genome, restriction_enzyme = run
    filenames = [j for j in os.listdir(os.path.join(args.basedir, input_dir))
                 if j.endswith(".hdf5")]
    for fname in filenames:
        try:
            mydict = h5dict(os.path.join(args.basedir, input_dir, fname), 'r')
        except:
            continue  # skip unreadable files
        if "strands1" not in mydict:
            raise ValueError("No 'strands1' dataset in %s" % fname)
        if len(mydict.get_dataset("strands1")) < 10000:
            raise ValueError("Fewer than 10000 reads in %s" % fname)
        datasets_file.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(
            os.path.join(input_dir, fname), experiment, replicate,
            genome, restriction_enzyme))
datasets_file.close()
def showAllDatasets():
    setExceptionHook()

    # plt.figure(figsize=(25, 15))
    fig = plt.figure()

    # size of the figure
    fw = fig.get_figwidth() * fig.get_dpi()
    fh = fig.get_figheight() * fig.get_dpi()

    # get subplot configuration
    sx, sy = subplots(len(datasets))

    for j, dataset in enumerate(datasets):
        curPlot = plt.subplot(sx, sy, j + 1)
        heatmap = 1. * h5dict(hm(dataset), 'r')["heatmap"]

        # fill in gaps - obsolete, as heatmaps are built with overlaps
        for _ in range(1):
            zeros = np.sum(heatmap, axis=0) == 0
            zeros = np.nonzero(zeros)[0]
            heatmap[zeros] = heatmap[zeros - 1]
            heatmap[:, zeros] = heatmap[:, zeros - 1]

        # regular IC protocol
        mirnylib.numutils.fillDiagonal(heatmap, 0, 0)
        mirnylib.numutils.fillDiagonal(heatmap, 0, 1)
        mirnylib.numutils.fillDiagonal(heatmap, 0, -1)
        heatmap = trunc(heatmap, low=0, high=0.0001)
        heatmap = ultracorrect(heatmap)
        diag2value = np.mean(np.diagonal(heatmap, 2))
        mirnylib.numutils.fillDiagonal(heatmap, 1.5 * diag2value, 0)
        mirnylib.numutils.fillDiagonal(heatmap, 1.2 * diag2value, 1)
        mirnylib.numutils.fillDiagonal(heatmap, 1.2 * diag2value, -1)
        newHeatmap = heatmap

        # Top highly expressed genes
        # genePos = [18, 56, 77, 117, 143, 215, 234, 256, 266, 286, 300, 326, 336, 367, 379]
        geneCoor = [1162773, 3509071, 1180887, 543099, 1953250, 2522439, 3328524,
                    1503879, 900483, 242693, 3677144, 3931680, 3677704, 3762707,
                    3480870, 3829656, 1424678, 901855, 1439056, 3678537]
        # here we committed to 10kb resolution - change below if you're not
        genePos = [i / 10000. for i in geneCoor]
        genePos = []

        # putting lines at highly expressed genes
        for lpos in genePos:
            plt.hlines(lpos, 0, 500, linewidth=0.7, color="black", alpha=0.2, zorder=1)
            plt.vlines(lpos, 0, 500, linewidth=0.7, color="black", alpha=0.2, zorder=1)

        # performing adaptive smoothing
        smoothedHeatmap = adaptiveSmoothing(newHeatmap, 20)
        smoothedHeatmap /= np.mean(np.sum(heatmap, axis=0))
        # print dataset, sum([np.diagonal(smoothedHeatmap, i).sum() for i in range(60, 140)])
        # maps = [[smoothedHeatmap, smoothedHeatmap[:30]],
        #         [smoothedHeatmap[:, :30], smoothedHeatmap[:30, :30]]]
        # smoothedHeatmap = np.hstack([np.vstack(i) for i in maps])

        allx = []
        ally = []

        plt.title(dataset, fontsize=10)
        plt.imshow((smoothedHeatmap), interpolation="none", vmax=0.035,
                   cmap="acidblues", zorder=0)
        # plt.imshow((smoothedHeatmap), interpolation="nearest", vmin=0,
        #            vmax=np.exp(-4.5), cmap="fall", zorder=0)
        plt.xticks([])
        plt.yticks([])
        plt.subplots_adjust(left=0.05,    # the left side of the subplots of the figure
                            right=0.95,   # the right side of the subplots of the figure
                            bottom=0.05,  # the bottom of the subplots of the figure
                            top=0.95,     # the top of the subplots of the figure
                            wspace=0.1,   # width reserved for blank space between subplots
                            hspace=0.2)
        # cPickle.dump(scaling, open(dataset.split("/")[-1] + "scaling", 'w'))
        # plt.ylim((400, 200))
        # plt.xlim((0, 200))

        # the code below puts the P(s) plot over the heatmap
        N = len(smoothedHeatmap)
        pts = np.array([[1, 0], [N, N], [N, 0]])
        p = Polygon(pts, closed=True, facecolor=(0.8, 0.8, 0.8),
                    linewidth=0, alpha=0.7, zorder=2)
        ax = plt.gca()
        ax.add_patch(p)

        Bbox = matplotlib.transforms.Bbox.from_bounds(.55, .55, .35, .42)
        tBbox = matplotlib.transforms.TransformedBbox(Bbox, ax.transAxes).get_points()
        l, b, w, h = (tBbox[0, 0] / fw, tBbox[0, 1] / fh,
                      (tBbox[1, 0] - tBbox[0, 0]) / fw,
                      (tBbox[1, 1] - tBbox[0, 1]) / fh)
        axins = fig.add_axes([l, b, w, h], axisbg=(0, 0, 0, 0),
                             xscale="log", yscale="log")
        removeAxes(ax=axins)
        for xlabel_i in axins.get_xticklabels():
            xlabel_i.set_fontsize(6)
        for xlabel_i in axins.get_yticklabels():
            xlabel_i.set_fontsize(6)

        N = len(smoothedHeatmap)
        st = int(0.05 * N)
        end = int(0.45 * N)
        st2 = int(0.55 * N)
        end2 = int(0.95 * N)
        axins.plot(*scaling(0.5 * (smoothedHeatmap[st:end, st:end] +
                                   smoothedHeatmap[st2:end2, st2:end2])),
                   color="blue", label="intra-arm")
        if dataset in ['Wildtype_0min_BglII_rep1', "ML2000_0hr"]:
            myscaling = scaling(0.5 * (smoothedHeatmap[st:end, st:end] +
                                       smoothedHeatmap[st2:end2, st2:end2]))
        # axins.plot(*scaling(smoothedHeatmap[st:end, end2:st2:-1]),
        #            color="green", label="inter-arm")
        axins.set_xlabel("kb", fontsize=6)
        axins.set_ylabel("Pc", fontsize=6)
        axins.grid()

        if "myscaling" in locals():
            axins.plot(*myscaling, color="grey")

        # axins.set_xticks([])
        # axins.set_yticks([])
        # axins.tick_params(color="red")
        # axins.set_xlabel("Mb")
        # axins.set_ylabel("Pc")
        for i, line in enumerate(axins.get_xticklines() + axins.get_yticklines()):
            if i % 2 == 1:  # odd indices
                line.set_visible(False)

        # if dataset != "Wildtype_0min_BglII_rep1":
        #     data = cPickle.load(open("scalings/{0}".format(dataset)))
        #     axins.plot(*data, color="blue")
        # axins.xscale("log")
        # axins.yscale("log")
        # end strange code

    plt.show()
#!/usr/bin/env python

import sys
from hiclib import mapping, fragmentHiC
from mirnylib import h5dict, genome

basedir = sys.argv[1]

mapped_reads1 = h5dict.h5dict('%s/Data/Timing/mapped_reads1.hdf5' % basedir)
mapped_reads2 = h5dict.h5dict('%s/Data/Timing/mapped_reads2.hdf5' % basedir)
mapped_reads3 = h5dict.h5dict('%s/Data/Timing/mapped_reads3.hdf5' % basedir)
genome_db = genome.Genome('%s/Data/Genome/mm9_fasta' % basedir,
                          readChrms=['1'], chrmFileTemplate="%s.fa")

mapping.parse_sam(
    sam_basename1='%s/Data/Timing/SRR443886_sub_1.bam' % basedir,
    sam_basename2='%s/Data/Timing/SRR443886_sub_2.bam' % basedir,
    out_dict=mapped_reads1,
    genome_db=genome_db,
    enzyme_name='NcoI')

mapping.parse_sam(
    sam_basename1='%s/Data/Timing/SRR443887_sub_1.bam' % basedir,
    sam_basename2='%s/Data/Timing/SRR443887_sub_2.bam' % basedir,
    out_dict=mapped_reads2,
    genome_db=genome_db,
    enzyme_name='NcoI')

mapping.parse_sam(
    sam_basename1='%s/Data/Timing/SRR443888_sub_1.bam' % basedir,
    sam_basename2='%s/Data/Timing/SRR443888_sub_2.bam' % basedir,
from mirnylib.h5dict import h5dict
from mirnylib.genome import Genome
from hiclib.highResBinnedData import HiResHiC
from hiclib.binnedData import binnedData
import numpy as np
import sys
import os

genome = Genome(sys.argv[1], readChrms=["1", "2", "3", "4", "5"])

a = HiResHiC(genome, 1000000, "hiResDict", mode='w')
a.loadData(dictLike="../fragmentHiC/test-1M-byChr.hm")
a.removeDiagonal()
a.removePoorRegions(2)
a.iterativeCorrection(1e-10)

b = binnedData(1000000, genome)
data = {"heatmap": h5dict("../fragmentHiC/test-1M.hm")["heatmap"]}
lim = b.genome.chrmEndsBinCont[-1]
data["heatmap"] = data["heatmap"][:lim, :lim]
b.simpleLoad(data, "data")
b.removeDiagonal()
b.removePoorRegions(cutoff=2)
b.iterativeCorrectWithoutSS(tolerance=1e-10)
a.export("testExport")


def compareData():
    dataHigh = a.getCombinedMatrix()
    dataLow = b.dataDict["data"]
    dataHigh /= dataHigh.mean()
    dataLow /= dataLow.mean()
print "Checking for numpy version..", try: nv = numpy.__version__ nums = tuple([int(i) for i in nv.split('.')[:2]]) assert nums >= (1, 6) print "Correct!" except: print "numpy version is %s" % nv print "Needs at least numpy 1.6" print "See manual for numpy installation guide" raise RuntimeError("Wrong numpy version") print "Checking for mirnylib.h5dict install..", from mirnylib.h5dict import h5dict a = h5dict() b = numpy.empty(1000000, dtype="int16") c = "bla bla bla" a["numpy"] = b a["object"] = c assert (a["numpy"] - b).sum() == 0 print "H5dict test successful!" print "Checking for joblib..", try: import joblib print "Found!" except: print "joblib not found" raise RuntimeError("joblib not found")
#!/usr/bin/env python import sys import os from hiclib import mapping, fragmentHiC from mirnylib import h5dict, genome fasta_dir, re_name, out_fname, in_dir = sys.argv[1:5] in_prefices = sys.argv[5:] basedir = os.path.split(os.path.abspath(out_fname))[0] mapped_reads = [] for prefix in in_prefices: mapped_reads.append(h5dict.h5dict('%s/%s.hdf5' % (basedir, prefix))) genome_db = genome.Genome(fasta_dir, readChrms=['#', 'X'], chrmFileTemplate="%s.fa") for i, name in enumerate(mapped_reads): mapping.parse_sam( sam_basename1="%s/%s_1.bam" % (in_dir, in_prefices[i]), sam_basename2="%s/%s_2.bam" % (in_dir, in_prefices[i]), out_dict=name, genome_db=genome_db, enzyme_name=re_name) for i, name in enumerate(mapped_reads): fragments = fragmentHiC.HiCdataset( filename='temp', genome=genome_db, maximumMoleculeLength=500, mode='w',
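        # (the script is truncated above mid-constructor; a plausible
        # continuation, a sketch only and not the original, would close the
        # HiCdataset call with enzymeName=re_name and then parse and save
        # each dict before merging into out_fname, e.g.:
        #     fragments.parseInputData(dictLike=name)
        #     fragments.save('%s/%s_parsed.frag' % (basedir, in_prefices[i]))
        # following the pattern of refine_paper further below)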
def saveByChromosomeHeatmap(self, filename, resolution=40000,
                            includeTrans=False,
                            countDiagonalReads="Once"):
    """
    Saves chromosome-by-chromosome heatmaps to an h5dict.

    This method is much less memory-demanding than saving an all-by-all
    heatmap.

    Keys of the resulting h5dict are strings like "1 1": two zero-based
    chromosome indices separated by a single space.

    Parameters
    ----------
    filename : str
        Filename of the output h5dict
    resolution : int
        Resolution at which to save the heatmaps (default: 40000)
    includeTrans : bool, optional
        Also build inter-chromosomal heatmaps (default: False)
    countDiagonalReads : "once" or "twice"
        How many times to count reads falling into the diagonal bin
    """
    if countDiagonalReads.lower() not in ["once", "twice"]:
        raise ValueError("Bad value for countDiagonalReads")
    self.genome.setResolution(resolution)

    pos1 = self.evaluate("a = np.array(mids1 / {res}, dtype = 'int32')"
                         .format(res=resolution), "mids1")
    pos2 = self.evaluate("a = np.array(mids2 / {res}, dtype = 'int32')"
                         .format(res=resolution), "mids2")
    chr1 = self.chrms1
    chr2 = self.chrms2
    # DS = self.DS  # 13 bytes per read up to now, 16 total
    mydict = h5dict(filename)

    for chrom in xrange(self.genome.chrmCount):
        if includeTrans:
            mask = ((chr1 == chrom) + (chr2 == chrom))
        else:
            mask = ((chr1 == chrom) * (chr2 == chrom))
        # chromosomes and positions of the selected reads
        c1, c2, p1, p2 = chr1[mask], chr2[mask], pos1[mask], pos2[mask]
        if includeTrans:
            # moving different chromosomes to c2; c1 == chrom now
            mask = (c2 == chrom) * (c1 != chrom)
            c1[mask], c2[mask], p1[mask], p2[mask] = (
                c2[mask].copy(), c1[mask].copy(),
                p2[mask].copy(), p1[mask].copy())
            del c1  # c1 is now uniformly equal to chrom; ignore it
            args = np.argsort(c2)
            c2 = c2[args]
            p1 = p1[args]
            p2 = p2[args]

        for chrom2 in xrange(chrom, self.genome.chrmCount):
            if (not includeTrans) and (chrom2 != chrom):
                continue
            start = np.searchsorted(c2, chrom2, "left")
            end = np.searchsorted(c2, chrom2, "right")
            cur1 = p1[start:end]
            cur2 = p2[start:end]
            # encode each (bin1, bin2) pair as a single int64 label,
            # then histogram the labels to get the contact matrix
            label = np.asarray(cur1, "int64")
            label *= self.genome.chrmLensBin[chrom2]
            label += cur2
            maxLabel = self.genome.chrmLensBin[chrom] * \
                self.genome.chrmLensBin[chrom2]
            counts = np.bincount(label, minlength=maxLabel)
            assert len(counts) == maxLabel
            mymap = counts.reshape((self.genome.chrmLensBin[chrom], -1))
            if chrom == chrom2:
                mymap = mymap + mymap.T
                if countDiagonalReads.lower() == "once":
                    fillDiagonal(mymap, np.diag(mymap).copy() / 2)
            mydict["%d %d" % (chrom, chrom2)] = mymap

    mydict['resolution'] = resolution
    return
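# Usage sketch for saveByChromosomeHeatmap. The file name and the existence
# of a populated HiCdataset `TR` are assumptions; the key format follows the
# docstring above.
TR.saveByChromosomeHeatmap("test-byChr.hm", resolution=1000000,
                           includeTrans=True, countDiagonalReads="once")
lib = h5dict("test-byChr.hm", mode='r')
cis00 = lib["0 0"]    # intra-chromosomal map of the first chromosome
trans01 = lib["0 1"]  # inter-chromosomal map (present since includeTrans=True)
print lib['resolution']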
def refine_paper(filename, create=True):
    """filename[0] is a list of filenames of incoming files
    filename[1] is a folder for the outgoing file"""

    if create:
        for onename in filename[0]:
            # Parsing individual files
            if not os.path.exists(onename):
                raise IOError("path not found: %s" % onename)
            TR = HiCdataset("bla", genome=genomeFolder,
                            enzymeName="HindIII",
                            maximumMoleculeLength=500, inMemory=True)
            print "\nTesting loading new data without rsite information"
            TR.parseInputData(dictLike=onename,
                              enzymeToFillRsites="HindIII")
            #assert len(TR.DS) == 856143
            #assert len(TR.ufragments) == 634572
            TR.save(onename + "_parsed.frag")

        # Merging files all together, applying filters
        TR = HiCdataset(filename[1] + "_merged.frag",
                        enzymeName="HindIII",
                        genome=genomeFolder, mode="w")
        TR.merge([i + "_parsed.frag" for i in filename[0]])

        TR = HiCdataset("refined", genome=genomeFolder,
                        enzymeName="HindIII", mode="w", inMemory=True)

        print "\nTesting chunking during all tests"
        TR.chunksize = 30000
        # because we do many operations, we disable autoFlush here
        TR.load(filename[1] + "_merged.frag")

        print "\nTesting Rsite filter"
        TR.filterRsiteStart(offset=5)
        #assert len(TR.DS) == 832110

        print "\nTesting duplicate filter"
        TR.filterDuplicates(chunkSize=30000)
        #assert len(TR.DS) == 830275

        print "\nTesting small/large and extreme fragment filter"
        TR.filterLarge()
        #assert len(TR.DS) == 825442
        TR.filterExtreme(cutH=0.005, cutL=0)
        TR.writeFilteringStats()
        #assert len(TR.DS) == 803845

    #-------------------------------------------
    TR.printMetadata(saveTo="metadata")
    import cPickle

    stop = False
    mdata = cPickle.load(open("sampleMetadata"))
    for i in sorted(mdata.keys()):
        if TR.metadata[i] != mdata[i]:
            print "Key {0} is not consistent: should be {1}, is {2}".format(
                i, mdata[i], TR.metadata[i])
            stop = True
    if stop:
        print ("""------------ ERROR ------------
        Inconsistent metadata: see above
        -------------------------------""")
        raise ValueError("Inconsistent Metadata")

    print "Testing allxall and by-chromosome heatmap counting diagonal twice"
    print "----> saving allxall heatmap"
    TR.saveHeatmap(filename[1] + "-1M.hm", 1000000,
                   countDiagonalReads="twice")
    a = h5dict(filename[1] + "-1M.hm")
    st, end = TR.genome.chrmStartsBinCont[1], TR.genome.chrmEndsBinCont[1]
    st2, end2 = TR.genome.chrmStartsBinCont[2], TR.genome.chrmEndsBinCont[2]
    chrom1 = a["heatmap"][st:end, st:end]
    chrom12 = a["heatmap"][st:end, st2:end2]
    setExceptionHook()

    print "----> saving by chromosome heatmap"
    TR.saveByChromosomeHeatmap(
        filename[1] + "-1M.hm", resolution=1000000,
        includeTrans=True, countDiagonalReads="twice")

    b = h5dict(filename[1] + "-1M.hm")["1 1"]
    bb = h5dict(filename[1] + "-1M.hm")["1 2"]
    assert (b - chrom1).sum() == 0
    print "Cis heatmap consistent"
    assert (bb - chrom12).sum() == 0
    print "Trans heatmap consistent"
    print a["heatmap"][::10, ::10].sum()
    #assert a["heatmap"][::10, ::10].sum() == 21800
    print "Heatmap sum correct\n"

    #---------------------------------
    print "Testing allxall and by-chromosome heatmap counting diagonal once"
    TR.saveHeatmap(filename[1] + "-1M.hm", 1000000,
                   countDiagonalReads="once")
    Ta = h5dict(filename[1] + "-1M.hm")
    st, end = TR.genome.chrmStartsBinCont[1], TR.genome.chrmEndsBinCont[1]
    st2, end2 = TR.genome.chrmStartsBinCont[2], TR.genome.chrmEndsBinCont[2]
    chrom1 = Ta["heatmap"][st:end, st:end]
    chrom12 = Ta["heatmap"][st:end, st2:end2]
    setExceptionHook()

    print "----> saving by chromosome heatmap"
    TR.saveByChromosomeHeatmap(
        filename[1] + "-1M-byChr.hm", resolution=1000000,
        includeTrans=True, countDiagonalReads="once")
    TR.saveHiResHeatmapWithOverlaps(
        filename[1] + "-1M-highRes.hm", resolution=50000,
        countDiagonalReads="twice")
    TR.saveSuperHighResMapWithOverlaps(
        filename[1] + "-5k-SuperHighRes.hm", resolution=5000,
        chromosomes=[14], countDiagonalReads="twice")

    Tb = h5dict(filename[1] + "-1M-byChr.hm")["1 1"]
    Tbb = h5dict(filename[1] + "-1M-byChr.hm")["1 2"]
    assert ((Tb - chrom1) == 0).all()
    assert ((Tbb - chrom12) == 0).all()
    assert ((Tb + np.diag(np.diag(Tb))) == b).all()
    print "Diagonal counting methods are consistent\n"

    # build the reference: the "once" cis map with its diagonal doubled
    newchrom1 = chrom1.copy()
    for i in xrange(len(newchrom1)):
        newchrom1[i, i] = 2 * newchrom1[i, i]

    Tb = h5dict(filename[1] + "-1M-highRes.hm")["1 1"]
    assert np.abs(Tb.sum() - newchrom1.sum()) < 1
    assert np.sum(np.abs(coarsegrain(Tb, 20, True) - newchrom1)) < 500

    #------------------------------
    print "Testing updateGenome method"
    from mirnylib.genome import Genome
    removeChromIDs = np.array([0, 1, 1, 1, 1] + [0] * 17 + [1] + [0])
    #print ((removeChromIDs[TR.chrms1] == 1) + (removeChromIDs[TR.chrms2] == 1)).sum()
    t = ((removeChromIDs[TR.chrms1] == 1) * (removeChromIDs[TR.chrms2] == 1)).sum() \
        + ((removeChromIDs[TR.chrms1] == 1) * (TR.chrms2 == -1)).sum()
    newGenome = Genome(genomePath=genomeFolder,
                       readChrms=["2", "3", "4", "5", "X"])
    TR.updateGenome(newGenome)
    assert TR.N == t

    a = h5dict(filename[1] + "-1M.hm")["heatmap"]
def toSparse(source, idx2label, csr=False):
    """
    Convert intra-chromosomal contact matrices to sparse ones.

    Parameters
    ----------
    source : str
        HDF5 file name.
    idx2label : dict
        A dictionary mapping zero-based chromosome indices to string
        chromosome labels.
    csr : bool
        Whether to store the matrices in CSR (Compressed Sparse Row)
        format.
    """
    import zipfile, tempfile
    from numpy.lib.format import write_array
    from scipy import sparse

    lib = h5dict(source, mode='r')

    ## Uniform numpy-structured-array format
    itype = np.dtype({'names': ['bin1', 'bin2', 'IF'],
                      'formats': [np.int, np.int, np.float]})

    ## Create a Zip file in the NPZ case
    if not csr:
        output = source.replace('.hm', '-sparse.npz')
    else:
        output = source.replace('.hm', '-csrsparse.npz')
    Zip = zipfile.ZipFile(output, mode='w', allowZip64=True)
    fd, tmpfile = tempfile.mkstemp(suffix='-numpy.npy')
    os.close(fd)

    log.log(21, 'Sparse matrices will be saved to %s', output)
    log.log(21, 'Only intra-chromosomal matrices will be taken into account')
    log.log(21, 'Converting ...')

    count = 0

    for i in lib:
        if (i != 'resolution') and (len(set(i.split())) == 1):
            # an intra-chromosomal key such as "0 0"
            key = idx2label[int(i.split()[0])]
            log.log(21, 'Chromosome %s ...', key)
            # 2D matrix
            H = lib[i]

            if not csr:
                # the matrix is symmetric, so keep the upper triangle only
                Triu = np.triu(H)
                # sparse triplets in memory
                x, y = np.nonzero(Triu)
                values = Triu[x, y]
                temp = np.zeros(values.size, dtype=itype)
                temp['bin1'] = x
                temp['bin2'] = y
                temp['IF'] = values
            else:
                temp = sparse.triu(H, format='csr')

            fname = key + '.npy'
            fid = open(tmpfile, 'wb')
            try:
                write_array(fid, np.asanyarray(temp))
                fid.close()
                fid = None
                Zip.write(tmpfile, arcname=fname)
            finally:
                if fid:
                    fid.close()

            log.log(21, 'Done!')
            count += 1

    # Store the resolution information
    if 'resolution' in lib:
        fname = 'resolution.npy'
        fid = open(tmpfile, 'wb')
        try:
            write_array(fid, np.asanyarray(lib['resolution']))
            fid.close()
            fid = None
            Zip.write(tmpfile, arcname=fname)
        finally:
            if fid:
                fid.close()

    if count == 0:
        log.warning('Empty source file!')

    os.remove(tmpfile)
    Zip.close()
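# Loading sketch for the NPZ produced by toSparse. Assumes the default,
# non-CSR output and that a chromosome labeled "1" was present; the file
# name is hypothetical. Keys of the NPZ are the archive names minus ".npy".
import numpy as np

lib = np.load('test-sparse.npz')
arr = lib['1']                        # structured array: fields bin1, bin2, IF
print arr['bin1'][:5], arr['IF'][:5]
print int(lib['resolution'])
# (the CSR variant stores each matrix as a 0-d object array, so loading it
# on modern numpy requires np.load(..., allow_pickle=True))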