예제 #1
0
def fakeData(filename, combinedFilename, results):
    """
    Create reshuffled data by resampling contacts from a combined dataset.

    For every key of ``filename`` that holds a 2D array, the row sums of that
    array are divided by the row sums of the array stored under the same key
    in ``combinedFilename``. The combined array, scaled row-wise by half of
    that ratio, is used as the mean of a Poisson draw; the draw plus its
    transpose is stored in ``results`` under the same key.

    :param filename: filename of the data to create reshuffled data from
    :param combinedFilename: filename of combined data to sample reshuffled contacts from
    :param results: h5dict or dict-like to put reshuffled data into
    :return: updated h5dict or dict-like
    """
    source = h5dict(filename, 'r')
    combined = h5dict(combinedFilename, 'r')
    for key in source.keys():
        # Fetch the entry once instead of three times per key
        # (for an h5dict every lookup presumably re-reads the dataset).
        original = source[key]
        if not (hasattr(original, "shape") and len(original.shape) == 2):
            continue

        rowSums = np.sum(original, axis=1)
        total = rowSums.sum()
        del original  # release the matrix before loading the combined one

        pooled = combined[key]
        coverage = np.sum(pooled, axis=1)
        # The small constant keeps the denominator away from zero when a row
        # of the combined data sums to 0.
        marginals = 0.5 * rowSums / (coverage + 0.00001)
        bad = ~np.isfinite(marginals)
        if bad.sum() > 0:
            print("bad marginals")
        marginals[bad] = 0

        expected = pooled * marginals[:, None]
        del pooled
        sample = np.random.poisson(expected)
        del expected
        result = sample + sample.T
        # Original total next to the resampled total, for a quick sanity check
        print(key, total, result.sum())
        del sample
        results[key] = result
    return results
예제 #2
0
def convertFile(filename, folder, gz=True):
    """
    Dump every entry of an h5dict file into its own file inside ``folder``.

    Arrays with at least one dimension are written as text: 2D arrays through
    ``matrixToGzippedFile``, all other arrays through ``numpy.savetxt``. When
    ``gz`` is true, ".gz" is appended to the names of those files. Every other
    value is written as a string (``str`` objects as they are, anything else
    via ``repr``).

    :param filename: path of the h5dict file to read
    :param folder: output directory; created when it does not exist
    :param gz: append ".gz" to the names of array files
    :raises IOError: if ``filename`` does not exist or ``folder`` is a file
    """
    if not os.path.exists(filename):
        print(("Filename does not exist", filename))
        raise IOError("File not found: %s" % filename)
    if os.path.isfile(folder):
        raise IOError("Supplied folder is a file! ")
    if not os.path.exists(folder):
        os.mkdir(folder)

    source = h5dict(filename, 'r')
    for key in list(source.keys()):
        value = source[key]
        target = os.path.join(folder, key)

        if isinstance(value, numpy.ndarray):
            print(("saving numpy array", key, "to", target))
            ndim = len(value.shape)
            if ndim > 0:
                if gz:
                    target += ".gz"
                if ndim == 2:
                    matrixToGzippedFile(value, target)
                else:
                    numpy.savetxt(target, value)
                continue

        # Zero-dimensional arrays and all non-array values end up here.
        text = value if type(value) == str else repr(value)
        print(("saving data", key, "to", target))
        with open(target, 'w') as out:
            out.write(text)
예제 #3
0
def iterativeFiltering(genome_db, fragments):
    '''
    Filter the data at the binned level and perform the iterative correction.

    Reads ``heatmap-res-1M.hdf5`` from ``options.outputDir``, filters and
    corrects it, exports the result as ``IC-heatmap-res-1M.hdf5`` in the same
    directory and plots the logarithm of the corrected matrix.

    genome_db -- genome object handed to ``binnedData``
    fragments -- not used by this function
    '''
    rawPath = options.outputDir + 'heatmap-res-1M.hdf5'

    # The bin size is stored inside the heatmap file itself.
    heatmapFile = h5dict.h5dict(rawPath, mode='r')
    binSize = int(heatmapFile['resolution'])

    binned = binnedData.binnedData(binSize, genome_db)
    binned.simpleLoad(rawPath, options.experiment)

    # Filtering steps, in order:
    binned.removeDiagonal()             # contacts between loci within the same bin
    binned.removeBySequencedCount(0.5)  # bins with less than half of a bin sequenced
    binned.removePoorRegions(cutoff=1)  # 1% of regions with low coverage
    binned.truncTrans(high=0.0005)      # top 0.05% of interchromosomal counts

    binned.iterativeCorrectWithoutSS()

    # Save and display the iteratively corrected heatmap.
    binned.export(options.experiment,
                  options.outputDir + 'IC-heatmap-res-1M.hdf5')
    plotting.plot_matrix(np.log(binned.dataDict[options.experiment]))
def getChromosomesMatrix(CHROMOSOME_HDF5, PLOT_CHROMOSOME):
    """
    Assemble one matrix from the per-chromosome-pair blocks of an h5dict file.

    CHROMOSOME_HDF5 -- path of the h5dict file. The keys 'genomeIdxToLabel',
        'chromosomeStarts' and 'binNumber' are read, plus one "<i> <j>" entry
        for every pair of selected chromosome indices.
    PLOT_CHROMOSOME -- container of the chromosome labels to include.

    Returns (matrix, grids, chrmLabels): the concatenated matrix, the
    cumulative block boundaries (starting with 0) and the labels of the
    selected chromosomes in index order.
    """
    store = h5dict.h5dict(CHROMOSOME_HDF5, mode='r')
    labels = store['genomeIdxToLabel']
    starts = store['chromosomeStarts']
    totalBins = store['binNumber']

    # Pick the requested chromosomes and accumulate their sizes.
    count = len(labels)
    selected = []
    chrmLabels = []
    grids = [0]
    for idx in range(count):
        label = labels[idx]
        if label not in PLOT_CHROMOSOME:
            continue
        selected.append(idx)
        chrmLabels.append(label)
        # The last chromosome extends up to the total number of bins.
        upper = totalBins if idx == count - 1 else starts[idx + 1]
        grids.append(grids[-1] + (upper - starts[idx]))

    # One horizontal strip per selected chromosome, stacked vertically.
    strips = [
        np.concatenate([store[str(a) + ' ' + str(b)] for b in selected],
                       axis=1)
        for a in selected
    ]
    matrix = np.concatenate(strips, axis=0)

    return matrix, grids, chrmLabels
예제 #5
0
def convertFile(filename,outFilename):
    """
    Copy a fixed set of read-pair entries from an h5dict file into an HDF5 file.

    Entries that are numpy arrays become datasets of ``outFilename``; boolean
    arrays are stored as int8 zeros and ones. An entry that is not an array is
    written to "<key>.txt" in the current directory instead (``str`` as is,
    anything else via ``repr``).

    :param filename: path of the h5dict file to read
    :param outFilename: path of the HDF5 file to create (opened with mode 'w')
    :raises IOError: if ``filename`` does not exist
    """
    if not os.path.exists(filename):
        raise IOError("File not found: %s" % filename)

    selectedKeys = ['chrms1', 'chrms2', 'cuts1', 'cuts2',
                    'rfragIdxs1', 'rfragIdxs2', 'strands1', 'strands2',
                    'rsites1', 'rsites2']

    mydict = h5dict(filename, 'r')
    outDict = h5py.File(outFilename, mode='w')
    try:
        for i in selectedKeys:
            keyData = mydict[i]
            if isinstance(keyData, numpy.ndarray):
                print(("saving numpy array", i, "to", outFilename))
                # Checking the dtype (rather than the first element) also
                # works for empty arrays.
                if keyData.dtype == numpy.bool_:
                    binaryStrands = numpy.zeros(keyData.shape,
                                                dtype=numpy.int8)
                    binaryStrands[keyData] = 1
                    outDict.create_dataset(i, data=binaryStrands)
                else:
                    outDict.create_dataset(i, data=keyData)
                continue

            # Text fallback. This used to sit after the loop, so it ran
            # exactly once, for the last key, whatever its type was.
            txtSavefile = i + '.txt'
            datarepr = keyData if isinstance(keyData, str) else repr(keyData)
            print(("saving data", i, "to", txtSavefile))
            with open(txtSavefile, 'w') as f:
                f.write(datarepr)
    finally:
        # Close the output file even when a key is missing or a write fails.
        outDict.close()
예제 #6
0
def process():
    """
    Run the whole pipeline for one experiment, driven by the global ``options``.

    Steps: create the temporary and output directories, open the PDF report
    (stored in the global ``pp``), map the input files unless they already are
    BAM files, collect the mapped reads, filter fragments, run the iterative
    filtering and produce the scaling and arm plots. The PDF is closed at the
    end, also when one of the steps raises.
    """
    global options
    global args
    global pp

    if options.verbose:
        print("*** START processing")

    # Creates a new matplotlib figure; the handle itself is not needed here.
    plt.figure()

    logging.basicConfig(level=logging.DEBUG)

    if options.verbose:
        print("**  Create directories")

    for directory in (options.tmpDir, options.outputDir):
        if not os.path.exists(directory):
            os.mkdir(directory)

    # Open the PDF only now: it lives in outputDir, which may not have
    # existed before the loop above.
    pp = PdfPages(options.outputDir + options.experiment + '.pdf')
    try:
        if options.verbose:
            print("**  Create data objects")

        mapped_reads = h5dict.h5dict(
            options.outputDir + options.experiment + '-mapped_reads.hdf5')
        genome_db = genome.Genome(options.genome, gapFile=options.gapFile,
                                  readChrms=['#', 'X', 'Y'])
        genome_db.setEnzyme(options.enzyme)

        # Anything that is not BAM yet has to be mapped first.
        if options.inputFormat != 'bam':
            bams = mapFiles()
        else:
            bams = args[0:]

        if options.verbose:
            print("**  Collect mapped reads")

        collectMappedReads(bams[0], bams[1], mapped_reads, genome_db)

        if options.verbose:
            print("**  Filter fragments")

        filterFragments(genome_db)

        if options.verbose:
            print("**  Iterative filtering of fragments")

        iterativeFiltering(genome_db, '-1M.hdf5')

        # plotting
        heatmap_1M = options.outputDir + options.experiment + '-1M.hdf5'
        correctedScalingPlot(1000000, heatmap_1M, options.experiment,
                             genome_db)
        doArmPlot(1000000, heatmap_1M, options.experiment, genome_db)

        if options.verbose:
            print("*** FINISHED processing")
    finally:
        pp.close()
예제 #7
0
def diamondScore(dataset, size=10):
    """
    Extract a so-called "diamond score" - inspired by  Suzana Hadjur talks
    see Sevil Sofueva, EMBO 2013 - Supp Figure 11
    (but this is a bit different from Supp Figure 11!!!)

    The heatmap of ``dataset`` is cleaned, truncated and corrected, and for
    every bin the sum over a triangular part of a ``size`` x ``size`` window
    anchored at that bin is taken.

    :param dataset: identifier passed to ``hm`` to locate the heatmap file
    :param size: side length of the window, in bins
    :return: array with one value per bin: the window sums minus
        ``gaussian_filter(sums, 30)``
    """
    heatmap = 1. * h5dict(hm(dataset))["heatmap"]

    # Replace empty bins (zero column sum) with a copy of the preceding bin.
    emptyBins = np.nonzero(np.sum(heatmap, axis=0) == 0)[0]
    heatmap[emptyBins] = heatmap[emptyBins - 1]
    heatmap[:, emptyBins] = heatmap[:, emptyBins - 1]

    # Blank the three central diagonals before truncation and correction ...
    for offset in (0, 1, -1):
        mirnylib.numutils.fillDiagonal(heatmap, 0, offset)
    heatmap = trunc(heatmap, low=0, high=0.0001)
    heatmap = ultracorrect(heatmap)

    # ... and refill them with values derived from the mean of diagonal 2.
    diag2value = np.mean(np.diagonal(heatmap, 2))
    mirnylib.numutils.fillDiagonal(heatmap, 1.5 * diag2value, 0)
    mirnylib.numutils.fillDiagonal(heatmap, 1.2 * diag2value, 1)
    mirnylib.numutils.fillDiagonal(heatmap, 1.2 * diag2value, -1)
    heatmap /= np.mean(np.sum(heatmap, axis=0))

    # 3 x 3 tiling; the loop below walks the central copy, presumably so that
    # windows of bins close to the matrix edge stay inside the array.
    tiledHeatmap = np.hstack([heatmap, heatmap, heatmap])
    tiledHeatmap = np.vstack([tiledHeatmap, tiledHeatmap, tiledHeatmap])
    setExceptionHook()

    nBins = len(heatmap)
    ratios = []
    for mon in range(nBins, 2 * nBins):
        diamond = tiledHeatmap[mon:mon + size, mon:mon - size:-1]
        steps = np.arange(len(diamond))
        # Keep only the cells whose row + column offset is below the window size.
        inds = (steps[:, None] + steps[None, :]) < len(diamond)
        ratios.append(diamond[inds].sum())

    # The original had a second, unreachable ``return ratios`` after this one.
    return np.array(ratios) - gaussian_filter(ratios, 30)
예제 #8
0
    def loadData(self, dictLike,
                 keyFunction=lambda x: ("%d %d" % x),
                 mode="All",
                 cisProtocol=h5dictMatrix,
                 transProtocol=h5dictSparseMatrix):
        """
        Load by-chromosome heatmaps into ``self.data``.

        Parameters
        ----------

        dictLike : dictionary-like structure or str
            either by-chromosome h5dict, generated by fragmentHiC, or
            any dictionary-like object with by-chromosome heatmaps.
            A str is treated as the filename of an h5dict to open read-only.
        keyFunction : function tuple->string
            Function to convert chromosome pairs (chr1, chr2) to a
            key in a dictLike. Default: "chr1 chr2"
        mode : "all" or "cis" (case-insensitive)
            Use or not use trans chromosome maps
        cisProtocol : class similar to defaultMatrix
            see below
        transProtocol : class similar to defaultMatrix
            see below

        Raises
        ------
        ValueError
            if ``mode`` is invalid or the h5dict file cannot be opened
        KeyError
            if the key of a required chromosome pair is missing in dictLike


        cisProtocol and transProtocol should implement all functions currently
        defined in the defaultMatrix protocol. If inherited from defaultMatrix,
        it should implement proper get and set functions. It cannot store the
        matrix itself in memory, and should forget it after any function call.
        """
        mode = mode.lower()
        if mode not in ("cis", "all"):
            raise ValueError("Mode can be only 'cis' or 'all'")

        if isinstance(dictLike, str):
            try:
                dictLike = h5dict(dictLike, 'r')
            except Exception:
                # Was a bare ``except:``, which also turned KeyboardInterrupt
                # and SystemExit into a ValueError.
                raise ValueError("Cannot open h5dict at filename %s" %
                                 dictLike)

        # Cis maps are always loaded; trans maps only in "all" mode.
        toLoad = [(self.cisKeys, cisProtocol)]
        if mode == "all":
            toLoad.append((self.transKeys, transProtocol))

        for keys, protocol in toLoad:
            for myKey in keys:
                name = keyFunction(myKey)
                try:
                    data = dictLike[name]
                except KeyError:
                    raise KeyError(
                        "Key {0} not found in h5dict".format(name))
                self.data[myKey] = protocol(
                    data, dictToSave=self._h5dict, key=myKey)

        self._checkConsistency()
예제 #9
0
    def loadData(self, dictLike,
                 keyFunction=lambda x: ("%d %d" % x),
                 mode="All",
                 cisProtocol=h5dictMatrix,
                 transProtocol=h5dictSparseMatrix):
        """
        Load by-chromosome heatmaps into ``self.data``.

        Parameters
        ----------

        dictLike : dictionary-like structure or str
            either by-chromosome h5dict, generated by fragmentHiC, or
            any dictionary-like object with by-chromosome heatmaps.
            A str is treated as the filename of an h5dict to open read-only.
        keyFunction : function tuple->string
            Function to convert chromosome pairs (chr1, chr2) to a
            key in a dictLike. Default: "chr1 chr2"
        mode : "all" or "cis" (case-insensitive)
            Use or not use trans chromosome maps
        cisProtocol : class similar to defaultMatrix
            see below
        transProtocol : class similar to defaultMatrix
            see below

        Raises
        ------
        ValueError
            if ``mode`` is invalid or the h5dict file cannot be opened
        KeyError
            if the key of a required chromosome pair is missing in dictLike


        cisProtocol and transProtocol should implement all functions currently
        defined in the defaultMatrix protocol. If inherited from defaultMatrix,
        it should implement proper get and set functions. It cannot store the
        matrix itself in memory, and should forget it after any function call.
        """
        mode = mode.lower()
        if mode not in ("cis", "all"):
            raise ValueError("Mode can be only 'cis' or 'all'")

        if isinstance(dictLike, str):
            try:
                dictLike = h5dict(dictLike, 'r')
            except Exception:
                # Was a bare ``except:``, which also turned KeyboardInterrupt
                # and SystemExit into a ValueError.
                raise ValueError("Cannot open h5dict at filename %s" %
                                 dictLike)

        # Cis maps are always loaded; trans maps only in "all" mode.
        toLoad = [(self.cisKeys, cisProtocol)]
        if mode == "all":
            toLoad.append((self.transKeys, transProtocol))

        for keys, protocol in toLoad:
            for myKey in keys:
                name = keyFunction(myKey)
                try:
                    data = dictLike[name]
                except KeyError:
                    raise KeyError(
                        "Key {0} not found in h5dict".format(name))
                self.data[myKey] = protocol(
                    data, dictToSave=self._h5dict, key=myKey)

        self._checkConsistency()
예제 #10
0
def step1(hiclib_path, ## the path of hiclib folder on machine
          dataset='Kalhor2012NB',
          sraid = 'SRR071231',
          readlen = 40): ## each read with length 40
    ''' 1. Map reads to the genome
        http://mirnylab.bitbucket.org/hiclib/tutorial/01_iterative_mapping.html

        The SRA file is mapped twice: positions 0..readlen go to
        "<sraid>_1.bam" and positions readlen..2*readlen to "<sraid>_2.bam".
        Both results are then parsed into "<sraid>_mapped_reads.hdf5" in the
        current directory.
    '''

    ## Adopted from hiclib tutorial
    import os
    import logging
    from hiclib import mapping
    from mirnylib import h5dict, genome

    logging.basicConfig(level=logging.DEBUG)

    sra_dir = '../data/SRA/'
    bam1 = sra_dir + sraid + '_1.bam'
    bam2 = sra_dir + sraid + '_2.bam'

    # A. Map the reads iteratively. Both sides share everything except the
    #    output file and the part of the read that is mapped.
    shared = dict(
        bowtie_path=hiclib_path + '/bin/bowtie2/bowtie2',
        bowtie_index_path=hiclib_path + '/bin/bowtie2/index/hg19',
        fastq_path=sra_dir + dataset + '/' + sraid + '/' + sraid + '.sra',
        min_seq_len=25,
        len_step=5,
        nthreads=12,  # on intel corei7 CPUs 4 threads are as fast as
                      # 8, but leave some room for your other applications
        # max_reads_per_chunk=10000000,  # optional, on low-memory machines
        temp_dir=sra_dir,  # optional, keep temporary files here
        bowtie_flags='--very-sensitive',
        bash_reader=hiclib_path + '/bin/sra/bin/fastq-dump -Z')

    sides = ((bam1, 0, readlen), (bam2, readlen, 2 * readlen))
    for out_path, first, last in sides:
        mapping.iterative_mapping(out_sam_path=out_path,
                                  seq_start=first,
                                  seq_end=last,
                                  **shared)

    # B. Parse the mapped sequences into a Python data structure,
    #    assign the ultra-sonic fragments to restriction fragments.
    mapped_reads = h5dict.h5dict(sraid + '_mapped_reads.hdf5')  ## to local folder
    genome_db = genome.Genome(hiclib_path + '/fasta/hg19',
                              readChrms=['#', 'X'])

    mapping.parse_sam(
        sam_basename1=bam1,
        sam_basename2=bam2,
        out_dict=mapped_reads,
        genome_db=genome_db,
        enzyme_name='HindIII')
예제 #11
0
    def _sortData(self):
        """
        Sort the read pairs stored in ``self.h5dict``; runs at most once.

        The work is skipped when the object already has a ``dataSorted``
        attribute. Otherwise every pair is first oriented so that
        (chrms1, cuts1) is not greater than (chrms2, cuts2), the pairs are
        packed into a structured dataset in a temporary h5dict and sorted
        there, and the sorted chrms/cuts/strands columns are written back to
        ``self.h5dict`` chunk by chunk. The temporary file is removed at the
        end.
        """

        if not hasattr(self, "dataSorted"):
            # Scratch h5dict holding the packed records ("sortedData") and a
            # second dataset of the same size ("trash") handed to the sorter.
            tmpfil = self.make_tempfile()
            mydict = h5dict(tmpfil, 'w')
            data = mydict.add_empty_dataset("sortedData", (self.N, ), mydtype)
            tmp = mydict.add_empty_dataset("trash", (self.N, ), mydtype)
            # Code handed to self.evaluate below: swaps the two sides of every
            # pair for which side 1 comes after side 2 (by chromosome, then by
            # cut position) and packs the columns into the structured array
            # ``a``.
            code = dedent("""
            a = np.empty(len(chrms1), dtype = mydtype)
            mask = (chrms1 > chrms2) | ( (chrms1 == chrms2) & (cuts1 > cuts2))

            chrms2[mask],chrms1[mask] = chrms1[mask].copy(), chrms2[mask].copy()
            cuts1[mask],cuts2[mask] = cuts2[mask].copy(), cuts1[mask].copy()
            strands1[mask],strands2[mask] = strands2[mask].copy(),strands1[mask].copy()

            a["chrms1"] = chrms1
            a["pos1"] = cuts1
            a["chrms2"] = chrms2
            a["pos2"] = cuts2
            a["strands1"] = strands1
            a["strands2"] = strands2
            """)
            # ``a`` is stored into the "sortedData" dataset.
            self.evaluate(expression=code,
                          internalVariables=[
                              "chrms1", "chrms2", "cuts1", "cuts2", "strands1",
                              "strands2"
                          ],
                          constants={
                              "np": np,
                              "mydtype": mydtype
                          },
                          outVariable=("a", data))

            # NOTE(review): presumably sorts ``data`` in place using ``tmp``
            # as scratch space - confirm against externalMergeSort.
            externalMergeSort(data,
                              tmp,
                              sorter=mydtypeSorter,
                              searchsorted=searchsorted,
                              chunkSize=max(150000000, self.chunksize))
            sdata = mydict.get_dataset("sortedData")

            # Destination datasets in the main h5dict.
            c1 = self.h5dict.get_dataset("chrms1")
            c2 = self.h5dict.get_dataset("chrms2")
            p1 = self.h5dict.get_dataset("cuts1")
            p2 = self.h5dict.get_dataset("cuts2")
            s1 = self.h5dict.get_dataset("strands1")
            s2 = self.h5dict.get_dataset("strands2")

            # Copy the sorted records back one chunk at a time.
            for start, end in self._getChunks():
                data = sdata[start:end]
                c1[start:end] = data["chrms1"]
                c2[start:end] = data["chrms2"]
                p1[start:end] = data["pos1"]
                p2[start:end] = data["pos2"]
                s1[start:end] = data["strands1"]
                s2[start:end] = data["strands2"]
            # Marks the data as sorted so that later calls return immediately.
            self.dataSorted = True
            # Drop the scratch h5dict before deleting its file.
            del mydict
            os.remove(tmpfil)
            gc.collect()
예제 #12
0
def step3(hiclib_path, sraid, res=1000000):
    ''' 3. Filter and iteratively correct heatmaps.
        http://mirnylab.bitbucket.org/hiclib/tutorial/03_heatmap_processing.html

        Reads "<sraid>_map-res<res // 1000>k.hdf5" and writes, with the same
        prefix, a PDF of the raw heatmap, the corrected heatmap
        ("-ic.hdf5"), its PDF ("-ic.pdf") and a tab-separated bias table
        ("-ic-bias.txt", one line per bin: chromosome, start, end, bias).

        hiclib_path -- hiclib folder; the genome is read from its fasta/hg19
        sraid       -- prefix of all input and output file names
        res         -- heatmap resolution in bp
    '''
    import matplotlib.pyplot as plt
    import numpy as np

    from mirnylib import genome
    from mirnylib import h5dict
    from mirnylib import plotting
    from hiclib import binnedData

    genome_db = genome.Genome(hiclib_path+'/fasta/hg19', readChrms=['#', 'X'])

    # Floor division: with "/" Python 3 would format the names as "res1000.0k".
    prefix = sraid + '_map-res%sk' % (res // 1000)
    raw_path = prefix + '.hdf5'

    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(raw_path, mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(raw_path, 'DataName')

    # Plot the heatmap directly.
    plotting.plot_matrix(np.log(BD.dataDict['DataName']))
    plt.savefig(prefix + '.pdf')
    plt.clf()

    # Remove the contacts between loci located within the same bin.
    BD.removeDiagonal()

    # Remove bins with less than half of a bin sequenced.
    BD.removeBySequencedCount(0.5)

    # Remove 1% of regions with low coverage.
    BD.removePoorRegions(cutoff=1)

    # Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts).
    BD.truncTrans(high=0.0005)

    # Perform iterative correction.
    BD.iterativeCorrectWithoutSS()

    # Save the iteratively corrected heatmap.
    BD.export('DataName', prefix + '-ic.hdf5')

    # Plot the corrected heatmap.
    plotting.plot_matrix(np.log(BD.dataDict['DataName']))
    plt.savefig(prefix + '-ic.pdf')
    plt.clf()

    # Save Bias; the with-block closes the file even if a lookup fails.
    bias = BD.biasDict['DataName']
    with open(prefix + "-ic-bias.txt", "w") as outfile:
        for i in range(len(BD.chromosomeIndex)):
            chro = BD.genome.idx2label[BD.chromosomeIndex[i]]
            posi = BD.positionIndex[i]
            outfile.write("chr%s\t%s\t%s\t%s\n"
                          % (chro, posi, posi + res, bias[i]))
def readDict(CHROMOSOME_HDF5):
    """
    Open an h5dict file read-only and return it with two of its entries.

    Returns the tuple (h5dict object, value stored under 'resolution',
    value stored under 'genomeIdxToLabel').
    """
    store = h5dict.h5dict(CHROMOSOME_HDF5, mode='r')
    return store, store['resolution'], store['genomeIdxToLabel']
예제 #14
0
    def iterativeCorrection(self, outname):
        """
        Run ``completeIC`` on every cis map and store the results in an h5dict.

        Each entry of ``self.rawdata`` listed in ``self.cisKeys`` is corrected
        (without returning the bias) and saved under the same key in the
        h5dict opened at ``outname``; ``self.resolution`` is saved under
        "resolution".
        """
        output = h5dict(outname)
        for chromPair in self.cisKeys:
            output[chromPair] = completeIC(self.rawdata[chromPair],
                                           returnBias=False)
        output["resolution"] = self.resolution
예제 #15
0
def get_chromosomes(hm_file, genome_db, resolution, chrNumb=None):
    """
    Load the cis (same-chromosome) heatmaps from a heatmap file.

    The storage format is taken from the file name ("hiRes.hm" or
    "bychr.hm"). For any other name the file is opened and treated as HiRes
    when it contains the key "0 0"; otherwise, or when it cannot be opened,
    the by-chromosome format is assumed.

    :param hm_file: path of the heatmap file
    :param genome_db: genome object matching the heatmap
    :param resolution: resolution of the heatmap; a warning is printed when it
        differs from the one extracted from the file name
    :param chrNumb: index of a single chromosome, or None for all of them
    :return: the cis matrix of chromosome ``chrNumb``, or a list with one cis
        matrix per chromosome when ``chrNumb`` is None
    """
    fileResolution = extractResolutionFromFileName(hm_file)
    if fileResolution != resolution:
        print("WARNING! Provided resolution ", resolution, "does not match ",
              fileResolution, "extracted from file name ", hm_file)

    # ``hmType`` instead of ``type``: the old name shadowed the builtin.
    if "hiRes.hm" in hm_file:
        hmType = "HiRes"
    elif "bychr.hm" in hm_file:
        hmType = "bychr"
    else:
        print("Warning: cannot resolve type of data from filename")
        try:
            print("Warning: trying hires hic")
            # This used the undefined name ``fname``; the NameError was
            # swallowed by a bare except, so the probe could never succeed.
            raw_heatmap = h5dict.h5dict(hm_file, mode='r')  # open heatmap
            if "0 0" in raw_heatmap.keys():
                hmType = "HiRes"
            else:
                print("HiRes hic Failed! Assuming bychr type")
                hmType = "bychr"
        except Exception:
            print("HiRes hic Failed! Assuming bychr type")
            hmType = "bychr"

    if hmType == "HiRes":
        from hiclib import highResBinnedData
        print("creating an object")
        hmap = highResBinnedData.HiResHiC(genome_db, resolution)
        print("loading data")
        hmap.loadData(hm_file, mode="cis")
        print("Data loaded")
        # cisKeys are tuples like (N,N) where N is 0..Number_of_chrms-1
        if chrNumb is not None:
            return hmap.data[(chrNumb, chrNumb)].getData()
        return [
            hmap.data[(i, i)].getData() for i in range(genome_db.chrmCount)
        ]
    elif hmType == "bychr":
        from hiclib import binnedData
        print("creating an object")
        hmap = binnedData.binnedData(resolution, genome_db)

        print("loading data")
        hmap.simpleLoad(hm_file, "heatmap")
        data = hmap.dataDict["heatmap"]
        assert len(data) == genome_db.numBins
        print("Data loaded")

        starts = genome_db.chrmStartsBinCont
        ends = genome_db.chrmEndsBinCont

        def cisBlock(i):
            # Square block of the genome-wide matrix that belongs to chromosome i
            return data[starts[i]:ends[i], starts[i]:ends[i]]

        if chrNumb is not None:
            return cisBlock(chrNumb)
        return [cisBlock(i) for i in range(genome_db.chrmCount)]
    else:
        # Raising a plain string is itself a TypeError; use a real exception.
        raise ValueError(
            "Error: can not recognize heatmap format from file name")
예제 #16
0
def fractionCis20kb(filename):
    """
    Return the share of cis read pairs separated by more than 20000.

    "Cis" means that "chrms1" equals "chrms2"; the separation is the absolute
    difference between "cuts1" and "cuts2". The result is the number of cis
    pairs with a separation above 20000 divided by the number of cis pairs.
    """
    reads = h5dict(filename,'r')
    sameChrom = reads["chrms1"] == reads["chrms2"]
    cutsFirst = reads["cuts1"]
    cutsSecond = reads["cuts2"]
    cisTotal = sameChrom.sum()
    separation = np.abs(cutsFirst[sameChrom] - cutsSecond[sameChrom])
    distant = (separation > 20000).sum()
    return distant / cisTotal
예제 #17
0
    def doOne(inData, saveSams=True):
        """
        Map one pair of gzipped FASTQ files and parse the result into an h5dict.

        :param inData: tuple (file1, file2, outfile) - the two input files and
            the filename of the h5dict that receives the parsed reads
        :param saveSams: when false, files in ``samFolder`` whose name
            contains the name of either input file are deleted afterwards
        :raises ValueError: if the second line of an input file is shorter
            than 10 characters

        Note that both input files are removed once they have been mapped.
        """
        file1, file2, outfile = inData
        print("Mapping {0} and {1} into {2}".format(*inData))

        inputs = (file1, file2)
        # One SAM output per input, named after the input file.
        samPaths = [
            os.path.join(samFolder, os.path.split(onefile)[1] + ".sam")
            for onefile in inputs
        ]

        for onefile, samPath in zip(inputs, samPaths):
            # The second line of the file gives the read length. The handle
            # is closed right away; it used to be left open.
            with gzip.open(onefile, 'r') as fastq:
                fastq.readline()
                length = len(fastq.readline()) - 1
            if length < 10:
                raise ValueError(
                    "Length of your sequence is {0}. Something is wrong".
                    format(length))
            minlen, step = calculateStep(length - seqSkipStart, minMapLen)

            mapping.iterative_mapping(
                bowtie_path=bowtiePath,
                bowtie_index_path=bowtieIndex,
                fastq_path=onefile,
                out_sam_path=samPath,
                seq_start=seqSkipStart,
                # for bacteria minimal mappable length is 15 bp, so start
                # with something slightly longer
                min_seq_len=minlen,
                len_step=step,  # and go with a usual step
                # on intel corei7 CPUs 4 threads are as fast as 8, but leave
                # some room for your other applications
                nthreads=threads,
                # max_reads_per_chunk = 10000000,  #optional, on low-memory machines
                temp_dir=tmpDir,
                bowtie_flags=bowtieFlags,
            )

        os.remove(file1)
        os.remove(file2)

        # Second step. Parse the mapped sequences into a Python data structure,
        #    assign the ultra-sonic fragments to restriction fragments.
        mapped_reads = h5dict.h5dict(outfile)
        mapping.parse_sam(sam_basename1=samPaths[0],
                          sam_basename2=samPaths[1],
                          out_dict=mapped_reads,
                          genome_db=genome_db,
                          save_seqs=False,
                          maxReads=int(chunkSize * 1.6),
                          IDLen=50)

        if not saveSams:
            baseNames = [os.path.split(onefile)[1] for onefile in inputs]
            for i in os.listdir(samFolder):
                if any(base in i for base in baseNames):
                    print("deleting", i)
                    os.remove(os.path.join(samFolder, i))
예제 #18
0
 def export(self, filename, mode = 'cis'):
     """
     Write the stored matrices to an h5dict at ``filename``.

     With mode 'cis' the matrices listed in ``self.cisKeys`` are written,
     with any other mode those in ``self.allKeys``. Each matrix is saved
     under the key "<chr1> <chr2>"; ``self.resolution`` is saved under
     "resolution".
     """
     target = h5dict(filename)
     pairs = self.cisKeys if mode == 'cis' else self.allKeys
     for pair in pairs:
         target["%d %d" % pair] = self.data[pair].getData()
     target["resolution"] = self.resolution
예제 #19
0
 def getNumCisReads(self):
     """
     Count the read pairs of ``self.refined`` whose two sides share a chromosome.

     The "chrms1" and "chrms2" datasets are compared in slices of
     200,000,000 entries, so the full columns are never loaded at once.

     :return: number of entries where chrms1 equals chrms2
     """
     hd = h5dict(self.refined, 'r')
     mylen = len(hd.get_dataset("strands1"))
     c1 = hd.get_dataset("chrms1")
     c2 = hd.get_dataset("chrms2")

     chunkSize = 200000000
     totsum = 0
     # Iterating over the range directly replaces ``range(...) + [mylen]``,
     # which raises TypeError on Python 3 (a range cannot be added to a list).
     for st in range(0, mylen, chunkSize):
         end = min(st + chunkSize, mylen)
         totsum += np.sum(c1[st:end] == c2[st:end])
     return totsum
예제 #20
0
def parse_bams(chromosome_names, cell_line, path, genome_version, enzyme,
               genomes_dir='/home/magnitov/data/genomes/'):
    """
    Parse the paired BAM files of a cell line once per chromosome set.

    For every list in ``chromosome_names`` the files
    "<path>bam/<cell_line>/<cell_line>_R1.bam" and "..._R2.bam" are parsed
    into an h5dict inside "<path>maps/<cell_line>": "mapped_reads_full.hdf5"
    for lists with more than one chromosome, "mapped_reads_<name>.hdf5" for
    single-chromosome lists.

    :param chromosome_names: iterable of lists of chromosome names
    :param cell_line: cell line name, used in directory and file names
    :param path: base directory; must end with a path separator
    :param genome_version: name of the genome folder inside ``genomes_dir``
    :param enzyme: restriction enzyme name passed to ``mapping.parse_sam``
    :param genomes_dir: directory holding the genome folders; defaults to the
        location that used to be hard-coded
    """
    maps_dir = path + 'maps/' + cell_line
    if not os.path.exists(maps_dir):
        # makedirs also creates "<path>maps" when it is missing;
        # os.mkdir raised in that case.
        os.makedirs(maps_dir)

    bam_prefix = path + 'bam/' + cell_line + '/' + cell_line

    for chrm_list in chromosome_names:

        if len(chrm_list) > 1:
            out_name = '/mapped_reads_full.hdf5'
        else:
            out_name = '/mapped_reads_' + chrm_list[0] + '.hdf5'
        mapped_reads = h5dict.h5dict(maps_dir + out_name)

        genome_db = genome.Genome(genomes_dir + genome_version,
                                  gapFile='gap.txt',
                                  readChrms=chrm_list,
                                  forceOrder=True)

        mapping.parse_sam(
            sam_basename1=bam_prefix + '_R1.bam',
            sam_basename2=bam_prefix + '_R2.bam',
            out_dict=mapped_reads,
            genome_db=genome_db,
            enzyme_name=enzyme)
예제 #21
0
 def getNumCisReads(self):
     """
     Count the read pairs of ``self.refined`` whose two sides share a chromosome.

     The "chrms1" and "chrms2" datasets are compared in slices of
     200,000,000 entries, so the full columns are never loaded at once.

     :return: number of entries where chrms1 equals chrms2
     """
     hd = h5dict(self.refined, 'r')
     mylen = len(hd.get_dataset("strands1"))
     c1 = hd.get_dataset("chrms1")
     c2 = hd.get_dataset("chrms2")

     chunkSize = 200000000
     totsum = 0
     # Iterating over the range directly replaces ``range(...) + [mylen]``,
     # which raises TypeError on Python 3 (a range cannot be added to a list).
     for st in range(0, mylen, chunkSize):
         end = min(st + chunkSize, mylen)
         totsum += np.sum(c1[st:end] == c2[st:end])
     return totsum
예제 #22
0
    def export(self, name, outFilename):
        """
        Save one dataset to a new h5dict file.

        The file at ``outFilename`` is opened in "w" mode and receives the
        keys "heatmap" (``self.dataDict[name]``), "resolution" and
        "chromosomeStarts".

        :raises ValueError: if ``name`` is not a key of ``self.dataDict``
        """
        if name not in self.dataDict:
            raise ValueError("No data {name}".format(name=name))

        payload = {
            "heatmap": self.dataDict[name],
            "resolution": self.resolution,
            "chromosomeStarts": self.chromosomeStarts,
        }
        h5dict(outFilename, mode="w").update(payload)
예제 #23
0
    def saveByChromosomeHeatmap(self,
                                filename,
                                resolution,
                                gInfo,
                                includeTrans=False):
        """
        Bin the read pairs and save one heatmap per chromosome pair.

        For every chromosome the pairs whose first side lies on it are
        located in ``self.h5dict``, both positions are converted to bin
        numbers and a matrix of pair counts is built for each second
        chromosome with an index not lower than the first. Matrices are stored
        in the h5dict at ``filename`` under "<chrom> <chrom2>", together with
        'resolution' and 'genomeInformation'.

        :param filename: filename of the h5dict to write to
        :param resolution: bin size; also set on ``self.genome``
        :param gInfo: stored unchanged under 'genomeInformation'
        :param includeTrans: when False only same-chromosome maps are saved
        """

        self.genome.setResolution(resolution)

        mydict = h5dict(filename)

        for chrom in range(self.genome.chrmCount):
            c1 = self.h5dict.get_dataset("chrms1")
            p1 = self.h5dict.get_dataset("cuts1")
            # Index range [low, high) of the pairs whose first side is on
            # ``chrom``. NOTE(review): this presumably relies on the data
            # being sorted by (chrms1, cuts1) - confirm.
            low = h5dictBinarySearch(c1, p1, (chrom, -1), "left")
            high = h5dictBinarySearch(c1, p1, (chrom, 999999999), "right")

            chr1 = self._getVector("chrms1", low, high)
            chr2 = self._getVector("chrms2", low, high)
            # Fragment midpoints converted to bin numbers
            pos1 = np.array(self._getVector("mids1", low, high) // resolution,
                            dtype=np.int32)
            pos2 = np.array(self._getVector("mids2", low, high) // resolution,
                            dtype=np.int32)

            assert (chr1 == chrom).all()  # getting sure that bincount worked

            # Order by second chromosome so that each chrom2 forms one
            # contiguous run for the searchsorted calls below.
            args = np.argsort(chr2)
            chr2 = chr2[args]
            pos1 = pos1[args]
            pos2 = pos2[args]

            for chrom2 in range(chrom, self.genome.chrmCount):
                if (includeTrans == False) and (chrom2 != chrom):
                    continue
                start = np.searchsorted(chr2, chrom2, "left")
                end = np.searchsorted(chr2, chrom2, "right")
                cur1 = pos1[start:end]
                cur2 = pos2[start:end]
                # Flat index bin1 * (bins in chrom2) + bin2, so that the
                # counts can be reshaped into a (chrom bins, chrom2 bins)
                # matrix.
                label = np.array(cur1, "int64")
                label *= self.genome.chrmLensBin[chrom2]
                label += cur2
                maxLabel = self.genome.chrmLensBin[chrom] * \
                           self.genome.chrmLensBin[chrom2]
                counts = np.bincount(label, minlength=maxLabel)
                mymap = counts.reshape((self.genome.chrmLensBin[chrom], -1))
                if chrom == chrom2:
                    # Adding the transpose makes the cis map symmetric but
                    # doubles its diagonal; fillDiagonal is given half of the
                    # doubled diagonal (presumably to write it back).
                    mymap = mymap + mymap.T
                    fillDiagonal(mymap, np.diag(mymap).copy() / 2)
                mydict["%d %d" % (chrom, chrom2)] = mymap

        mydict['resolution'] = resolution
        mydict['genomeInformation'] = gInfo

        return
예제 #24
0
def step4(hiclib_path, sraid, res=1000000):
    ''' 4. Eigen vector decomposition
    /examples/iterativeCorrectionEigenvectorExpansion/eigenvectorAnalysis.py

    Reads "<sraid>_map-res<res // 1000>k.hdf5", computes the first 30
    eigenvectors of the filtered heatmap and writes, with the same prefix, a
    PDF of the matrix rebuilt from them ("-eig.pdf") and a tab-separated table
    ("-ic-eig.txt", one line per bin: chromosome, start, end, then one value
    per eigenvector).

    hiclib_path -- hiclib folder; the genome is read from its fasta/hg19
    sraid       -- prefix of all input and output file names
    res         -- heatmap resolution in bp
    '''
    import matplotlib.pyplot as plt
    import numpy as np
    from mirnylib import genome
    from mirnylib import h5dict
    from mirnylib import plotting
    from hiclib import binnedData

    genome_db = genome.Genome(hiclib_path + '/fasta/hg19',
                              readChrms=['#', 'X'])

    # Floor division: with "/" Python 3 would format the names as "res1000.0k".
    prefix = sraid + '_map-res%sk' % (res // 1000)
    raw_path = prefix + '.hdf5'

    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(raw_path, mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(raw_path, 'DataName')

    # Do eigen decomposition
    BD.removeDiagonal()
    BD.removeBySequencedCount(0.5)
    BD.removeCis()
    BD.truncTrans(high=0.0005)
    BD.removePoorRegions(cutoff=1)
    BD.fakeCis()
    BD.removeZeros()
    BD.doEig(numPCs=30, force=True)  ## First 30 EIGs
    BD.restoreZeros(value=0)

    eig = BD.eigEigenvalueDict['DataName']
    eig_v = BD.EigDict['DataName']

    # Plot the matrix rebuilt from the eigenvectors and eigenvalues.
    plotting.plot_matrix(np.log(np.dot(np.dot(eig_v.T, np.diag(eig)), eig_v)))
    plt.savefig(prefix + '-eig.pdf')
    plt.clf()

    # The with-block closes the file even if a lookup fails half-way.
    with open(prefix + "-ic-eig.txt", "w") as outfile:
        for i in range(len(BD.chromosomeIndex)):
            chro = BD.genome.idx2label[BD.chromosomeIndex[i]]
            posi = BD.positionIndex[i]
            columns = "".join("\t%s" % eigenvector[i]
                              for eigenvector in eig_v)
            outfile.write("chr%s\t%s\t%s%s\n"
                          % (chro, posi, posi + res, columns))
예제 #25
0
def process():
    """Run the fragment-filtering pipeline and save the current figure.

    Reads the module-level ``options`` object (verbose, tmpDir, outputDir,
    genome, gapFile, experiment).  Creates the temporary and output
    directories if needed, opens the mapped-reads h5dict, builds the genome,
    runs ``filterFragments`` and ``iterativeFiltering``, and finally saves
    the figure that was current on entry to
    ``<outputDir><experiment>.pdf``.
    """

    def report(message):
        # sys.stdout.write behaves the same on Python 2 and 3; the old
        # ``print >> sys.stdout`` form raises TypeError on Python 3.
        if options.verbose:
            sys.stdout.write(message + "\n")

    report("*** START processing")

    fig = plt.gcf()

    logging.basicConfig(level=logging.DEBUG)

    report("**  Create directories")
    for directory in (options.tmpDir, options.outputDir):
        if not os.path.exists(directory):
            os.mkdir(directory)

    report("**  Create data objects")
    # Kept although unused below: the read-collection step that consumes it
    # is currently disabled (see note further down).
    mapped_reads = h5dict.h5dict(options.outputDir + 'mapped_reads.hdf5')
    genome_db = genome.Genome(options.genome, gapFile=options.gapFile, chrmFileTemplate='%s.fa',)

    report("**  Collect mapped reads")
    # NOTE: mapping and read collection are disabled here.  The original
    # flow was:
    #   bams = mapFiles() if options.inputFormat != 'bam' else args[0:]
    #   collectMappedReads(bams[0], bams[1], mapped_reads, genome_db)

    report("**  Filter fragments")
    fragments = filterFragments(genome_db)

    report("**  Iterative filtering of fragments")
    iterativeFiltering(genome_db, fragments)

    report("*** FINISHED processing")

    fig.savefig(options.outputDir + options.experiment + '.pdf')
def getGenomeMatrix(GENOME_HDF5):
    """Load the whole-genome heatmap and its chromosome layout.

    :param GENOME_HDF5: path of the h5dict file (opened read-only)
    :return: tuple ``(matrix, grids, chrmLabels)`` where ``matrix`` is the
        stored 'heatmap' entry, ``grids`` is the list of chromosome start
        bins followed by the total bin number, and ``chrmLabels`` are the
        values of the stored 'genomeIdxToLabel' mapping
    """
    store = h5dict.h5dict(GENOME_HDF5, mode='r')
    heatmap = store['heatmap']
    starts = store['chromosomeStarts']
    totalBins = store['binNumber']

    # Grid lines sit at every chromosome start plus the closing boundary.
    boundaries = list(starts)
    boundaries.append(totalBins)

    labels = store['genomeIdxToLabel'].values()
    return heatmap, boundaries, labels
def get3CProfile(CHROMOSOME_HDF5, ANCHOR_CHROMOSOME, ANCHOR, REGION_CHROMOSOME, REGION_START, REGION_END):
    """Extract a virtual-3C profile (one heatmap row) from a by-chromosome file.

    :param CHROMOSOME_HDF5: path of the h5dict file (opened read-only)
    :param ANCHOR_CHROMOSOME: label of the chromosome holding the anchor
    :param ANCHOR: anchor coordinate in bp
    :param REGION_CHROMOSOME: label of the chromosome holding the region
    :param REGION_START: region start in bp, or None for the chromosome start
    :param REGION_END: region end in bp, or None for the chromosome end
    :return: ``(matrix, anchorBin, regionStartBin, regionEndBin, resolution,
        FIGURE)``; ``matrix`` is the anchor row restricted to the region bins
        and ``FIGURE`` is a base name for the output figure

    Prints an error and exits with status 1 when a chromosome label is
    unknown or a coordinate check fails.
    """
    # Read hdf5
    f = h5dict.h5dict(CHROMOSOME_HDF5, mode='r')
    genomeIdxToLabel = f['genomeIdxToLabel']
    chromosomeStarts = f['chromosomeStarts']
    binNumber = f['binNumber']

    # Get chromosome information
    anchor_chrmIdx = anchor_chrmLen = None
    region_chrmIdx = region_chrmLen = None
    for i in range(len(genomeIdxToLabel)):
        chrmLabel = genomeIdxToLabel[i]
        if chrmLabel == ANCHOR_CHROMOSOME:
            anchor_chrmIdx = i
            anchor_chrmLen = _getChrmLen(i, chromosomeStarts, binNumber)
        if chrmLabel == REGION_CHROMOSOME:
            region_chrmIdx = i
            region_chrmLen = _getChrmLen(i, chromosomeStarts, binNumber)

    # An unknown label used to surface later as a NameError; report it
    # the same way as the other input errors instead.
    if anchor_chrmIdx is None:
        print('[Error] Anchor chromosome (%s) not found.' % ANCHOR_CHROMOSOME)
        sys.exit(1)
    if region_chrmIdx is None:
        print('[Error] Region chromosome (%s) not found.' % REGION_CHROMOSOME)
        sys.exit(1)

    # Convert coordinates to bin numbers (floor division so the results
    # stay usable as array indices on Python 3).
    resolution = f['resolution']
    anchorBin = ANCHOR // resolution
    if REGION_START is not None:
        regionStartBin = REGION_START // resolution
    else:
        regionStartBin = 0
    if REGION_END is not None:
        regionEndBin = REGION_END // resolution + 1
    else:
        regionEndBin = region_chrmLen

    # Check bin numbers
    # NOTE(review): anchorBin == anchor_chrmLen passes this check; confirm
    # against _getChrmLen whether that bin exists.
    if anchorBin > anchor_chrmLen:
        print('[Error] Anchor coordinate (%s) exceeds chromosome length (%s).' % (ANCHOR, anchor_chrmLen * resolution))
        sys.exit(1)
    if regionEndBin < regionStartBin:
        print('[Error] Region start (%s) is larger than region end (%s).' % (regionStartBin * resolution, regionEndBin * resolution))
        sys.exit(1)
    if regionEndBin > region_chrmLen:
        print('[Error] Region (%s-%s) exceed chromosome length (%s).' % (regionStartBin * resolution, regionEndBin * resolution, region_chrmLen * resolution))
        sys.exit(1)

    # Get matrix
    key = str(anchor_chrmIdx) + ' ' + str(region_chrmIdx)
    matrix = f[key]
    matrix = matrix[anchorBin, regionStartBin:regionEndBin]

    # Name output figure: input base name (".hdf5" stripped) plus the
    # anchor bin range and the region range, all in bp.
    FIGURE = '%s_anchor_chr%s_%s-%s_region_chr%s_%s-%s' % (
        CHROMOSOME_HDF5.split('/')[-1][:-5],
        ANCHOR_CHROMOSOME,
        anchorBin * resolution,
        (anchorBin + 1) * resolution - 1,
        REGION_CHROMOSOME,
        regionStartBin * resolution,
        regionEndBin * resolution - 1)

    # Return
    return matrix, anchorBin, regionStartBin, regionEndBin, resolution, FIGURE
예제 #28
0
def step4(hiclib_path, sraid, res=1000000):
    """Step 4: eigenvector decomposition of a binned Hi-C heatmap.

    Modelled on hiclib's
    examples/iterativeCorrectionEigenvectorExpansion/eigenvectorAnalysis.py.

    Input is ``<sraid>_map-res<res//1000>k.hdf5``.  Outputs, sharing the
    same stem, are ``...-eig.pdf`` (plot of log(E^T * diag(lambda) * E)) and
    ``...-ic-eig.txt`` (per bin: ``chr<label>``, start, start + res, then
    one value per eigenvector, tab-separated).

    :param hiclib_path: directory that contains ``fasta/hg19``
    :param sraid: prefix shared by the input and output file names
    :param res: bin size in bp; used for the file names and the end column
    """
    import matplotlib.pyplot as plt
    import numpy as np
    from mirnylib import genome
    from mirnylib import h5dict
    from mirnylib import plotting
    from hiclib import binnedData

    genome_db = genome.Genome(hiclib_path+'/fasta/hg19', readChrms=['#', 'X'])

    # ``//`` keeps the kilobase tag an integer on Python 3 too; ``/`` would
    # render "1000.0k" there and point at a file that does not exist.
    stem = sraid + '_map-res%sk' % (res // 1000)

    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(stem + '.hdf5', mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(stem + '.hdf5', 'DataName')

    # Do eigen decomposition
    BD.removeDiagonal()
    BD.removeBySequencedCount(0.5)
    BD.removeCis()
    BD.truncTrans(high=0.0005)
    BD.removePoorRegions(cutoff=1)
    BD.fakeCis()
    BD.removeZeros()
    BD.doEig(numPCs=30, force=True)  # first 30 eigenvectors
    BD.restoreZeros(value=0)

    eig = BD.eigEigenvalueDict['DataName']
    eig_v = BD.EigDict['DataName']

    # Plot the heatmap directly.
    plotting.plot_matrix(np.log(np.dot(np.dot(eig_v.T, np.diag(eig)), eig_v)))
    plt.savefig(stem + '-eig.pdf')
    plt.clf()

    # ``with`` guarantees the table is flushed and closed on any exit path.
    with open(stem + "-ic-eig.txt", "w") as outfile:
        for i in range(len(BD.chromosomeIndex)):
            chro = BD.genome.idx2label[BD.chromosomeIndex[i]]
            posi = BD.positionIndex[i]
            columns = "".join("\t%s" % eigenvector[i] for eigenvector in eig_v)
            outfile.write("chr%s\t%s\t%s" % (chro, posi, posi + res))
            outfile.write(columns)
            outfile.write("\n")
예제 #29
0
    def merge(self, filenames):
        """Combine several h5dict files into this dataset.

        * If every file has a "metadata" entry, values are summed per key
          into the last file's metadata; keys missing there are only logged.
        * Every vector listed in ``self.vectors`` that exists in all files
          is concatenated in the order of ``filenames`` and stored.
        * If every file has the four read-type histograms, they are summed
          into length-50 integer arrays and stored.

        :param filenames: paths of the h5dict files to merge (opened
            read-only)
        """
        sources = [h5dict(path, mode = 'r') for path in filenames]

        # ---- metadata ----
        if all(["metadata" in src for src in sources]):
            perFile = [src["metadata"] for src in sources]
            combined = perFile.pop()  # last file acts as the accumulator
            for other in perFile:
                for key, value in other.items():
                    if key not in combined:
                        log.warning('The key %s can not be found in some files',
                                    key)
                        continue
                    combined[key] += value
            self.metadata = combined
            self.h5dict["metadata"] = self.metadata

        # ---- per-read vectors ----
        for name in self.vectors.keys():
            present = [(name in src.keys()) for src in sources]
            if not all(present):
                continue
            joined = np.concatenate([src[name] for src in sources])
            self.N = len(joined)
            self.DSnum = self.N
            self._setData(name, joined)
            self.h5dict.flush()
            time.sleep(0.2)  # allow buffers to flush

        # ---- read-type histograms ----
        typeNames = ['LeftType', 'RightType', 'InnerType', 'OuterType']
        if all([(typeName in src) for typeName in typeNames for src in sources]):
            totals = dict((typeName, np.zeros(50, dtype = int))
                          for typeName in typeNames)
            for src in sources:
                for typeName in typeNames:
                    totals[typeName] += src[typeName]

            for typeName in typeNames:
                self.h5dict[typeName] = totals[typeName]
예제 #30
0
def showCmap():
    """Show Hi-C data together with simulated contact maps.

    Hi-C data created by hiclib is needed for that, but you can replace the
    ``mydict = h5dict(...)`` line and the one after it with your own data
    loading code.

    For every pickled contact map in ``cmaps/`` whose size matches the
    rescaled Hi-C window, the lower triangle is replaced by the Hi-C data,
    the log of the result is drawn, and the figure is saved to
    ``heatmaps/<name>.png`` and ``heatmaps/<name>.pdf`` before being shown.
    """
    low = 60000
    high = 75000

    low20 = low // 10
    high20 = high // 10
    # here Hi-C data is loaded for display purposes only..... replace it with your own code if your data is in a different format
    mydict = h5dict("/home/magus/HiC2011/Erez2014/hg19/GM12878_inSitu-all-combined-10k_HighRes.byChr",'r')
    hicdata = mydict.get_dataset("13 13")[low20:high20, low20:high20]

    hicdata = completeIC(hicdata)
    newshape = (1000 * (high - low)) // (600 * 5)
    print(hicdata.shape, newshape)
    hicdata = zoomArray(hicdata, (newshape, newshape))
    hicdata = np.clip(hicdata, 0, np.percentile(hicdata, 99.99))
    hicdata /= np.mean(np.sum(hicdata, axis=1))

    for fname in os.listdir("cmaps"):

        # NOTE: pickle.load can execute arbitrary code; only keep trusted
        # files in "cmaps".  The context manager closes each file promptly
        # instead of leaking one handle per map.
        with open(os.path.join("cmaps", fname), 'rb') as cmapFile:
            cmap = pickle.load(cmapFile)
        arr = cmap
        if arr.shape[0] != hicdata.shape[0]:
            continue
        print(arr.shape)

        # Normalise to unit mean row sum, same as the Hi-C data above.
        arr = arr / np.mean(np.sum(arr, axis=1))
        ran = np.arange(len(arr))
        # Below the diagonal: experimental data; on and above: the loaded map.
        mask = ran[:,None] > ran[None,:]
        arr[mask] = hicdata[mask]

        logarr = np.log(arr + 0.0001)
        # noinspection PyTypeChecker
        plt.imshow(logarr, vmax = np.percentile(logarr, 99.99), vmin = np.percentile(logarr, 10), extent = [low, high, high, low], interpolation = "none")
        plt.savefig(os.path.join("heatmaps", fname+".png"))
        plt.savefig(os.path.join("heatmaps", fname+".pdf"))
        plt.show()
        plt.clf()
예제 #31
0
def extractResolutionFromFileName(fname):
    """Return the heatmap resolution in bp, or None if it cannot be found.

    First tries the 'resolution' entry stored inside the h5dict file.  If
    that fails for any reason, falls back to parsing the file name: the text
    between the last "res" and the following "k" is read as kilobases
    (``..._map-res100k.hdf5`` -> 100000).

    :param fname: path of the heatmap file
    :return: resolution as int, or None (after printing a warning)
    """
    try:
        raw_heatmap = h5dict.h5dict(fname, mode='r')  # open heatmap
        resolution = int(raw_heatmap['resolution'])  # get the resolution
        del raw_heatmap  # drop the handle
        return resolution
    except Exception:
        # Any failure to read the file is tolerated on purpose, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        pass

    try:
        if "/" in fname:
            fname = fname.split("/")[-1]
        res = fname.split("res")[-1].split("k")[0]
        return int(res) * 1000
    except (AttributeError, TypeError, ValueError):
        print("Warning! Unable to resolve resolution from file name")
        return None
예제 #32
0
    def __init__(self, filename, genome, maximumMoleculeLength = 500,
                 inMemory = False, mode = "a"):
        """Set up the dataset description and open its h5dict storage.

        :param filename: file used to store the data; ``~`` is expanded and
            the path is made absolute
        :param genome: genome object; its ``chrmCount`` and ``fragIDmult``
            attributes are read here
        :param maximumMoleculeLength: stored on the instance unchanged
        :param inMemory: passed to h5dict as ``in_memory``
        :param mode: access mode passed to h5dict

        If the opened storage already holds a 'chrms1' entry, ``self.N`` and
        ``self.DSnum`` are initialised to its length.
        """
        
        # Name of every per-read vector mapped to a type-name string
        # (presumably numpy dtype names -- confirm against _setData).
        self.vectors = {
            # chromosomes for each read.
            "chrms1": "int8", "chrms2": "int8",

            "mids1": "int32", "mids2": "int32",
            # midpoint of a fragment, determined as "(start+end)/2"

            "fraglens1": "int32", "fraglens2": "int32",
            # fragment lengths

            "distances": "int32",
            # distance between fragments. If -1, different chromosomes.
            # If -2, different arms.

            "fragids1": "int64", "fragids2": "int64",
            # IDs of fragments. fragIDmult * chromosome + location

            # dists*: distance to rsite
            "dists1": "int32", "dists2": "int32",
            # cuts*: precise location of cut-site
            "cuts1": "int32", "cuts2": "int32",
            "strands1": "bool", "strands2": "bool",
            }
        self.metadata = {}

        #-------Initialization of the genome and parameters-----
        self.mode = mode
        self.genome = genome

        self.chromosomeCount = self.genome.chrmCount
        self.fragIDmult = self.genome.fragIDmult  # used for building heatmaps

        self.maximumMoleculeLength = maximumMoleculeLength

        self.filename = os.path.abspath(os.path.expanduser(filename))  # File to save the data
        self.chunksize = 5000000
        # Chunk size for h5dict operation, external sorting, etc.

        self.inMemory = inMemory

        self.h5dict = h5dict(self.filename, mode = mode, in_memory = inMemory)
        
        # Existing data: pick up the number of reads from the stored vector.
        if 'chrms1' in self.h5dict.keys():
            chrms1 = self.chrms1
            self.DSnum = self.N = len(chrms1)
예제 #33
0
def getCoverage(filename):
    """Return the raw (unbalanced) coverage of every chromosome in a cooler file.

    The genome is chosen from the file name: "mm9" if that substring occurs
    in ``filename``, otherwise "hg19".

    :param filename: path of the cooler file
    :return: list with one 1-D array per chromosome holding the row sums of
        that chromosome's cis matrix
    """
    c = cooler.Cooler(filename)
    gen = "mm9" if "mm9" in filename else "hg19"
    mygen = genomDict[gen]

    # The file used to be opened a second time through h5dict without ever
    # being read or closed; that stray handle is gone.

    # The matrix selector does not depend on the chromosome, so build it once.
    selector = c.matrix(sparse=True, balance=False)

    coverages = []
    for mychr in range(mygen.chrmCount):
        data = selector.fetch(mygen.idx2label[mychr])
        # Summing a sparse matrix yields a column matrix; flatten to 1-D.
        coverage = np.array(np.sum(data, axis=1))[:, 0]
        coverages.append(coverage)
        assert len(coverages[-1]) == data.shape[0]
    return coverages
예제 #34
0
    def __init__(self, genome, resolution, storageFile="inMemory", mode="w"):
        """
        Initializes the high-resolution Hi-C data storage.

        Parameters
        ----------

        genome : folder or Genome object
            matching Genome object or folder to load it from
        resolution : int
            Resolution (number of bp per bin)
        storageFile : str (optional)
            File to store the h5dict.
            File will be created.
            By default stores in memory
        mode : "w", "w-" "r+" or "a", optional
            Access mode to h5dict (see h5dict manual)

        Raises
        ------
        AssertionError
            if ``genome`` is neither a str nor a Genome instance
        """

        inMemory = (storageFile == "inMemory")

        self._h5dict = h5dict(storageFile, mode=mode, in_memory=inMemory)

        # A string is interpreted as the genome folder.
        if isinstance(genome, str):
            genome = Genome(genome, readChrms=["#", "X"])
        assert isinstance(genome, Genome)
        self.genome = genome

        self.resolution = resolution
        self.genome.setResolution(resolution)

        if self.genome.numBins < 7000:
            # print() with one argument behaves the same on Python 2 and 3.
            print("Total number of bins in the genome is just %d" % self.genome.numBins)
            # The two literals used to be joined without a space
            # ("providesmore").
            warnings.warn(
                "For low-resolution analysis use binnedData, as it provides "
                "more analysis tools")

        # Keys of all chromosome pairs: (i, i) for cis, (i, j) with j > i
        # for trans.
        M = self.genome.chrmCount
        self.cisKeys = [(i, i) for i in range(M)]
        self.transKeys = [(i, j) for i in range(M) for j in range(M) if j > i]
        self.allKeys = self.cisKeys + self.transKeys

        self.data = {}
        self._initChromosomes()
예제 #35
0
def map_reads(first_fq, second_fq, outfile, nice):
    """Map both ends of a paired-end run and parse the result into an h5dict.

    Each fastq file is mapped iteratively with bowtie into
    ``<args.samdir>/<name>.sam`` (``name`` being the fastq name up to
    ".fastq.gz"); the two SAM files are then parsed into the h5dict at
    ``<args.samdir>/<outfile>`` using the HindIII enzyme.

    Relies on module-level settings: ``args``, ``bowtie_path``,
    ``bowtie_index``, ``bowtie_flags``, ``threads``, ``seq_skip_start``,
    ``min_map_len`` and ``genome_db``.

    :param first_fq: fastq file of the first read side
    :param second_fq: fastq file of the second read side
    :param outfile: name of the output h5dict inside ``args.samdir``
    :param nice: niceness increment applied to this process
    """
    # Lower the priority of this sub-process.
    os.nice(nice)

    fastqs = (first_fq, second_fq)
    sam_names = [fq.split(".fastq.gz")[0] + ".sam" for fq in fastqs]

    # Map each side: fastq file -> sam file.
    for fastq, sam_name in zip(fastqs, sam_names):
        read_len = check_len(fastq)
        min_len, step_size = calculate_step(read_len - seq_skip_start, min_map_len)
        mapping.iterative_mapping(
            bowtie_path=bowtie_path,
            bowtie_index_path=bowtie_index,
            fastq_path=fastq,
            out_sam_path=os.path.join(args.samdir, sam_name),
            min_seq_len=min_len,
            len_step=step_size,
            seq_start=seq_skip_start,
            nthreads=threads,
            bowtie_flags=bowtie_flags)

    # Parse the mapped sequences into the hdf5 dict structure and assign the
    # sequenced fragments to restriction fragments.
    mapped_reads = h5dict.h5dict(os.path.join(args.samdir, outfile))
    sam_paths = [os.path.join(args.samdir, sam_name) for sam_name in sam_names]
    mapping.parse_sam(sam_basename1=sam_paths[0], sam_basename2=sam_paths[1],
                      out_dict=mapped_reads, genome_db=genome_db, save_seqs=False, maxReads=10000000, IDLen=50,
                      enzyme_name='HindIII')
예제 #36
0
    def __init__(self, genome, resolution, storageFile="inMemory", mode="w"):
        """
        Initializes the high-resolution Hi-C data storage.

        Parameters
        ----------

        genome : folder or Genome object
            matching Genome object or folder to load it from
        resolution : int
            Resolution (number of bp per bin)
        storageFile : str (optional)
            File to store the h5dict.
            File will be created.
            By default stores in memory
        mode : "w", "w-" "r+" or "a", optional
            Access mode to h5dict (see h5dict manual)

        Raises
        ------
        AssertionError
            if ``genome`` is neither a str nor a Genome instance
        """

        inMemory = (storageFile == "inMemory")

        self._h5dict = h5dict(storageFile, mode=mode, in_memory=inMemory)

        # A genome given as a string is treated as a folder to load from.
        if isinstance(genome, str):
            genome = Genome(genome, readChrms=["#", "X"])
        assert isinstance(genome, Genome)
        self.genome = genome

        self.resolution = resolution
        self.genome.setResolution(resolution)

        if self.genome.numBins < 7000:
            # Single-argument print() is valid on Python 2 and Python 3.
            print("Total number of bins in the genome is just %d" % self.genome.numBins)
            # A separating space was missing between the two literals, so
            # the message read "providesmore".
            warnings.warn("For low-resolution analysis use binnedData, as it provides "
                          "more analysis tools")

        # (i, i) keys address cis maps, (i, j) with j > i address trans maps.
        M = self.genome.chrmCount
        self.cisKeys = [(i, i) for i in range(M)]
        self.transKeys = [(i, j) for i in range(M) for j in range(M) if j > i]
        self.allKeys = self.cisKeys + self.transKeys

        self.data = {}
        self._initChromosomes()
예제 #37
0
    def saveHeatmap(self, filename, resolution, gInfo):
        """Build the all-by-all heatmap and save it to a fresh h5dict file.

        The file receives the entries 'heatmap', 'resolution',
        'chromosomeStarts' and 'genomeInformation'.

        :param filename: output path; an existing file is removed first
        :param resolution: bin size passed to ``buildAllHeatmap`` and stored
        :param gInfo: stored unchanged under 'genomeInformation'
        """
        # Best-effort removal of a previous file.  Only filesystem errors
        # (typically "no such file") are ignored; anything else propagates.
        try:
            os.remove(filename)
        except OSError:
            pass

        tosave = h5dict(path=filename, mode='w')

        heatmap = self.buildAllHeatmap(resolution)

        tosave['heatmap'] = heatmap

        # Drop the local reference to the matrix as soon as it is stored.
        del heatmap

        chromosomeStarts = np.array(self.genome.chrmStartsBinCont)

        tosave['resolution'] = resolution
        tosave['chromosomeStarts'] = chromosomeStarts
        tosave['genomeInformation'] = gInfo
예제 #38
0
def doSaddleError(filename, eig, gen, correct=False):
    """Compute a 5x5 saddle matrix plus 100 bootstrap resamples of it.

    For every chromosome the cis observed/expected map is restricted to bins
    with a non-zero column sum, the bins are split into five groups by
    20-percentile steps of the track ``eig``, and the mean
    observed/expected value of every group pair is added to the saddle.
    Each of the 100 resampled saddles receives the mean of a
    with-replacement resample of the same values.

    :param filename: h5dict file with a 'heatmap' entry
    :param eig: per-bin track covering the whole genome, or the string "GC"
        to use the genome's GC content
    :param gen: genome name appended to the hard-coded data folder
    :param correct: if True, apply ``completeIC`` to the heatmap first
    :return: ``(saddle, permutted)``: the 5x5 array and a list of 100 5x5
        arrays.  Values are sums over chromosomes, not averages.
    """
    gen = Genome("/home/magus/HiC2011/data/" + gen, readChrms=["#", "X"])
    data = h5dict(filename,'r')["heatmap"]
    if correct:
        data = completeIC(data)
    gen.setResolution(getResolution(filename))
    # Test the type first: comparing an array with a string is elementwise
    # in current NumPy and makes a bare ``if`` ambiguous.
    if isinstance(eig, str) and eig == "GC":
        eig = np.concatenate(gen.GCBin)

    saddle = np.zeros((5,5), dtype = float)
    permutted = [np.zeros((5,5), dtype = float) for _ in range(100)]

    for chrom in range(gen.chrmCount):
        st = gen.chrmStartsBinCont[chrom]
        end = gen.chrmEndsBinCont[chrom]
        cur = observedOverExpected(data[st:end, st:end])
        mask = np.sum(cur, axis=0) > 0
        cur = cur[mask]
        cur = cur[:, mask]
        GC = eig[st:end]
        GC = GC[mask]
        if len(GC) <= 5:
            continue

        # The group masks depend only on the chromosome, so compute the
        # five of them once instead of inside the 5x5 loop.  Strict
        # inequalities are kept: bins exactly on a percentile boundary
        # belong to no group.
        groups = []
        for q in range(5):
            G1, G2 = np.percentile(GC, [20 * q, 20 * q + 20])
            groups.append((GC > G1) * (GC < G2))

        for i in range(5):
            for j in range(5):
                addition = cur[np.ix_(groups[i], groups[j])]
                addition = np.reshape(addition, (-1))
                for k in range(100):
                    resampled = np.random.choice(addition, len(addition), replace=True)
                    permutted[k][i,j] += resampled.mean()
                saddle[i, j] += addition.mean()
    return saddle, permutted
예제 #39
0
def iterativeFiltering(genome_db, filesuffix):
    '''
    Filter one heatmap at the binned level and iteratively correct it.

    The input is ``<options.outputDir><options.experiment><filesuffix>``;
    the corrected heatmap is exported to the same name with "-IC" inserted
    before the suffix.  The log of the corrected matrix is plotted on a new
    figure and saved through the module-level ``pp`` object.
    '''
    experiment = options.experiment
    prefix = options.outputDir + experiment
    sourcePath = prefix + filesuffix

    # The resolution is stored inside the heatmap file itself.
    stored = h5dict.h5dict(sourcePath, mode='r')
    resolution = int(stored['resolution'])

    # Load the heatmap into a binnedData container.
    binned = binnedData.binnedData(resolution, genome_db)
    binned.simpleLoad(sourcePath, experiment)

    # Filtering steps, in order:
    binned.removeDiagonal()             # contacts within the same bin
    binned.removeBySequencedCount(0.5)  # bins less than half sequenced
    binned.removePoorRegions(cutoff=1)  # 1% of regions with low coverage
    binned.truncTrans(high=0.0005)      # top 0.05% of interchromosomal counts
    binned.removeZeros()                # empty bins

    # Iterative correction, then export of the corrected heatmap.
    binned.iterativeCorrectWithoutSS()
    binned.export(experiment, prefix + '-IC' + filesuffix)

    plt.figure()
    plotting.plot_matrix(np.log(binned.dataDict[experiment]))
    pp.savefig()
예제 #40
0
def directionalityRatio(dataset, size=20):
    """Return a per-bin directionality ratio for a circular-genome heatmap.

    The heatmap is gap-filled, iteratively corrected and tiled 3x3 (cheap
    way to treat the chromosome as a ring).  For every bin ``mon`` of the
    central tile the ratio is ``upstream / (upstream + downstream)``, where
    ``upstream`` sums row ``mon`` over columns ``mon .. mon + size - 1`` and
    ``downstream`` sums column ``mon`` over rows ``mon - size .. mon - 1``.

    :param dataset: dataset identifier passed to ``hm`` to get the file name
    :param size: number of bins summed on each side
    :return: list of ratios, one per heatmap bin
    """
    heatmap = 1. * h5dict(hm(dataset))["heatmap"]  # extract heatmap

    # Fill empty rows/columns from their left neighbour.  Not really needed,
    # as heatmaps are built with overlaps and so have no gaps.
    zeros = np.sum(heatmap, axis=0) == 0
    zeros = np.nonzero(zeros)[0]
    heatmap[zeros] = heatmap[zeros - 1]
    heatmap[:, zeros] = heatmap[:, zeros - 1]

    # Following regular IC protocol (see 033_....py): blank the three central
    # diagonals, truncate, correct, then refill the diagonals with values
    # derived from the second diagonal.
    mirnylib.numutils.fillDiagonal(heatmap, 0, 0)
    mirnylib.numutils.fillDiagonal(heatmap, 0, 1)
    mirnylib.numutils.fillDiagonal(heatmap, 0, -1)
    heatmap = trunc(heatmap, low=0, high=0.0001)
    heatmap = ultracorrect(heatmap)
    diag2value = np.mean(np.diagonal(heatmap, 2))
    mirnylib.numutils.fillDiagonal(heatmap, 1.5 * diag2value, 0)
    mirnylib.numutils.fillDiagonal(heatmap, 1.2 * diag2value, 1)
    mirnylib.numutils.fillDiagonal(heatmap, 1.2 * diag2value, -1)
    heatmap /= np.mean(np.sum(heatmap, axis=0))

    # Put 9 copies of the heatmap in a huge square - Caulobacter is a ring.
    # This is a cheap-and-dirty way to account for that.
    tiledHeatmap = np.hstack([heatmap, heatmap, heatmap])
    tiledHeatmap = np.vstack([tiledHeatmap, tiledHeatmap, tiledHeatmap])
    setExceptionHook()  # debug only
    start = len(heatmap)
    end = 2 * len(heatmap)
    ratios = []
    # range() works on Python 2 and 3; xrange is undefined on Python 3.
    for mon in range(start, end):  # going through the central square
        upstream = tiledHeatmap[mon, mon:mon + size].sum()
        downstream = tiledHeatmap[mon - size:mon, mon].sum()
        # Fraction of the signal that is upstream (not a plain
        # upstream/downstream quotient).
        ratios.append(
            upstream /
            (upstream + downstream))

    return ratios
예제 #41
0
    def saveHeatmap(self, filename, resolution, countDiagonalReads = 'Once'):
        """Build the all-by-all heatmap and save it to a fresh h5dict file.

        The file receives the entries 'heatmap', 'resolution',
        'genomeBinNum', 'genomeIdxToLabel' and 'chromosomeStarts'.

        :param filename: output path; an existing file is removed first
        :param resolution: bin size passed to ``buildAllHeatmap`` and stored
        :param countDiagonalReads: forwarded to ``buildAllHeatmap``
        """
        # Best-effort cleanup of an older file: filesystem errors such as a
        # missing file are ignored, other exceptions are no longer hidden.
        try:
            os.remove(filename)
        except OSError:
            pass

        tosave = h5dict(path = filename, mode = 'w')

        heatmap = self.buildAllHeatmap(resolution, countDiagonalReads)

        tosave['heatmap'] = heatmap

        # Drop the local reference to the matrix as soon as it is stored.
        del heatmap

        chromosomeStarts = np.array(self.genome.chrmStartsBinCont)
        numBins = self.genome.numBins

        tosave['resolution'] = resolution
        tosave['genomeBinNum'] = numBins
        tosave['genomeIdxToLabel'] = self.genome.idx2label
        tosave['chromosomeStarts'] = chromosomeStarts
예제 #42
0
def directionalityRatio(dataset, size=20):
    """Return a per-bin directionality ratio for a circular-genome heatmap.

    After gap filling and iterative correction the heatmap is tiled 3x3 so
    that windows can wrap around the ring.  For each bin ``mon`` of the
    central tile, ``upstream`` is the sum of row ``mon`` over columns
    ``mon .. mon + size - 1`` and ``downstream`` is the sum of column
    ``mon`` over rows ``mon - size .. mon - 1``; the reported value is
    ``upstream / (upstream + downstream)``.

    :param dataset: dataset identifier passed to ``hm`` to get the file name
    :param size: number of bins summed on each side
    :return: list of ratios, one per heatmap bin
    """
    heatmap = 1. * h5dict(hm(dataset))["heatmap"]  # extract heatmap

    # Gap filling: copy the previous row/column into empty ones.  Not really
    # needed, as heatmaps are with overlaps, so they have no gaps.
    emptyBins = np.nonzero(np.sum(heatmap, axis=0) == 0)[0]
    heatmap[emptyBins] = heatmap[emptyBins - 1]
    heatmap[:, emptyBins] = heatmap[:, emptyBins - 1]

    # Following regular IC protocol (see 033_....py)
    for offset in (0, 1, -1):
        mirnylib.numutils.fillDiagonal(heatmap, 0, offset)
    heatmap = trunc(heatmap, low=0, high=0.0001)
    heatmap = ultracorrect(heatmap)
    # Refill the blanked diagonals relative to the second diagonal's mean.
    diag2value = np.mean(np.diagonal(heatmap, 2))
    mirnylib.numutils.fillDiagonal(heatmap, 1.5 * diag2value, 0)
    mirnylib.numutils.fillDiagonal(heatmap, 1.2 * diag2value, 1)
    mirnylib.numutils.fillDiagonal(heatmap, 1.2 * diag2value, -1)
    heatmap /= np.mean(np.sum(heatmap, axis=0))

    # Put 9 copies of the heatmap in a huge square - Caulobacter is a ring.
    # This is a cheap-and-dirty way to account for that.
    tiledHeatmap = np.hstack([heatmap, heatmap, heatmap])
    tiledHeatmap = np.vstack([tiledHeatmap, tiledHeatmap, tiledHeatmap])
    setExceptionHook()  # debug only
    start = len(heatmap)
    end = 2 * len(heatmap)
    ratios = []
    # xrange does not exist on Python 3; range is equivalent here.
    for mon in range(start, end):  # going through the central square
        upstream = tiledHeatmap[mon, mon:mon + size].sum()
        downstream = tiledHeatmap[mon - size:mon, mon].sum()
        # Share of the signal that lies upstream.
        ratios.append(upstream / (upstream + downstream))

    return ratios
예제 #43
0
 def getNumReads(self):
     """Return the length of the "strands1" dataset in the file ``self.refined``."""
     refinedStore = h5dict(self.refined, 'r')
     strands = refinedStore.get_dataset("strands1")
     return len(strands)
예제 #44
0
        temp_dir='tmp',  # optional, keep temporary files here
        bowtie_flags='--very-sensitive')

    # Map the second read side (file2) iteratively into sams/<expName>_2.bam.
    mapping.iterative_mapping(
        bowtie_path=bowtiePath,
        bowtie_index_path=bowtieIndex,
        fastq_path=file2,
        out_sam_path='sams/%s_2.bam' % expName,
        min_seq_len=10,
        len_step=3,
        seq_start=0,
        seq_end=40,
        nthreads=4,  # on Intel Core i7 CPUs 4 threads are as fast as
                     # 8, but leave some room for your other applications
        #max_reads_per_chunk = 10000000,  # optional, on low-memory machines
        temp_dir='tmp',  # optional, keep temporary files here
        bowtie_flags='--very-sensitive')

    # B. Parse the mapped sequences into a Python data structure,
    #    assign the sonicated fragments to restriction fragments.
    mapped_reads = h5dict.h5dict('caul/%s' % expName)
    genome_db    = genome.Genome('../data/caul', chrmFileTemplate="%s.fa", readChrms=[])

    # Both mapped sides are parsed into mapped_reads using the BglII enzyme.
    mapping.parse_sam(
        sam_basename1='sams/%s_1.bam' % expName,
        sam_basename2='sams/%s_2.bam' % expName,
        out_dict=mapped_reads,
        genome_db=genome_db, 
        enzyme_name='BglII')

예제 #45
0
 def export(self, filename):
     """Write every stored chromosome-pair matrix and the resolution to an h5dict.

     Each key of ``self.allKeys`` is a pair of chromosome indices; its data
     is stored under the name "<i> <j>".
     """
     target = h5dict(filename)
     for pair in self.allKeys:
         target["%d %d" % pair] = self.data[pair].getData()
     target["resolution"] = self.resolution
예제 #46
0
    def parseInputData(self, dictLike, **kwargs):
        """Load mapped read pairs from an h5dict file and filter them.

        Fills the per-read vectors (chrms, cuts, strands, dists, mids,
        fraglens, fragids, distances), records read statistics in
        ``self.metadata`` and keeps only pairs that are double-sided, do not
        share a restriction fragment, and do not look like one short
        molecule (same chromosome, opposite strands, facing cut sites at
        most ``self.maximumMoleculeLength`` apart).

        :param dictLike: path of the h5dict file produced by the mapping step
        :param kwargs: accepted but not used here
        :raises IOError: if the file does not exist
        :raises Exception: if no reads are left after filtering
        """

        import numexpr

        if not os.path.exists(dictLike):
            raise IOError('File not found: %s' % dictLike)

        dictLike = h5dict(dictLike, 'r')

        self.chrms1 = dictLike['chrms1']
        self.chrms2 = dictLike['chrms2']
        self.cuts1 = dictLike['cuts1']
        self.cuts2 = dictLike['cuts2']
        self.strands1 = dictLike['strands1']
        self.strands2 = dictLike['strands2']
        self.dists1 = np.abs(dictLike['rsites1'] - self.cuts1)
        self.dists2 = np.abs(dictLike['rsites2'] - self.cuts2)
        # Floor division keeps the midpoints integral on Python 3 as well
        # (plain ``/`` yields floats there).
        self.mids1 = (dictLike['uprsites1'] + dictLike['downrsites1']) // 2
        self.mids2 = (dictLike['uprsites2'] + dictLike['downrsites2']) // 2
        self.fraglens1 = np.abs(
            (dictLike['uprsites1'] - dictLike['downrsites1']))
        self.fraglens2 = np.abs(
            (dictLike['uprsites2'] - dictLike['downrsites2']))
        self.fragids1 = self.mids1 + np.array(self.chrms1,
                                              dtype='int64') * self.fragIDmult
        self.fragids2 = self.mids2 + np.array(self.chrms2,
                                              dtype='int64') * self.fragIDmult

        distances = np.abs(self.mids1 - self.mids2)
        distances[self.chrms1 != self.chrms2] = -1
        self.distances = distances  # Distances between restriction fragments
        del distances

        self.N = len(self.chrms1)

        # Use the genome labels stored with the data when present; otherwise
        # assume the data was mapped against the full genome at the same path.
        try:
            oldGenome = dictLike['misc']['genome']['idx2label']
            self.updateGenome(self.genome, oldGenome = oldGenome)
        except KeyError:
            assumedGenome = Genome(self.genome.genomePath)
            self.updateGenome(self.genome, oldGenome = assumedGenome)

        # Discard dangling ends and self-circles
        DSmask = (self.chrms1 >= 0) * (self.chrms2 >= 0)
        self.metadata['100_NormalPairs'] = DSmask.sum()

        sameFragMask = self.evaluate("a = (fragids1 == fragids2)",
                     ["fragids1", "fragids2"]) * DSmask

        cutDifs = self.cuts2[sameFragMask] > self.cuts1[sameFragMask]
        s1 = self.strands1[sameFragMask]
        s2 = self.strands2[sameFragMask]
        SSDE = (s1 != s2)
        SS = SSDE * (cutDifs == s2)
        SS_N = SS.sum()
        SSDE_N = SSDE.sum()
        sameFrag_N = sameFragMask.sum()
        self.metadata['120_SameFragmentReads'] = sameFrag_N
        self.metadata['122_SelfLigationReads'] = SS_N
        self.metadata['124_DanglingReads'] = SSDE_N - SS_N
        self.metadata['126_UnknownMechanism'] = sameFrag_N - SSDE_N

        # Unary minus on a boolean array is a TypeError in current NumPy;
        # logical_not is the supported way to invert the mask.
        mask = DSmask * np.logical_not(sameFragMask)

        del DSmask, sameFragMask

        noSameFrag = mask.sum()

        # distance between sites facing each other
        dist = self.evaluate("a = numexpr.evaluate('- cuts1 * (2 * strands1 -1) - "
                             "cuts2 * (2 * strands2 - 1)')",
                             ["cuts1", "cuts2", "strands1", "strands2"],
                             constants={"numexpr":numexpr})

        # Pairs that look like a single short molecule.
        readsMolecules = self.evaluate(
            "a = numexpr.evaluate('(chrms1 == chrms2) & (strands1 != strands2) &  (dist >=0) &"
            " (dist <= maximumMoleculeLength)')",
            internalVariables=["chrms1", "chrms2", "strands1", "strands2"],
            externalVariables={"dist": dist},
            constants={"maximumMoleculeLength": self.maximumMoleculeLength, "numexpr": numexpr})

        mask *= (readsMolecules == False)
        extraDE = mask.sum()
        self.metadata['210_ExtraDanglingReads'] = -extraDE + noSameFrag
        if extraDE == 0:
            raise Exception('No reads left after filtering. Please, check the input data')

        del dist, readsMolecules

        self.maskFilter(mask)
예제 #47
0
# Script: load raw heatmaps into a binnedData object and apply basic filters.
# Expects ``inDatasets`` (name -> heatmap file) and ``genomeFolder`` to be
# defined earlier in the script.
readChrms = ["#",  # read all numbered chromosomes
             "X"]   # add X chromosome

for inDataset in inDatasets.values():
    if not os.path.exists(inDataset):
        raise IOError("Raw heatmap file does not exist: {}".format(inDataset))

if not os.path.isdir(genomeFolder):
    raise IOError("Genome folder does not exist")

# When you do this, be sure that readChrms used to save heatmap matches
# readChrms that you define here!
genome = Genome(genomeFolder, readChrms=readChrms)

# Read resolution from one of the datasets.  list(...) is needed because
# dict.values() is a non-indexable view on Python 3.
sampleDataset = h5dict(list(inDatasets.values())[0], mode="r")  # arbitrary dataset
resolution = int(sampleDataset["resolution"])

# Define the binnedData object, load data
BD = binnedData(resolution, genome, readChrms)
for name, filename in inDatasets.items():
    BD.simpleLoad(filename, name)

BD.removeDiagonal()

# Remove bins with less than half of a bin sequenced
BD.removeBySequencedCount(0.5)

# Remove 1% of regions with low coverage
BD.removePoorRegions(cutoff=1)
예제 #48
0
parser.add_argument("-d", "--datafile", help="the dataset file to be output (default name is datasets.tsv, same dir as runs file)")
args = parser.parse_args()

# Open and parse the runs file: one whitespace-separated record per line.
# Comment lines and blank lines are skipped (a blank line used to break the
# five-field unpacking below).
with open(os.path.join(args.basedir,args.runsfile),"r") as runs_file:
    runs = [run.split() for run in runs_file.readlines()
            if run.strip() and not run.startswith("#")]

# Process each record in the runs file, write out to the data sets file.
# The context manager closes the output even when a dataset is rejected.
with open(os.path.join(args.basedir,args.datafile),"w") as datasets_file:

    # print header for datasets file
    datasets_file.write("# The file has the following structure:\n")
    datasets_file.write("# Filename\tExperiment\tReplicate\tGenome\tRestrictionEnzyme\n")

    for run in runs:
        input_dir, experiment, replicate, genome, restriction_enzyme = run
        filenames = [j for j in os.listdir(os.path.join(args.basedir,input_dir)) if j.endswith(".hdf5") ]
        for fname in filenames:
            path = os.path.join(args.basedir,input_dir,fname)
            try:
                mydict = h5dict(path,'r')
            except Exception:
                # Unreadable files were silently ignored before, after which
                # the checks below ran on an undefined or stale handle.
                # Skip the file explicitly instead.
                print("Warning: could not open %s, skipping" % path)
                continue
            # The bare ``raise`` statements had no active exception to
            # re-raise; fail with an explicit message instead.
            if "strands1" not in mydict:
                raise RuntimeError("No 'strands1' dataset in %s" % path)
            if len(mydict.get_dataset("strands1")) < 10000:
                raise RuntimeError("Fewer than 10000 reads in %s" % path)
            datasets_file.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(os.path.join(input_dir,fname),experiment,
                                                        replicate, genome, restriction_enzyme))
예제 #49
0
def showAllDatasets():
    """
    Draw one corrected, smoothed heatmap per dataset, each with a log-log inset.

    Iterates over the module-level ``datasets`` sequence. For every entry the
    "heatmap" array is read from the h5dict file named by ``hm(dataset)``,
    gap-filled, passed through ``trunc``, ``ultracorrect`` and
    ``adaptiveSmoothing``, and shown in its own subplot. An inset axes in
    the upper-right part of each subplot plots the curve returned by
    ``scaling`` for the average of two diagonal windows of the smoothed map
    (bins 5%-45% and 55%-95%).

    The curve of a reference dataset ('Wildtype_0min_BglII_rep1' or
    "ML2000_0hr") is remembered and overlaid in grey on that inset and on
    the inset of every dataset plotted after it.

    Takes no arguments and returns nothing; finishes with ``plt.show()``.
    """
    setExceptionHook()

    #plt.figure(figsize=(25, 15))
    fig = plt.figure()

    #size of the figure in pixels (inches * dpi); used further down to turn
    #the inset's display coordinates into figure fractions
    fw = fig.get_figwidth() * fig.get_dpi()
    fh = fig.get_figheight() * fig.get_dpi()

    #get subplot configuration
    sx, sy = subplots(len(datasets))

    #scaling curve of the most recent reference dataset, None until one is met
    #(explicit state instead of probing locals() for the name)
    myscaling = None

    for j, dataset in enumerate(datasets):
        plt.subplot(sx, sy, j + 1)
        heatmap = 1. * h5dict(hm(dataset), 'r')["heatmap"]

        #fill in gaps - obsolete, as heatmaps are with overlaps
        #Bins with a zero column sum copy the previous row/column; for bin 0
        #the index -1 wraps around to the last bin.
        for _ in range(1):
            zeros = np.sum(heatmap, axis=0) == 0
            zeros = np.nonzero(zeros)[0]
            heatmap[zeros] = heatmap[zeros - 1]
            heatmap[:, zeros] = heatmap[:, zeros - 1]

        #regular IC protocol: blank the three central diagonals, truncate,
        #correct, then refill those diagonals relative to the second diagonal
        mirnylib.numutils.fillDiagonal(heatmap, 0, 0)
        mirnylib.numutils.fillDiagonal(heatmap, 0, 1)
        mirnylib.numutils.fillDiagonal(heatmap, 0, -1)
        heatmap = trunc(heatmap, low=0, high=0.0001)
        heatmap = ultracorrect(heatmap)
        diag2value = np.mean(np.diagonal(heatmap, 2))
        mirnylib.numutils.fillDiagonal(heatmap, 1.5 * diag2value, 0)
        mirnylib.numutils.fillDiagonal(heatmap, 1.2 * diag2value, 1)
        mirnylib.numutils.fillDiagonal(heatmap, 1.2 * diag2value, -1)
        newHeatmap = heatmap

        #Top highly expressed genes
        #genePos = [18, 56, 77, 117, 143, 215, 234, 256, 266, 286, 300, 326, 336, 367, 379]
        geneCoor = [1162773, 3509071, 1180887, 543099, 1953250, 2522439, 3328524, 1503879, 900483, 242693, 3677144, 3931680, 3677704, 3762707, 3480870, 3829656, 1424678, 901855, 1439056, 3678537]

        # here we commited to 10kb resolution - change below if you're not
        genePos = [i / 10000. for i in geneCoor]

        # The gene overlay is switched off: the positions computed above are
        # discarded here. Remove this line to draw the lines again.
        genePos = []

        #putting lines at highly expressed genes
        for lpos in genePos:
            plt.hlines(lpos , 0, 500, linewidth=0.7, color="black", alpha=0.2, zorder=1)
            plt.vlines(lpos , 0, 500, linewidth=0.7, color="black", alpha=0.2, zorder=1)

        #performing adaptive smoothing, normalised by the mean column sum of
        #the corrected (unsmoothed) heatmap
        smoothedHeatmap = adaptiveSmoothing(newHeatmap, 20)
        smoothedHeatmap /= np.mean(np.sum(heatmap, axis=0))

        #print dataset, sum([np.diagonal(smoothedHeatmap, i).sum() for i in range(60, 140)])
        #maps = [[smoothedHeatmap, smoothedHeatmap[:30]],
        #         [smoothedHeatmap[:, :30], smoothedHeatmap[:30, :30]]]
        #smoothedHeatmap = np.hstack([np.vstack(i) for i in maps])

        plt.title(dataset, fontsize=10)
        plt.imshow((smoothedHeatmap), interpolation="none", vmax=0.035, cmap="acidblues", zorder=0)
        #plt.imshow((smoothedHeatmap), interpolation="nearest", vmin=0, vmax=np.exp(-4.5), cmap="fall", zorder=0)
        plt.xticks([])
        plt.yticks([])

        plt.subplots_adjust(left=0.05,  # the left side of the subplots of the figure
                            right=0.95,  # the right side of the subplots of the figure
                            bottom=0.05,  # the bottom of the subplots of the figure
                            top=0.95,  # the top of the subplots of the figure
                            wspace=0.1,  # the amount of width reserved for blank space between subplots
                            hspace=0.2)
        #cPickle.dump(scaling, open(dataset.split("/")[-1] + "scaling", 'w'))
        #plt.ylim((400, 200))
        #plt.xlim((0, 200))

        #code below just puts the P(s) over the heatmap
        #a translucent grey triangle is drawn over part of the map first
        N = len(smoothedHeatmap)
        pts = np.array([[1, 0], [N, N], [N, 0]])
        p = Polygon(pts, closed=True, facecolor=(0.8, 0.8, 0.8), linewidth=0, alpha=0.7, zorder=2)
        ax = plt.gca()
        ax.add_patch(p)

        #inset rectangle given in axes coordinates, converted to figure
        #fractions because fig.add_axes expects those
        Bbox = matplotlib.transforms.Bbox.from_bounds(.55, .55, .35, .42)
        tBbox = matplotlib.transforms.TransformedBbox(Bbox, ax.transAxes).get_points()
        l, b, w, h = tBbox[0, 0] / fw, tBbox[0, 1] / fh, (tBbox[1, 0] - tBbox[0, 0]) / fw, (tBbox[1, 1] - tBbox[0, 1]) / fh
        # NOTE(review): `axisbg` was deprecated in matplotlib 2.0 in favour of
        # `facecolor`; kept as-is because the matplotlib version in use is
        # not visible from here.
        axins = fig.add_axes([l, b, w, h], axisbg=(0, 0, 0, 0), xscale="log", yscale="log")
        removeAxes(ax=axins)
        for xlabel_i in axins.get_xticklabels(): xlabel_i.set_fontsize(6)
        for xlabel_i in axins.get_yticklabels(): xlabel_i.set_fontsize(6)

        #two diagonal windows, averaged and labelled "intra-arm"
        st = int(0.05 * N)
        end = int(0.45 * N)
        st2 = int(0.55 * N)
        end2 = int(0.95 * N)
        #computed once; the same curve is plotted and, for a reference
        #dataset, remembered
        intraArm = scaling(0.5 * (smoothedHeatmap[st:end, st:end] + smoothedHeatmap[st2:end2, st2:end2]))
        axins.plot(*intraArm, color="blue", label="intra-arm")
        if (dataset in ['Wildtype_0min_BglII_rep1', "ML2000_0hr"]):
            myscaling = intraArm
        #axins.plot(*scaling(smoothedHeatmap[st:end, end2:st2:-1]), color="green", label="inter-arm")
        axins.set_xlabel("kb", fontsize=6)
        axins.set_ylabel("Pc", fontsize=6)
        axins.grid()

        if myscaling is not None:
            axins.plot(*myscaling, color="grey")

        #axins.set_xticks([])
        #axins.set_yticks([])
        #axins.tick_params(color="red")

        #axins.set_xlabel("Mb")
        #axins.set_ylabel("Pc")
        #hide every second tick line of the inset
        for i, line in enumerate(axins.get_xticklines() + axins.get_yticklines()):
            if i % 2 == 1:  # odd indices
                line.set_visible(False)

        #if dataset != "Wildtype_0min_BglII_rep1":
        #    data = cPickle.load(open("scalings/{0}".format(dataset)))
        #    axins.plot(*data, color="blue")

        #axins.xscale("log")
        #axins.yscale("log")

        #end strange code

    plt.show()
예제 #50
0
#!/usr/bin/env python

import sys

from hiclib import mapping, fragmentHiC
from mirnylib import h5dict, genome

# Root directory of the test data, taken from the first command-line argument.
basedir = sys.argv[1]

# One h5dict per sequencing run; each is handed to mapping.parse_sam below
# as its `out_dict`.
mapped_reads1 = h5dict.h5dict('%s/Data/Timing/mapped_reads1.hdf5' % basedir)
mapped_reads2 = h5dict.h5dict('%s/Data/Timing/mapped_reads2.hdf5' % basedir)
mapped_reads3 = h5dict.h5dict('%s/Data/Timing/mapped_reads3.hdf5' % basedir)
# Genome restricted to chromosome '1'; FASTA file names follow "<chrm>.fa".
genome_db    = genome.Genome('%s/Data/Genome/mm9_fasta' % basedir, readChrms=['1'], chrmFileTemplate="%s.fa")

# Run SRR443886: both sides of the read pairs go into mapped_reads1.
mapping.parse_sam(
    sam_basename1='%s/Data/Timing/SRR443886_sub_1.bam' % basedir,
    sam_basename2='%s/Data/Timing/SRR443886_sub_2.bam' % basedir,
    out_dict=mapped_reads1,
    genome_db=genome_db, 
    enzyme_name='NcoI')

# Run SRR443887: both sides of the read pairs go into mapped_reads2.
mapping.parse_sam(
    sam_basename1='%s/Data/Timing/SRR443887_sub_1.bam' % basedir,
    sam_basename2='%s/Data/Timing/SRR443887_sub_2.bam' % basedir,
    out_dict=mapped_reads2,
    genome_db=genome_db, 
    enzyme_name='NcoI')

mapping.parse_sam(
    sam_basename1='%s/Data/Timing/SRR443888_sub_1.bam' % basedir,
    sam_basename2='%s/Data/Timing/SRR443888_sub_2.bam' % basedir,
예제 #51
0
from mirnylib.h5dict import h5dict
import numpy as np
import sys
import os

# Genome folder comes from the first command-line argument; only
# chromosomes 1-5 are read.
genome = Genome(sys.argv[1], readChrms=["1", "2", "3", "4", "5"])

# Chromosome-by-chromosome pipeline at 1 Mb, loaded from the by-chromosome
# heatmap file and then filtered and iteratively corrected.
a = HiResHiC(genome, 1000000, "hiResDict", mode='w')
a.loadData(dictLike="../fragmentHiC/test-1M-byChr.hm")
a.removeDiagonal()
a.removePoorRegions(2)
a.iterativeCorrection(1e-10)

# Whole-genome pipeline at the same resolution, used as the reference.
b = binnedData(1000000, genome)

# Crop the all-by-all heatmap to the last bin of the loaded chromosomes so
# that it covers the same region as `genome`.
data = {"heatmap": h5dict("../fragmentHiC/test-1M.hm")["heatmap"]}
lim = b.genome.chrmEndsBinCont[-1]
data["heatmap"] = data["heatmap"][:lim, :lim]

# Same sequence of steps as for `a`, with matching cutoff and tolerance.
b.simpleLoad(data, "data")
b.removeDiagonal()
b.removePoorRegions(cutoff=2)
b.iterativeCorrectWithoutSS(tolerance=1e-10)
a.export("testExport")

def compareData():
    """
    Fetch the matrices of both pipelines and rescale each to mean 1.

    Reads the module-level objects ``a`` and ``b``. Both divisions are done
    in place, so the objects the two lookups return are modified.

    NOTE(review): the body ends right after the normalisation in this
    listing; the actual comparison appears to have been cut off.
    """
    dataHigh = a.getCombinedMatrix()
    dataLow = b.dataDict["data"]

    # Bring both matrices to a common scale before comparing them.
    dataHigh /= dataHigh.mean()
    dataLow /= dataLow.mean()
예제 #52
0
    print "Checking for numpy version..",
    try:
        nv = numpy.__version__
        nums = tuple([int(i) for i in nv.split('.')[:2]])
        assert nums >= (1, 6)
        print "Correct!"
    except:
        print "numpy version is %s" % nv
        print "Needs at least numpy 1.6"
        print "See manual for numpy installation guide"
        raise RuntimeError("Wrong numpy version")

    print "Checking for mirnylib.h5dict install..",
    from mirnylib.h5dict import h5dict
    a = h5dict()
    b = numpy.empty(1000000, dtype="int16")
    c = "bla bla bla"
    a["numpy"] = b
    a["object"] = c
    assert (a["numpy"] - b).sum() == 0
    print "H5dict test successful!"

    print "Checking for joblib..",
    try:
        import joblib
        print "Found!"
    except:
        print "joblib not found"
        raise RuntimeError("joblib not found")
예제 #53
0
#!/usr/bin/env python

import sys
import os

from hiclib import mapping, fragmentHiC
from mirnylib import h5dict, genome

# Command line: <fasta_dir> <enzyme> <out_fname> <in_dir> [prefix ...]
cli_args = sys.argv[1:]
fasta_dir, re_name, out_fname, in_dir = cli_args[:4]
in_prefices = cli_args[4:]
# Per-prefix h5dict files are placed in the directory of the output file.
basedir = os.path.dirname(os.path.abspath(out_fname))

# One h5dict per input prefix, in the same order as in_prefices.
mapped_reads = []
for prefix in in_prefices:
    hdf5_path = '%s/%s.hdf5' % (basedir, prefix)
    mapped_reads.append(h5dict.h5dict(hdf5_path))
genome_db = genome.Genome(fasta_dir, readChrms=['#', 'X'], chrmFileTemplate="%s.fa")

# Parse the "_1"/"_2" BAM pair of every prefix into its own h5dict.
for i, (prefix, name) in enumerate(zip(in_prefices, mapped_reads)):
    first_side = "%s/%s_1.bam" % (in_dir, prefix)
    second_side = "%s/%s_2.bam" % (in_dir, prefix)
    mapping.parse_sam(
        sam_basename1=first_side,
        sam_basename2=second_side,
        out_dict=name,
        genome_db=genome_db,
        enzyme_name=re_name)

for i, name in enumerate(mapped_reads):
    fragments = fragmentHiC.HiCdataset(
        filename='temp',
        genome=genome_db,
        maximumMoleculeLength=500,
        mode='w',
예제 #54
0
    def saveByChromosomeHeatmap(self, filename, resolution = 40000,
                                includeTrans = False,
                                countDiagonalReads = "Once"):
        """
        Saves chromosome by chromosome heatmaps to h5dict.

        This method is not as memory demanding as saving all x all heatmap.

        Keys of the h5dict are of the format ["1 1"], where chromosomes are
        zero-based, and there is one space between numbers. The resolution
        is stored under the key "resolution". Trans heatmaps are stored
        once, under the key with the smaller chromosome index first.

        Parameters
        ----------
        filename : str
            Filename of the h5dict with the output

        resolution : int
            Resolution to save heatmaps

        includeTrans : bool, optional
            Build inter-chromosomal heatmaps (default: False)

        countDiagonalReads : "once" or "twice"
            How many times to count reads in the diagonal bin
            (case-insensitive)

        Raises
        ------
        ValueError
            If countDiagonalReads is neither "once" nor "twice".

        """
        diagonalMode = countDiagonalReads.lower()
        if diagonalMode not in ["once", "twice"]:
            raise ValueError("Bad value for countDiagonalReads")
        # Decided once here instead of on every chromosome.
        halveDiagonal = (diagonalMode == "once")

        self.genome.setResolution(resolution)

        # Bin index of each read side at the requested resolution.
        pos1 = self.evaluate("a = np.array(mids1 / {res}, dtype = 'int32')"
                             .format(res=resolution), "mids1")
        pos2 = self.evaluate("a = np.array(mids2 / {res}, dtype = 'int32')"
                             .format(res=resolution), "mids2")

        chr1 = self.chrms1
        chr2 = self.chrms2

        # DS = self.DS  # 13 bytes per read up to now, 16 total
        mydict = h5dict(filename)

        chrmCount = self.genome.chrmCount
        chrmLensBin = self.genome.chrmLensBin

        # range() instead of xrange(): works on Python 2 and 3, and the
        # number of chromosomes is small.
        for chrom in range(chrmCount):
            if includeTrans:
                # reads with at least one side on this chromosome
                mask = ((chr1 == chrom) + (chr2 == chrom))
            else:
                # reads with both sides on this chromosome
                mask = ((chr1 == chrom) * (chr2 == chrom))
            # Located chromosomes and positions of chromosomes
            c1, c2, p1, p2 = chr1[mask], chr2[mask], pos1[mask], pos2[mask]
            if includeTrans:
                # moving different chromosomes to c2
                # c1 == chrom now
                mask = (c2 == chrom) * (c1 != chrom)
                c1[mask], c2[mask], p1[mask], p2[mask] = c2[mask].copy(), c1[
                    mask].copy(), p2[mask].copy(), p1[mask].copy()
                del c1  # ignore c1
                # sort by partner chromosome so searchsorted below can slice
                # out the reads of every chromosome pair
                args = np.argsort(c2)
                c2 = c2[args]
                p1 = p1[args]
                p2 = p2[args]
                partners = range(chrom, chrmCount)
            else:
                # cis only: c2 holds nothing but `chrom`, hence is sorted
                partners = [chrom]

            for chrom2 in partners:
                start = np.searchsorted(c2, chrom2, "left")
                end = np.searchsorted(c2, chrom2, "right")
                cur1 = p1[start:end]
                cur2 = p2[start:end]
                # Flattened (row, column) index of every read. np.array
                # always copies, so the in-place arithmetic below can never
                # write through to p1.
                label = np.array(cur1, dtype="int64")
                label *= chrmLensBin[chrom2]
                label += cur2
                maxLabel = chrmLensBin[chrom] * chrmLensBin[chrom2]
                counts = np.bincount(label, minlength = maxLabel)
                assert len(counts) == maxLabel
                mymap = counts.reshape((chrmLensBin[chrom], -1))
                if chrom == chrom2:
                    mymap = mymap + mymap.T
                    if halveDiagonal:
                        # The symmetrised diagonal is exactly twice the read
                        # count, so floor division is exact and keeps the
                        # values integer on Python 3 as well.
                        fillDiagonal(mymap, np.diag(mymap).copy() // 2)
                mydict["%d %d" % (chrom, chrom2)] = mymap

        mydict['resolution'] = resolution
예제 #55
0
def refine_paper(filename, create=True):
    """Parse, merge and filter fragment data, then check the saved heatmaps.

    filename[0] is a list of filenames of incoming files
    filename[1] is used as the path prefix of every output file
    (for example filename[1] + "_merged.frag")

    With create == True the input files are parsed, merged into one dataset
    and filtered before the checks run. The genome location is read from
    the module-level name genomeFolder.

    NOTE(review): with create == False nothing in this function assigns TR
    before TR.printMetadata is called - confirm where TR is expected to come
    from in that case.
    """
    if create == True:
        for onename in filename[0]:
            #Parsing individual files
            if not os.path.exists(onename):
                raise StandardError("path not found: %s" % onename)
            TR = HiCdataset("bla", genome=genomeFolder, enzymeName="HindIII",maximumMoleculeLength=500, inMemory=True)
            print "\nTesting loading new data without rsite information    "
            TR.parseInputData(dictLike=onename,
                              enzymeToFillRsites="HindIII")
            #assert len(TR.DS) == 856143

            #assert len(TR.ufragments) == 634572
            TR.save(onename + "_parsed.frag")

        #Merging files alltogether, applying filters
        TR = HiCdataset(filename[1] + "_merged.frag",enzymeName = "HindIII",
                        genome=genomeFolder, mode="w")
        TR.merge([i + "_parsed.frag" for i in filename[0]])

        #Work on an in-memory dataset loaded from the merged file
        TR = HiCdataset("refined", genome=genomeFolder,enzymeName = "HindIII",
                        mode="w", inMemory=True)

        print "\nTesting chunking during all tests"
        TR.chunksize = 30000
        #because we do many operations, we disable autoFlush here
        TR.load(filename[1] + "_merged.frag")

        print "\nTesting Rsite filter"
        TR.filterRsiteStart(offset=5)

        #assert len(TR.DS) == 832110

        print "\nTesting duplicate filter"
        TR.filterDuplicates(chunkSize = 30000)        

        #assert len(TR.DS) == 830275

        print "\nTesting small/large and extreme fragment filter"
        TR.filterLarge()

        #assert len(TR.DS) == 825442
        TR.filterExtreme(cutH=0.005, cutL=0)
        TR.writeFilteringStats()

        #assert len(TR.DS) == 803845


    #-------------------------------------------
    #Compare the metadata of this run with the pickled reference in the
    #file "sampleMetadata"; any mismatch aborts the test.
    TR.printMetadata(saveTo="metadata")
    import cPickle

    stop = False
    mdata = cPickle.load(open("sampleMetadata"))
    for i in sorted(mdata.keys()):
        if TR.metadata[i] != mdata[i]:
            print "Key {0} is not consistent: should be {1}, is {2}".format(i, mdata[i], TR.metadata[i])
            stop = True
    if stop == True:
        print ("""------------_ERROR_--------------
        Inconsistent metadata: see above
        ----------------------------------------""")
        raise ValueError("Inconsistent Metadata")


    print "Testing allxall and by-chromosome heatmap counting diagonal twice"

    print "----> saving allxall heatmap"
    TR.saveHeatmap(filename[1] + "-1M.hm", 1000000,
                   countDiagonalReads="twice")
    a = h5dict(filename[1] + "-1M.hm")
    #Cut the chromosome 1 cis block and the chromosome 1 x 2 trans block
    #(zero-based indices) out of the all-by-all heatmap
    st, end = TR.genome.chrmStartsBinCont[1], TR.genome.chrmEndsBinCont[1]
    st2, end2 = TR.genome.chrmStartsBinCont[2], TR.genome.chrmEndsBinCont[2]
    chrom1 = a["heatmap"][st:end, st:end]
    chrom12 = a["heatmap"][st:end, st2:end2]
    setExceptionHook()
    print "----> saving by chromosome heatmap"
    #NOTE(review): this writes to the same "-1M.hm" file as the all-by-all
    #heatmap above, while the "once" test below uses a separate
    #"-1M-byChr.hm" file - confirm this is intended.
    TR.saveByChromosomeHeatmap(
        filename[1] + "-1M.hm", resolution=1000000, includeTrans=True,
        countDiagonalReads="twice")

    b = h5dict(filename[1] + "-1M.hm")["1 1"]
    bb = h5dict(filename[1] + "-1M.hm")["1 2"]
    assert (b - chrom1).sum() == 0
    print "Cis heatmap consistent"
    assert (bb - chrom12).sum() == 0
    print 'Trans heatmap consistent'
    print  a["heatmap"][::10, ::10].sum()
    #assert  a["heatmap"][::10, ::10].sum() == 21800
    print "Heatmap sum correct\n"

    #---------------------------------
    print "Testing allxall and by-chromosome heatmap counting diagonal once"

    TR.saveHeatmap(filename[1] + "-1M.hm", 1000000,
                   countDiagonalReads="once")
    Ta = h5dict(filename[1] + "-1M.hm")
    st, end = TR.genome.chrmStartsBinCont[1], TR.genome.chrmEndsBinCont[1]
    st2, end2 = TR.genome.chrmStartsBinCont[2], TR.genome.chrmEndsBinCont[2]
    chrom1 = Ta["heatmap"][st:end, st:end]
    chrom12 = Ta["heatmap"][st:end, st2:end2]
    setExceptionHook()
    print "----> saving by chromosome heatmap"
    TR.saveByChromosomeHeatmap(
        filename[1] + "-1M-byChr.hm", resolution=1000000, includeTrans=True,
        countDiagonalReads="once")
    
    TR.saveHiResHeatmapWithOverlaps(filename[1]+"-1M-highRes.hm", resolution=50000, countDiagonalReads="twice")
    TR.saveSuperHighResMapWithOverlaps(filename[1]+"-5k-SuperHighRes.hm", resolution=5000,chromosomes = [14], countDiagonalReads="twice")

    Tb = h5dict(filename[1] + "-1M-byChr.hm")["1 1"]
    Tbb = h5dict(filename[1] + "-1M-byChr.hm")["1 2"]
    assert ((Tb - chrom1) == 0).all()
    assert ((Tbb - chrom12) == 0).all()
    #Adding the diagonal of the "once" map a second time must reproduce the
    #"twice" map `b` saved earlier
    assert ((Tb + np.diag(np.diag(Tb))) == b).all()
    print "Diagonal counting methods are consistent\n"
    
    #Double the diagonal of the "once" block before comparing it with the
    #high-resolution map, which was saved with countDiagonalReads="twice"
    newchrom1 = chrom1.copy()
    for i in xrange(len(newchrom1)):
        newchrom1[i,i] = 2 * newchrom1[i,i]
    
    Tb = h5dict(filename[1] + "-1M-highRes.hm")["1 1"]
    assert np.abs(Tb.sum() - newchrom1.sum()) < 1
    #50 kb bins coarse-grained by a factor of 20 are compared with the 1 Mb block
    assert np.sum(np.abs(coarsegrain(Tb,20,True) - newchrom1)) < 500
    

    #------------------------------
    print "Testing updateGenome method"
    from mirnylib.genome import Genome
    #Flags (1) for the chromosome indices 1-4 and 22 of the current genome
    removeChromIDs = np.array([0, 1, 1, 1, 1] + [0] * 17 + [1] + [0])
    #print ((removeChromIDs[TR.chrms1] == 1) + (removeChromIDs[TR.chrms2] == 1) ).sum()
    #Count computed before the update: reads with both sides on flagged
    #chromosomes, plus reads whose first side is flagged and whose chrms2
    #entry is -1. It is compared with TR.N after updateGenome.
    t = ((removeChromIDs[TR.chrms1] == 1) * (removeChromIDs[TR.chrms2] == 1)).sum() + ((removeChromIDs[TR.chrms1] == 1) * (TR.chrms2 == -1)).sum()
    newGenome = Genome(genomePath=genomeFolder, readChrms=["2",
                                                           "3", "4", "5", "X"])
    TR.updateGenome(newGenome)
    assert  TR.N == t

    a = h5dict(filename[1] + "-1M.hm")["heatmap"]
예제 #56
0
def toSparse(source, idx2label, csr = False):
    """
    Convert intra-chromosomal contact matrices to sparse ones.

    Every key of the source h5dict made of one repeated chromosome index
    (e.g. "3 3") is written as ``<label>.npy`` into a zip archive next to
    the source file. The value stored under 'resolution', if present, is
    written as ``resolution.npy``. The archive is named after ``source``
    with '.hm' replaced by '-sparse.npz' (or '-csrsparse.npz' when ``csr``
    is set); if that replacement would leave the name unchanged, the suffix
    is appended instead so the input is never overwritten.

    Parameters
    ----------
    source : str
         Hdf5 file name.

    idx2label : dict
        A dictionary for conversion between zero-based indices and
        string chromosome labels.

    csr : bool
        Whether to use CSR (Compressed Row Storage) format or not.
        When False, the non-zero entries of the upper triangle are stored
        as a structured array with fields 'bin1', 'bin2' and 'IF'.

    """
    import zipfile, tempfile
    from numpy.lib.format import write_array
    from scipy import sparse

    lib = h5dict(source, mode = 'r')

    ## Uniform numpy-structured-array format.
    ## The builtin int/float are what the removed np.int/np.float aliases
    ## referred to, so the resulting dtype is unchanged.
    itype = np.dtype({'names':['bin1', 'bin2', 'IF'],
                      'formats':[int, int, float]})

    ## Create a Zip file in NPZ case
    suffix = '-csrsparse.npz' if csr else '-sparse.npz'
    output = source.replace('.hm', suffix)
    if output == source:
        # No '.hm' in the name: writing to `output` would clobber the input.
        output = source + suffix

    # Scratch file: each array is serialised here and then copied into the
    # archive under its final member name.
    fd, tmpfile = tempfile.mkstemp(suffix = '-numpy.npy')
    os.close(fd)

    def _addMember(archive, array, arcname):
        # Serialise `array` in .npy format and store it as `arcname`.
        with open(tmpfile, 'wb') as fid:
            write_array(fid, np.asanyarray(array))
        archive.write(tmpfile, arcname = arcname)

    try:
        # The context manager closes the archive even when a chromosome
        # fails; the finally clause below always removes the scratch file.
        with zipfile.ZipFile(output, mode = 'w', allowZip64 = True) as Zip:
            log.log(21, 'Sparse Matrices will be saved to %s', output)
            log.log(21, 'Only intra-chromosomal matrices will be taken into account')
            log.log(21, 'Coverting ...')

            count = 0

            for i in lib:
                # Skip the resolution entry and every key naming two
                # different chromosomes.
                if (i == 'resolution') or (len(set(i.split())) != 1):
                    continue

                # Used for the dict-like key
                key = idx2label[int(i.split()[0])]

                log.log(21, 'Chromosome %s ...', key)
                # 2D-Matrix
                H = lib[i]

                if not csr:
                    # Triangle Array
                    Triu = np.triu(H)
                    # Sparse Matrix in Memory
                    x, y = np.nonzero(Triu)
                    values = Triu[x, y]
                    temp = np.zeros(values.size, dtype = itype)
                    temp['bin1'] = x
                    temp['bin2'] = y
                    temp['IF'] = values
                else:
                    temp = sparse.triu(H, format = 'csr')

                _addMember(Zip, temp, key + '.npy')

                log.log(21, 'Done!')

                count += 1

            # Store the resolution information
            if 'resolution' in lib:
                _addMember(Zip, lib['resolution'], 'resolution.npy')

            if count == 0:
                log.warning('Empty source file!')
    finally:
        os.remove(tmpfile)