Пример #1
0
 def prepare_for_PCA(self, data, boundaries=None, iterations=3):
     """Alternate cis substitution and iterative correction on a matrix.

     Every round passes the current matrix through
     ``self.substitute_intra_by_inter`` and then through ``completeIC``.
     The result of one round is the input of the next one; previously
     each round restarted from ``data``, so ``iterations`` only repeated
     the same work and ``iterations=0`` failed with UnboundLocalError.

     Args:
         data: matrix handed to ``self.substitute_intra_by_inter``.
         boundaries: forwarded to ``self.substitute_intra_by_inter``;
             ``self.boundaries`` is used when this is None.
         iterations: number of substitute-and-correct rounds.

     Returns:
         The matrix after the last round, or ``data`` itself when
         ``iterations`` is zero or negative.
     """
     if boundaries is None:
         boundaries = self.boundaries
     # Function-scope import kept from the original block.
     from mirnylib.numutils import completeIC

     newdata = data
     for _ in range(iterations):
         # Feed the previous round's output back in.
         newdata = self.substitute_intra_by_inter(newdata, boundaries)
         newdata = completeIC(newdata)
     return newdata
Пример #2
0
    def iterativeCorrection(self, outname):
        """Store ``completeIC``-corrected cis heatmaps in an h5dict.

        For each key in ``self.cisKeys`` the matching entry of
        ``self.rawdata`` is passed through ``completeIC`` (bias vector not
        requested) and written under the same key of the h5dict opened at
        ``outname``.  ``self.resolution`` is written last, under the key
        ``"resolution"``.
        """
        output = h5dict(outname)
        for cisKey in self.cisKeys:
            output[cisKey] = completeIC(self.rawdata[cisKey],
                                        returnBias=False)
        output["resolution"] = self.resolution
Пример #3
0
 def prepare_for_PCA(self, data, boundaries=None, iterations=3):
     """Alternate cis substitution and iterative correction on a matrix.

     Every round passes the current matrix through
     ``self.substitute_intra_by_inter`` and then through ``completeIC``.
     The result of one round is the input of the next one; previously
     each round restarted from ``data``, so ``iterations`` only repeated
     the same work and ``iterations=0`` failed with UnboundLocalError.

     Args:
         data: matrix handed to ``self.substitute_intra_by_inter``.
         boundaries: forwarded to ``self.substitute_intra_by_inter``;
             ``self.boundaries`` is used when this is None.
         iterations: number of substitute-and-correct rounds.

     Returns:
         The matrix after the last round, or ``data`` itself when
         ``iterations`` is zero or negative.
     """
     if boundaries is None:
         boundaries = self.boundaries
     # Function-scope import kept from the original block.
     from mirnylib.numutils import completeIC

     newdata = data
     for _ in range(iterations):
         # Feed the previous round's output back in.
         newdata = self.substitute_intra_by_inter(newdata, boundaries)
         newdata = completeIC(newdata)
     return newdata
Пример #4
0
def showCmap():
    """Show Hi-C data together with the simulated data.

    Hi-C data created by hiclib is needed for that, but you can replace the
    ``mydict = h5dict(...)`` line and the one after it with your own data
    loading code.

    Every pickle found in ``cmaps`` whose first dimension matches the
    rescaled Hi-C window is normalised by its mean row sum, gets the entries
    below the diagonal replaced by the Hi-C values, and is drawn as a log
    heatmap.  Each figure is saved as ``heatmaps/<fname>.png`` and
    ``heatmaps/<fname>.pdf`` and then shown.
    """
    low = 60000
    high = 75000

    low20 = low // 10
    high20 = high // 10
    # here Hi-C data is loaded for display purposes only..... replace it
    # with your own code if your data is in a different format
    mydict = h5dict("/home/magus/HiC2011/Erez2014/hg19/GM12878_inSitu-all-combined-10k_HighRes.byChr",'r')
    hicdata = mydict.get_dataset("13 13")[low20:high20, low20:high20]

    hicdata = completeIC(hicdata)
    newshape = (1000 * (high - low)) // (600 * 5)
    print(hicdata.shape, newshape)
    hicdata = zoomArray(hicdata, (newshape, newshape))
    # Clip the brightest entries, then normalise by the mean row sum.
    hicdata = np.clip(hicdata, 0, np.percentile(hicdata, 99.99))
    hicdata /= np.mean(np.sum(hicdata, axis=1))

    for fname in os.listdir("cmaps"):
        # NOTE: pickle.load can execute arbitrary code; only keep files you
        # created yourself in "cmaps".
        # The context manager closes the file even if unpickling fails.
        with open(os.path.join("cmaps", fname), 'rb') as handle:
            cmap = pickle.load(handle)
        arr = cmap
        if arr.shape[0] != hicdata.shape[0]:
            # Maps of a different size cannot share a figure with hicdata.
            continue
        print(arr.shape)

        # Division creates a new array, so the unpickled map is not mutated
        # by the assignment below.
        arr = arr / np.mean(np.sum(arr, axis=1))
        ran = np.arange(len(arr))
        # Entries below the diagonal show Hi-C, the rest show the simulation.
        mask = ran[:, None] > ran[None, :]
        arr[mask] = hicdata[mask]

        logarr = np.log(arr + 0.0001)
        # noinspection PyTypeChecker
        plt.imshow(logarr, vmax = np.percentile(logarr, 99.99), vmin = np.percentile(logarr, 10), extent = [low, high, high, low], interpolation = "none")
        plt.savefig(os.path.join("heatmaps", fname+".png"))
        plt.savefig(os.path.join("heatmaps", fname+".pdf"))
        plt.show()
        plt.clf()
        plt.clf()
Пример #5
0
def doSaddleError(filename, eig, gen, correct=False):
    """Accumulate a 5x5 quintile matrix and 100 bootstrap replicates of it.

    For every chromosome the cis block of the heatmap is passed through
    ``observedOverExpected``, bins with a zero column sum are dropped, and
    the remaining bins are split into five groups by the quintiles of
    ``eig``.  The mean of each (group i, group j) sub-block is added to
    ``saddle[i, j]``; 100 resamplings with replacement of the same
    sub-block add their means to the matching replicate matrix.

    Args:
        filename: path given to ``h5dict`` (key ``"heatmap"``) and to
            ``getResolution``.
        eig: per-bin track indexed by concatenated bin number, or the string
            ``"GC"`` to use ``np.concatenate(gen.GCBin)``.
        gen: genome folder name, appended to ``/home/magus/HiC2011/data/``.
        correct: when true, run ``completeIC`` on the heatmap first.

    Returns:
        ``(saddle, permutted)``: a 5x5 float array and a list of 100 5x5
        float arrays.  Values are sums over chromosomes, not averages.
    """
    gen = Genome("/home/magus/HiC2011/data/" + gen, readChrms=["#", "X"])
    data = h5dict(filename,'r')["heatmap"]
    if correct:
        data = completeIC(data)
    gen.setResolution(getResolution(filename))
    # Comparing an array with "GC" is elementwise in NumPy, which makes the
    # bare `if eig == "GC"` ambiguous; test the type first.
    if isinstance(eig, str) and eig == "GC":
        eig = np.concatenate(gen.GCBin)

    saddle = np.zeros((5,5), dtype = float)
    permutted = [np.zeros((5,5), dtype = float) for _ in range(100)]

    for chrom in range(gen.chrmCount):
        st = gen.chrmStartsBinCont[chrom]
        end = gen.chrmEndsBinCont[chrom]
        cur = observedOverExpected(data[st:end, st:end])
        mask = np.sum(cur, axis=0) > 0
        cur = cur[mask][:, mask]
        GC = eig[st:end][mask]
        if len(GC) <= 5:
            continue

        # The quintile masks depend only on GC, so build them once per
        # chromosome instead of once per (i, j) cell.
        edges = np.percentile(GC, [0, 20, 40, 60, 80, 100])
        # Both comparisons are strict, as in the original: a value equal to
        # a quintile edge (including the minimum and maximum) is in no group.
        groups = [(GC > lo) & (GC < hi)
                  for lo, hi in zip(edges[:-1], edges[1:])]

        for i in range(5):
            for j in range(5):
                addition = cur[np.ix_(groups[i], groups[j])].reshape(-1)
                for k in range(100):
                    resampled = np.random.choice(addition, len(addition), replace=True)
                    permutted[k][i,j] += resampled.mean()
                saddle[i, j] += addition.mean()
    return saddle, permutted
Пример #6
0
def run():
    """Command-line entry point: correct each chromosome and write peaks.

    Arguments come from ``getargs()``.  Nothing is done when the first
    command is ``-h`` or ``--help``.  Otherwise the root logger gets a
    console handler (INFO) and a rotating file handler (DEBUG), the library
    at ``args.path`` is loaded with ``np.load``, and every selected
    chromosome is symmetrised, passed through ``completeIC`` and handed to
    ``pcaller``.  One tab-separated line per entry of the returned
    ``Donuts`` mapping is written to ``<args.output>.peaks.txt``.
    """
    # Parse Arguments
    args, commands = getargs()
    # Improve the performance if you don't want to run it
    if commands[0] in ['-h', '--help']:
        return

    ## Root Logger Configuration
    logger = logging.getLogger()
    # Logger Level
    logger.setLevel(logging.DEBUG)
    console = logging.StreamHandler()
    filehandler = logging.handlers.RotatingFileHandler(args.logFile,
                                                       maxBytes=100000,
                                                       backupCount=5)
    # Set level for Handlers
    console.setLevel('INFO')
    filehandler.setLevel('DEBUG')
    # Customizing Formatter
    formatter = logging.Formatter(
        fmt='%(name)-14s %(levelname)-7s @ %(asctime)s: %(message)s',
        datefmt='%m/%d/%y %H:%M:%S')

    console.setFormatter(formatter)
    filehandler.setFormatter(formatter)
    # Add Handlers
    logger.addHandler(console)
    logger.addHandler(filehandler)

    ## Logging for argument setting
    arglist = [
        '# ARGUMENT LIST:',
        '# output file prefix = %s' % args.output,
        '# HiC Data Path = %s' % args.path,
        '# chromosomes = %s' % args.chroms,
        '# data resolution = %s' % args.resolution,
        '# Peak window width = %s' % args.pw,
        '# Donut size = %s' % args.ww,
        '# Maximum donut size = %s' % args.maxww,
        '# Significant Level = %s' % args.siglevel,
        '# Genomic distance range = %s' %
        [args.ww * args.resolution, args.maxapart]
    ]

    argtxt = '\n'.join(arglist)
    logger.info('\n' + argtxt)

    # Package Dependencies
    from mirnylib.numutils import completeIC

    logger.info('Locating Hi-C data ...')
    # NOTE(review): entries are unwrapped below with .reshape(1)[0] and
    # .toarray(), which looks like pickled sparse matrices; recent NumPy
    # only loads those with allow_pickle=True - confirm against the data.
    Lib = np.load(args.path)

    logger.info('Calling Peaks ...')
    head = '\t'.join([
        'chromLabel', 'loc_1', 'loc_2', 'IF', 'D-Enrichment', 'D-pvalue',
        'D-qvalue', 'LL-Enrichment', 'LL-pvalue', 'LL-qvalue'
    ]) + '\n'
    lineFormat = '%s\t%d\t%d\t%.4g\t%.4g\t%.4g\t%.4g\t%.4g\t%.4g\t%.4g\n'

    # Text mode: everything written below is a str, which a binary ('wb')
    # handle rejects on Python 3.  The context manager flushes and closes
    # the file even when a chromosome fails part-way through.
    with open('.'.join([args.output, 'peaks', 'txt']), 'w') as OF:
        OF.write(head)

        for key in Lib.files:
            selected = ((not args.chroms)
                        or (key.isdigit() and '#' in args.chroms)
                        or (key in args.chroms))
            if not selected:
                continue

            logger.info('Chromosome %s ...', key)
            sparseH = Lib[key].reshape(1)[0]
            triuH = sparseH.toarray()
            # Mirror across the diagonal; the diagonal itself would be
            # counted twice, so subtract it once.
            H = triuH + triuH.T - np.diag(
                triuH.diagonal())  # Symmetric Matrix
            del sparseH, triuH  # Release Memory
            logger.info('Perform ICE ...')
            cHeatMap, biases = completeIC(H, returnBias=True)
            logger.info('Done!')

            logger.info('Customize Sparse Matrix ...')
            chromLen = H.shape[0]
            num = args.maxapart // args.resolution + args.maxww + 1
            Diags = [np.diagonal(H, i) for i in np.arange(num)]
            M = sparse.diags(Diags, np.arange(num), format='csr')
            x = np.arange(args.ww, num)
            y = []
            cDiags = []
            for i in x:
                diag = np.diagonal(cHeatMap, i)
                y.append(diag.mean())
                cDiags.append(diag)
            cM = sparse.diags(cDiags, x, format='csr')
            # Fit the mean corrected signal per diagonal against the offset.
            IR = isotonic.IsotonicRegression(increasing='auto')
            IR.fit(x, y)

            del H, cHeatMap

            Donuts, LL = pcaller(M,
                                 cM,
                                 biases,
                                 IR,
                                 chromLen,
                                 Diags,
                                 cDiags,
                                 num,
                                 pw=args.pw,
                                 ww=args.ww,
                                 sig=args.siglevel,
                                 maxww=args.maxww,
                                 maxapart=args.maxapart,
                                 res=args.resolution)

            for i in Donuts:
                # i, Donuts[i] and LL[i][1:] are concatenated as tuples to
                # fill the ten fields of lineFormat.
                contents = (key, ) + i + Donuts[i] + LL[i][1:]
                OF.write(lineFormat % contents)

    logger.info('Done!')
Пример #7
0
#!/usr/bin/python

import sys
import numpy as np
from mirnylib.numutils import completeIC

def zero_low_coverage_bins(matrix, fraction):
    """Zero, in place, the rows and columns with the lowest row means.

    The cutoff is the row mean found at position
    ``int(round(n_rows * fraction))`` of the sorted row means; that position
    is clamped to the valid range so ``fraction == 1.0`` no longer indexes
    past the end.  Rows whose mean is strictly below the cutoff are zeroed
    together with the columns of the same index, so the matrix is expected
    to be square.

    Args:
        matrix: 2-D numpy array, modified in place.
        fraction: position of the cutoff as a fraction of the row count.

    Returns:
        The same ``matrix`` object.
    """
    row_means = np.mean(matrix, axis=1)
    if row_means.size == 0:
        return matrix
    rank = int(round(len(row_means) * fraction))
    rank = min(max(rank, 0), len(row_means) - 1)
    cutoff = np.sort(row_means)[rank]
    # A boolean mask selects the same rows/columns as the np.where() tuple
    # did, without relying on a nested-tuple index.
    low = row_means < cutoff
    matrix[low, :] = 0
    matrix[:, low] = 0
    return matrix


def main():
    """Read a matrix, drop low-coverage bins, run completeIC and save it.

    Command line: ``<input matrix> <cutoff fraction> <output file>``.
    """
    if len(sys.argv) < 4:
        sys.exit("usage: %s <input matrix> <cutoff fraction> <output file>"
                 % sys.argv[0])
    inp_file = sys.argv[1]
    percent_cutoff = float(sys.argv[2])
    out_file = sys.argv[3]

    # read matrix
    inp_matrix = np.loadtxt(inp_file)

    # filter matrix by row mean
    zero_low_coverage_bins(inp_matrix, percent_cutoff)

    # IC
    out_matrix = completeIC(inp_matrix)

    # '%.3e' writes every value in scientific notation.
    np.savetxt(out_file, out_matrix, fmt='%.3e')


if __name__ == "__main__":
    main()
Пример #8
0
def showCmapNew():
    """Show a simulated map and the Hi-C map above forward/reverse tracks.

    For each listed contact-map pickle in ``cmaps`` the top panel stacks the
    pivoted simulated map (flipped, times 3) on the pivoted Hi-C map (times
    1.5), and the lower panel draws the coarse-grained arrays returned by
    ``getForwBacv(mu)`` as vertical lines.  ``mu`` is parsed from the text
    between ``"Mu"`` and ``"_r="`` in the file name.

    The original body also contained a loop that saved zoomed PNG/PDF
    windows, but it sat after an unconditional ``continue`` and never ran;
    it has been removed, so this function only displays figures.
    """
    plt.figure(figsize=(8,8))
    low = 60000
    high = 75000

    low20 = low // 10
    high20 = high // 10
    mydict = h5dict("/home/magus/HiC2011/Erez2014/hg19/GM12878_inSitu-all-combined-10k_HighRes.byChr",'r')

    hicdata = mydict.get_dataset("13 13")[low20:high20, low20:high20]
    hicdata = completeIC(hicdata)
    resolutionMon = 5
    newshape = (1000 * (high - low)) // (600 * resolutionMon)
    print(hicdata.shape, newshape)
    hicdata = zoomArray(hicdata, (newshape, newshape))
    # Clip the brightest entries, then normalise by the mean row sum.
    hicdata = np.clip(hicdata, 0, np.percentile(hicdata, 99.99))
    hicdata /= np.mean(np.sum(hicdata, axis=1))

    diags = 1000

    #for fname in os.listdir("cmaps"):
    for fname in ["cmapflagshipLifetime300Mu3_r=8.pkl"]:
        if ("r=8" not in fname) or ("Lifetime" not in fname):
            print("not going", fname)
            continue
        try:
            mu = float(fname.split("_r=")[0].split("Mu")[1])
        except (IndexError, ValueError):
            # No "Mu" in the name, or the text after it is not a number.
            continue
        forw, rev = getForwBacv(mu)

        # NOTE: pickle.load can execute arbitrary code; only keep files you
        # created yourself in "cmaps".
        with open(os.path.join("cmaps", fname), 'rb') as handle:
            cmap = pickle.load(handle)
        arr = cmap

        if arr.shape[0] != hicdata.shape[0]:
            continue
        arr = arr / np.mean(np.sum(arr,axis=1))
        # Scale a copy: the in-place `hicdata *= 1.5` would compound on
        # every iteration once more than one file is listed.
        scaledHic = hicdata * 1.5
        print(arr.shape)
        ax = plt.subplot(211)
        turned = pivotHeatmap(arr, diags)[::-1] * 3
        turned2 = pivotHeatmap(scaledHic, diags)
        turned = np.concatenate([turned, turned2], axis=0)
        halfHeight = (high - low) * diags / len(arr)
        myextent = [low, high, -halfHeight, halfHeight]
        plt.imshow(np.log(turned + 0.0001) , aspect=0.5,cmap = "fall", vmax = -4, vmin = -8,
                   extent=myextent , interpolation = "none")

        plt.subplot(413, sharex = ax)
        xaxis = np.arange(len(forw) // 20) * 12 + low
        forwcg = coarsegrain(forw, 20)
        revcg = coarsegrain(rev, 20)
        # Labels (taken from the scatter calls that used to be here) give
        # plt.legend() below something to show.
        plt.vlines(xaxis[forwcg>0], 0, forwcg[forwcg>0], color = "blue",
                   label = "forward CTCF")
        plt.vlines(xaxis[revcg>0], 0, revcg[revcg>0], color = "green",
                   label = "reverse CTCF")
        plt.xlim([low, high])
        plt.title(fname)
        plt.legend()
        plt.show()

    plt.show()
Пример #9
0
def displayHeatmap():
    """Simulate a contact map in parallel and plot it against Hi-C data.

    ``doSim`` is run through ``fmap`` for seeds 0..29 and adds its counts to
    one shared ``N`` x ``N`` array.  That array is coarse-grained by 20,
    clipped and normalised by its mean row sum; the entries below the
    diagonal are then replaced by the equally normalised Hi-C window before
    the log of the result is drawn.  ``N``, ``low`` and ``high`` are read
    from module scope.
    """
    plt.figure(figsize=(5, 5))
    shared_arr = mp.Array(ctypes.c_double, N**2)
    simMap = tonumpyarray(shared_arr)
    simMap.shape = (N, N)

    def doSim(i):
        """Run one model from ``initModel(i)`` and add its pair counts."""
        flatCounts = tonumpyarray(shared_arr)
        SMCTran = initModel(i)

        N1 = 10000
        snapshots = []
        # The number of sampled snapshots is drawn from [N1 // 2, N1).
        for _ in range(np.random.randint(N1 // 2, N1)):
            SMCTran.steps(150)
            snapshots.append(SMCTran.getSMCs())
        pairs = np.concatenate(snapshots, axis=1)
        # Turn each (row 0, row 1) pair into one flat index.
        flatIndex = pairs[0] * N + pairs[1]
        position, counts = np.unique(flatIndex, return_counts=True)

        # Update the shared buffer only while holding its lock.
        with shared_arr.get_lock():
            flatCounts[position] += counts
        print("Finished!")

    setExceptionHook()

    low20 = low // 10
    high20 = high // 10
    mydict = h5dict(
        "/home/magus/HiC2011/Erez2014/hg19/GM12878_inSitu-all-combined-10k_HighRes.byChr",
        'r')

    hicdata = completeIC(
        mydict.get_dataset("13 13")[low20:high20, low20:high20])
    newshape = (1000 * (high - low)) // (600 * 20)
    print(hicdata.shape, newshape)
    hicdata = zoomArray(hicdata, (newshape, newshape))
    hicdata = np.clip(hicdata, 0, np.percentile(hicdata, 99.99))
    hicdata /= np.mean(np.sum(hicdata, axis=1))

    # n is the number of threads to use.  On a 20-core machine I use 20.
    fmap(doSim, range(30), n=20)

    simMap = coarsegrain(simMap, 20)
    simMap = np.clip(simMap, 0, np.percentile(simMap, 99.9))
    simMap /= np.mean(np.sum(simMap, axis=1))

    # Entries below the diagonal show Hi-C, the rest show the simulation.
    index = np.arange(len(simMap))
    belowDiagonal = index[:, None] > index[None, :]
    simMap[belowDiagonal] = hicdata[belowDiagonal]

    logMap = np.log(simMap + 0.0001)
    plt.imshow(logMap,
               vmax=np.percentile(logMap, 99.9),
               extent=[low, high, high, low],
               interpolation="none")
    nicePlot()
#!/usr/bin/python

import sys
import numpy as np 
from mirnylib.numutils import completeIC

def zero_low_coverage_bins(matrix, fraction):
    """Zero, in place, the rows and columns with the lowest row means.

    The cutoff is the row mean found at position
    ``int(round(n_rows * fraction))`` of the sorted row means.  The position
    is converted to an int (a float index is rejected by current NumPy) and
    clamped to the valid range so ``fraction == 1.0`` does not index past
    the end.  Rows whose mean is strictly below the cutoff are zeroed
    together with the columns of the same index, so the matrix is expected
    to be square.

    Args:
        matrix: 2-D numpy array, modified in place.
        fraction: position of the cutoff as a fraction of the row count.

    Returns:
        The same ``matrix`` object.
    """
    row_means = np.mean(matrix, axis=1)
    if row_means.size == 0:
        return matrix
    rank = int(round(len(row_means) * fraction))
    rank = min(max(rank, 0), len(row_means) - 1)
    cutoff = np.sort(row_means)[rank]
    # A boolean mask selects the same rows/columns as the np.where() tuple
    # did, without relying on a nested-tuple index.
    low = row_means < cutoff
    matrix[low, :] = 0
    matrix[:, low] = 0
    return matrix


def main():
    """Read a matrix, drop low-coverage bins, run completeIC and save it.

    Command line: ``<input matrix> <cutoff fraction> <output file>``.
    """
    if len(sys.argv) < 4:
        sys.exit("usage: %s <input matrix> <cutoff fraction> <output file>"
                 % sys.argv[0])
    inp_file = sys.argv[1]
    percent_cutoff = float(sys.argv[2])
    out_file = sys.argv[3]

    # read matrix
    inp_matrix = np.loadtxt(inp_file)

    # filter matrix by row mean
    zero_low_coverage_bins(inp_matrix, percent_cutoff)

    # IC
    out_matrix = completeIC(inp_matrix)

    # '%.3e' writes every value in scientific notation.
    np.savetxt(out_file, out_matrix, fmt='%.3e')


if __name__ == "__main__":
    main()

Пример #11
0
def run():
    """Command-line entry point: correct each chromosome and write peaks.

    Arguments come from ``getargs()``.  Nothing is done when the first
    command is ``-h`` or ``--help``.  Otherwise the root logger gets a
    console handler (INFO) and a rotating file handler (DEBUG), the library
    at ``args.path`` is loaded with ``np.load``, and every selected
    chromosome is symmetrised, passed through ``completeIC`` and handed to
    ``pcaller``.  One tab-separated line per returned peak is written to
    ``<args.output>.peaks.txt``, with both positions multiplied by
    ``args.resolution``.
    """
    # Parse Arguments
    args, commands = getargs()
    # Improve the performance if you don't want to run it
    if commands[0] in ['-h', '--help']:
        return

    ## Root Logger Configuration
    logger = logging.getLogger()
    # Logger Level
    logger.setLevel(logging.DEBUG)
    console = logging.StreamHandler()
    filehandler = logging.handlers.RotatingFileHandler(args.logFile,
                                                       maxBytes = 100000,
                                                       backupCount = 5)
    # Set level for Handlers
    console.setLevel('INFO')
    filehandler.setLevel('DEBUG')
    # Customizing Formatter
    formatter = logging.Formatter(fmt = '%(name)-14s %(levelname)-7s @ %(asctime)s: %(message)s',
                                  datefmt = '%m/%d/%y %H:%M:%S')

    console.setFormatter(formatter)
    filehandler.setFormatter(formatter)
    # Add Handlers
    logger.addHandler(console)
    logger.addHandler(filehandler)

    ## Logging for argument setting
    arglist = ['# ARGUMENT LIST:',
               '# output file prefix = %s' % args.output,
               '# HiC Data Path = %s' % args.path,
               '# chromosomes = %s' % args.chroms,
               '# data resolution = %s' % args.resolution,
               '# Peak window width = %s' % args.pw,
               '# Donut size = %s' % args.ww,
               '# Maximum donut size = %s' % args.maxww,
               '# Significant Level = %s' % args.siglevel,
               '# Genomic distance range = %s' % [args.ww * args.resolution, args.maxapart]
               ]

    argtxt = '\n'.join(arglist)
    logger.info('\n' + argtxt)

    # Package Dependencies
    from mirnylib.numutils import completeIC

    logger.info('Locating Hi-C data ...')
    # NOTE(review): entries are unwrapped below with .reshape(1)[0] and
    # .toarray(), which looks like pickled sparse matrices; recent NumPy
    # only loads those with allow_pickle=True - confirm against the data.
    Lib = np.load(args.path)

    logger.info('Calling Peaks ...')
    head = '\t'.join(['chromLabel', 'loc_1', 'loc_2', 'IF', 'Fold-Enrichment', 'pvalue', 'qvalue']) + '\n'
    lineFormat = '%s\t%d\t%d\t%.4g\t%.4g\t%.4g\t%.4g\n'

    # Text mode: everything written below is a str, which a binary ('wb')
    # handle rejects on Python 3.  The context manager flushes and closes
    # the file even when a chromosome fails part-way through.
    with open('.'.join([args.output, 'peaks', 'txt']), 'w') as OF:
        OF.write(head)

        for key in Lib.files:
            selected = ((not args.chroms)
                        or (key.isdigit() and '#' in args.chroms)
                        or (key in args.chroms))
            if not selected:
                continue

            logger.info('Chromosome %s ...', key)
            sparseH = Lib[key].reshape(1)[0]
            triuH = sparseH.toarray()
            # Mirror across the diagonal; the diagonal itself would be
            # counted twice, so subtract it once.
            H = triuH + triuH.T - np.diag(triuH.diagonal()) # Symmetric Matrix
            del sparseH, triuH # Release Memory
            logger.info('Perform ICE ...')
            cHeatMap, biases = completeIC(H, returnBias = True)
            logger.info('Done!')

            logger.info('Customize Sparse Matrix ...')
            chromLen = H.shape[0]
            num = args.maxapart // args.resolution + args.maxww + 1
            Diags = [np.diagonal(H, i) for i in np.arange(num)]
            M = sparse.diags(Diags, np.arange(num), format = 'csr')
            x = np.arange(args.ww, num)
            y = []
            cDiags = []
            for i in x:
                diag = np.diagonal(cHeatMap, i)
                y.append(diag.mean())
                cDiags.append(diag)
            cM = sparse.diags(cDiags, x, format = 'csr')
            # Fit the mean corrected signal per diagonal against the offset.
            IR = isotonic.IsotonicRegression(increasing = 'auto')
            IR.fit(x, y)

            del H, cHeatMap

            xpos, ypos, Ovalues, Fold, pvalues, qvalues = pcaller(M, cM, biases, IR, chromLen, Diags, cDiags, num,
                                                                  pw = args.pw, ww = args.ww, sig = args.siglevel,
                                                                  maxww = args.maxww, maxapart = args.maxapart,
                                                                  res = args.resolution)

            # range() replaces the Python-2-only xrange().
            for i in range(xpos.size):
                line = lineFormat % (key, xpos[i] * args.resolution, ypos[i] * args.resolution, Ovalues[i], Fold[i], pvalues[i], qvalues[i])
                OF.write(line)

    logger.info('Done!')
Пример #12
0
filename = "MyFolder/byChromosomeHiCDataset"
# Only Hi-C datasets saved chromosome-by-chromosome work here.

dataset = h5dict(filename, 'r')  # opened read-only

# The chromosome count is taken to be the number of cis keys, i.e. keys of
# the form "a a" whose whitespace-separated parts are all identical.
keys = dataset.keys()
cisKeys = []
for key in keys:
    if len(set(key.split())) == 1:
        cisKeys.append(key)
numChromosomes = len(cisKeys)

for chromosome in range(numChromosomes):

    # Cis heatmap of this chromosome, stored under "<n> <n>".
    cisKey = "{0} {0}".format(chromosome)
    chromosomeHeatmap = dataset[cisKey]

    # Proper Iterative Correction, which accounts for regions with low
    # coverage.  It only works for cis (symmetric) heatmaps.
    correctedHeatmap, bias = completeIC(chromosomeHeatmap, returnBias=True)

    # To look at the log-heatmap, uncomment the two lines below.
    #plt.imshow(np.log(correctedHeatmap))
    #plt.show()
    """
    Your code here
    """

    # Example of saving one chromosome to a text file:
    #np.savetxt("corrected_chr{0}.txt.gz".format(chromosome),correctedHeatmap)  #example of saving chromosomes