Пример #1
0
def averageLoopsWithControl(loopPositions, filename, cg=1, pad=8):
    """Average maps around a set of loops and around their control loops.

    Both results come from ``averageLoops`` called with the same ``filename``
    and ``pad``; the control set is built from ``controlLoops(loopPositions)``.

    Arguments:
    loopPositions: loop positions, forwarded to ``averageLoops`` and
        ``controlLoops``
    filename: forwarded unchanged to ``averageLoops``

    Kwargs:
    cg: coarse-graining factor; when it differs from 1 every map is passed
        through ``coarsegrain`` and divided by ``cg ** 2``
    pad: forwarded to ``averageLoops`` as its ``pad`` keyword

    Returns:
    (loop maps, control maps) -- exactly what ``averageLoops`` returned when
    ``cg == 1``, otherwise two lists of coarse-grained maps
    """
    loop_maps = averageLoops(loopPositions, filename, pad=pad)
    control_maps = averageLoops(controlLoops(loopPositions), filename, pad=pad)

    if cg == 1:
        return loop_maps, control_maps

    def _shrink(maps):
        # multiplying by 1. hands coarsegrain a float-scaled copy of each map
        return [coarsegrain(1. * single, cg) / (cg ** 2) for single in maps]

    return _shrink(loop_maps), _shrink(control_maps)
def plot_with_annotations(arr, title, cmap, peaks, contactDomains, forw, rev, showPlot=True, maxPercentile=99.9, extendFactor=0.10):
    """Draw a matrix with a CTCF site track, guide lines and Hi-C call boxes.

    Arguments:
    arr: matrix shown with ``matshow``
    title: plot title
    cmap: colormap passed to ``matshow``
    peaks: table indexed as ``peaks['x1'][i]`` / ``peaks['y1'][i]``; each row
        is drawn as two unit squares, one per ordering of the two columns
    contactDomains: table indexed as ``contactDomains['x1'][i]`` /
        ``contactDomains['x2'][i]``; each row is drawn as a square outline
    forw: forward array of stall sites
    rev: reverse array of stall sites

    Kwargs:
    showPlot: call ``plt.show(block=False)`` at the end
    maxPercentile: colour scale is capped at this nan-aware percentile of arr
    extendFactor: fraction by which the site arrays were extended on each
        side; that margin is cut off before binning

    The y limits (64, -20), the binning factor (50) and the track position
    (-10, height 9) are fixed values in this function.
    """
    figure, axis = plt.subplots()
    colour_cap = np.nanpercentile(arr, maxPercentile)
    image = axis.matshow(arr, cmap=cmap, vmax=colour_cap)
    axis.set_title(title, y=1.08)
    axis.set_ylim(64, -20)
    figure.colorbar(image, ax=axis)

    # --- CTCF site track -------------------------------------------------
    n_sites = len(forw)
    core_size = int(n_sites / (1 + (extendFactor * 2)))
    margin = int((n_sites - core_size) / 2)

    def _binned_track(track):
        # cut the extension off both ends, bin by 50, cap bin values at 1
        binned = coarsegrain(track[margin:n_sites - margin], 50)
        binned[binned > 1] = 1
        return binned

    forw_binned = _binned_track(forw)
    rev_binned = _binned_track(rev)
    forw_hits = np.where(forw_binned != 0)
    rev_hits = np.where(rev_binned != 0)

    # vertical position of the track baseline and the length of a full bar
    track_base = -10
    track_height = 9
    axis.vlines([forw_hits], ymin=track_base,
                ymax=track_base - (forw_binned[forw_hits] * track_height),
                color='red')
    axis.vlines([rev_hits], ymin=track_base,
                ymax=track_base + (rev_binned[rev_hits] * track_height),
                color='blue')

    # dashed guides: down from the top edge to the diagonal, then rightwards
    axis.vlines([forw_hits], ymin=-20, ymax=[forw_hits],
                color='magenta', lw=0.5, linestyles='dashed')
    axis.vlines([rev_hits], ymin=-20, ymax=[rev_hits],
                color='aqua', lw=0.5, linestyles='dashed')
    axis.hlines([forw_hits], xmax=arr.shape[0] - 1, xmin=[forw_hits],
                color='magenta', lw=0.5, linestyles='dashed')
    axis.hlines([rev_hits], xmax=arr.shape[0] - 1, xmin=[rev_hits],
                color='aqua', lw=0.5, linestyles='dashed')

    # --- domains and peak calls from HiC ---------------------------------
    for row in range(contactDomains.shape[0]):
        left = contactDomains['x1'][row]
        extent = contactDomains['x2'][row] - left
        corner = [left - 0.5, left + extent - 0.5]
        axis.add_patch(patches.Rectangle(corner, width=extent,
                                         height=-1 * extent, fill=False,
                                         edgecolor='palegreen'))

    # every peak is boxed twice: once as (x1, y1) and once as (y1, x1)
    for first, second in (('x1', 'y1'), ('y1', 'x1')):
        for row in range(peaks.shape[0]):
            corner = [peaks[first][row] - 0.5, peaks[second][row] + 0.5]
            axis.add_patch(patches.Rectangle(corner, width=1, height=-1,
                                             fill=False,
                                             edgecolor='palegreen'))

    if showPlot:
        plt.show(block=False)
def do_extruder_position(forw, rev, SEPARATION=200,
                         LIFETIME=300, nsim=10, trim=0, bin_size=0):
    """Run ``nsim`` simulations and return the log extruder positioning map.

    Arguments:
    forw: forward array of stall sites; its length sets the number of
        monomers N
    rev: reverse array of stall sites

    Kwargs:
    SEPARATION, LIFETIME: forwarded unchanged to ``doSim``
    nsim: number of ``doSim`` calls, run one after another
    trim: if > 0, the N x N map is cut down to int(N / (1 + 2 * trim)) bins
        per side, removing the excess from both ends
    bin_size: if > 0, the map is passed through ``coarsegrain`` with this
        factor

    Returns:
    log(arr + 0.0001), where arr is the map clipped to [0, 99.9th
    percentile] and divided by its mean row sum
    """
    n_monomers = len(forw)
    shared_arr = mp.Array(ctypes.c_double, n_monomers ** 2)
    # presumably a numpy view over the shared buffer that doSim fills in
    # -- verify against tonumpyarray / doSim
    occupancy = tonumpyarray(shared_arr)
    occupancy.shape = (n_monomers, n_monomers)

    # simulations run sequentially here; no worker pool is used
    for sim_index in range(nsim):
        doSim(sim_index, n_monomers, SEPARATION, LIFETIME, shared_arr,
              forw, rev)

    # trimming happens before coarse-graining
    if trim > 0:
        print('trimming ' + str(occupancy.shape))
        full = occupancy.shape[0]
        kept = int(full / (1 + (trim * 2)))
        # when an odd number of bins has to go, the extra one is taken
        # from the right-hand side
        cut_left, extra = divmod(full - kept, 2)
        cut_right = cut_left + extra
        window = slice(cut_left, full - cut_right)
        occupancy = occupancy[window, window]
        print('done trimming ' + str(occupancy.shape))

    if bin_size > 0:
        occupancy = coarsegrain(occupancy, bin_size)

    occupancy = np.clip(occupancy, 0, np.percentile(occupancy, 99.9))
    occupancy /= np.mean(np.sum(occupancy, axis=1))
    return np.log(occupancy + 0.0001)
Пример #4
0
def saddlePlot():
    """Show the "GM-all" heatmap with bins ordered by its first eigenvector.

    Loads the 1M dataset, runs the filtering / correction / eigenvector
    steps on it, sorts rows and columns by the first eigenvector, bins the
    result by 60 and plots the log of the mean-normalised map.
    """
    plt.figure(figsize=(3, 3))

    dataset = binnedData(1000000)
    dataset.simpleLoad("../data/GM-all-hg18-1M", "GM-all")
    dataset.removeDiagonal(1)
    dataset.removePoorRegions()
    dataset.removeZeros()
    dataset.fakeCis()
    dataset.iterativeCorrectWithoutSS()
    dataset.doEig()

    # fix the sign so that the first entry of the eigenvector is not positive
    eig = dataset.EIG["GM-all"][:, 0]
    if eig[0] > 0:
        eig = -eig

    # sort rows, then columns, by eigenvector value
    heat = dataset.dataDict["GM-all"]
    order = numpy.argsort(eig)
    sorted_heat = heat[order, :][:, order]

    binned = coarsegrain(sorted_heat, 60)
    binned /= binned.mean()
    binned = numpy.log(binned)

    # replace the single largest entry by the maximum of everything else
    # (and 0), so that one bin does not set the top of the colour scale
    final_shape = binned.shape
    flat = binned.reshape((-1))
    peak = numpy.argmax(flat)
    flat[peak] = 0
    flat[peak] = numpy.max(flat)
    image = flat.reshape(final_shape)

    # overwrite the two off-diagonal corners with their neighbours
    image[0, -1] = image[0, -2]
    image[-1, 0] = image[-2, 0]

    plt.imshow(image, vmin=image.min(), vmax=image.max(),
               interpolation="nearest")
    cbar = plt.colorbar(orientation="vertical")
    cbar.ax.set_xlabel("Log(relative contact probability)", fontsize=6)
    for tick_label in cbar.ax.get_xticklabels():
        tick_label.set_fontsize(6)
    cbar.set_ticks([-0.5, 0, 0.5, 1])
    removeBorder()
    mirnylib.plotting.niceShow()
Пример #5
0
def refine_paper(filename, create=True):
    """filename[0] is a list of filenames of incoming files
    filename[1] is a folder for outgoing file"""
    # Overview: Python 2 code (print statements, cPickle, xrange,
    # StandardError).  The function parses every input file, merges and
    # filters the reads, compares the resulting metadata against the pickled
    # "sampleMetadata" file, and then checks that heatmaps saved in different
    # ways agree with each other; every check is a bare assert.
    #
    # NOTE(review): TR is only bound inside the `if create == True:` branch;
    # with create=False the first use below (TR.printMetadata) raises
    # NameError.
    if create == True:
        for onename in filename[0]:
            #Parsing individual files
            if not os.path.exists(onename):
                raise StandardError("path not found: %s" % onename)
            TR = HiCdataset("bla", genome=genomeFolder, enzymeName="HindIII",maximumMoleculeLength=500, inMemory=True)
            print "\nTesting loading new data without rsite information    "
            TR.parseInputData(dictLike=onename,
                              enzymeToFillRsites="HindIII")
            #assert len(TR.DS) == 856143

            #assert len(TR.ufragments) == 634572
            # each parsed input is saved next to its source file
            TR.save(onename + "_parsed.frag")

        #Merging files alltogether, applying filters
        TR = HiCdataset(filename[1] + "_merged.frag",enzymeName = "HindIII",
                        genome=genomeFolder, mode="w")
        TR.merge([i + "_parsed.frag" for i in filename[0]])

        # a second, in-memory dataset is created and loaded from the merged
        # file; all filters below run on this one
        TR = HiCdataset("refined", genome=genomeFolder,enzymeName = "HindIII",
                        mode="w", inMemory=True)

        print "\nTesting chunking during all tests"
        TR.chunksize = 30000
        #because we do many operations, we disable autoFlush here
        TR.load(filename[1] + "_merged.frag")

        print "\nTesting Rsite filter"
        TR.filterRsiteStart(offset=5)

        #assert len(TR.DS) == 832110

        print "\nTesting duplicate filter"
        TR.filterDuplicates(chunkSize = 30000)        

        #assert len(TR.DS) == 830275

        print "\nTesting small/large and extreme fragment filter"
        TR.filterLarge()

        #assert len(TR.DS) == 825442
        TR.filterExtreme(cutH=0.005, cutL=0)
        TR.writeFilteringStats()

        #assert len(TR.DS) == 803845


    #-------------------------------------------
    # Metadata check: every key of the pickled reference must match
    # TR.metadata; all mismatches are printed before the ValueError is raised.
    TR.printMetadata(saveTo="metadata")
    import cPickle

    stop = False
    # NOTE(review): the file object returned by open() is never closed
    # explicitly.
    mdata = cPickle.load(open("sampleMetadata"))
    for i in sorted(mdata.keys()):
        if TR.metadata[i] != mdata[i]:
            print "Key {0} is not consistent: should be {1}, is {2}".format(i, mdata[i], TR.metadata[i])
            stop = True
    if stop == True:
        print ("""------------_ERROR_--------------
        Inconsistent metadata: see above
        ----------------------------------------""")
        raise ValueError("Inconsistent Metadata")


    print "Testing allxall and by-chromosome heatmap counting diagonal twice"

    print "----> saving allxall heatmap"
    TR.saveHeatmap(filename[1] + "-1M.hm", 1000000,
                   countDiagonalReads="twice")
    a = h5dict(filename[1] + "-1M.hm")
    # cut the blocks for chromosome index 1 (cis) and 1 x 2 (trans) out of
    # the all-by-all heatmap
    st, end = TR.genome.chrmStartsBinCont[1], TR.genome.chrmEndsBinCont[1]
    st2, end2 = TR.genome.chrmStartsBinCont[2], TR.genome.chrmEndsBinCont[2]
    chrom1 = a["heatmap"][st:end, st:end]
    chrom12 = a["heatmap"][st:end, st2:end2]
    setExceptionHook()
    print "----> saving by chromosome heatmap"
    # NOTE(review): this writes to the same path that `a` was opened from
    # above -- confirm a["heatmap"] is still readable when it is printed
    # further down.
    TR.saveByChromosomeHeatmap(
        filename[1] + "-1M.hm", resolution=1000000, includeTrans=True,
        countDiagonalReads="twice")

    b = h5dict(filename[1] + "-1M.hm")["1 1"]
    bb = h5dict(filename[1] + "-1M.hm")["1 2"]
    # the comparison is on the sum of differences, not element by element
    assert (b - chrom1).sum() == 0
    print "Cis heatmap consistent"
    assert (bb - chrom12).sum() == 0
    print 'Trans heatmap consistent'
    print  a["heatmap"][::10, ::10].sum()
    #assert  a["heatmap"][::10, ::10].sum() == 21800
    print "Heatmap sum correct\n"

    #---------------------------------
    print "Testing allxall and by-chromosome heatmap counting diagonal once"

    TR.saveHeatmap(filename[1] + "-1M.hm", 1000000,
                   countDiagonalReads="once")
    Ta = h5dict(filename[1] + "-1M.hm")
    st, end = TR.genome.chrmStartsBinCont[1], TR.genome.chrmEndsBinCont[1]
    st2, end2 = TR.genome.chrmStartsBinCont[2], TR.genome.chrmEndsBinCont[2]
    chrom1 = Ta["heatmap"][st:end, st:end]
    chrom12 = Ta["heatmap"][st:end, st2:end2]
    setExceptionHook()
    print "----> saving by chromosome heatmap"
    TR.saveByChromosomeHeatmap(
        filename[1] + "-1M-byChr.hm", resolution=1000000, includeTrans=True,
        countDiagonalReads="once")
    
    TR.saveHiResHeatmapWithOverlaps(filename[1]+"-1M-highRes.hm", resolution=50000, countDiagonalReads="twice")
    TR.saveSuperHighResMapWithOverlaps(filename[1]+"-5k-SuperHighRes.hm", resolution=5000,chromosomes = [14], countDiagonalReads="twice")

    Tb = h5dict(filename[1] + "-1M-byChr.hm")["1 1"]
    Tbb = h5dict(filename[1] + "-1M-byChr.hm")["1 2"]
    assert ((Tb - chrom1) == 0).all()
    assert ((Tbb - chrom12) == 0).all()
    # adding the diagonal of the "once" map to itself must reproduce the
    # "twice" map b loaded earlier
    assert ((Tb + np.diag(np.diag(Tb))) == b).all()
    print "Diagonal counting methods are consistent\n"
    
    # double the diagonal of the "once" block so it can be compared with the
    # high-resolution map, which was saved with countDiagonalReads="twice"
    newchrom1 = chrom1.copy()
    for i in xrange(len(newchrom1)):
        newchrom1[i,i] = 2 * newchrom1[i,i]
    
    Tb = h5dict(filename[1] + "-1M-highRes.hm")["1 1"]
    # the two maps only have to agree within a tolerance (total < 1,
    # summed absolute difference after binning by 20 < 500)
    assert np.abs(Tb.sum() - newchrom1.sum()) < 1
    assert np.sum(np.abs(coarsegrain(Tb,20,True) - newchrom1)) < 500
    

    #------------------------------
    print "Testing updateGenome method"
    from mirnylib.genome import Genome
    # flags positions 1-4 and 22; presumably the chromosome indices of
    # "2", "3", "4", "5" and "X" in the current genome -- verify
    removeChromIDs = np.array([0, 1, 1, 1, 1] + [0] * 17 + [1] + [0])
    #print ((removeChromIDs[TR.chrms1] == 1) + (removeChromIDs[TR.chrms2] == 1) ).sum()
    # t = reads with both sides on flagged chromosomes, plus reads whose
    # first side is flagged and whose chrms2 entry is -1
    t = ((removeChromIDs[TR.chrms1] == 1) * (removeChromIDs[TR.chrms2] == 1)).sum() + ((removeChromIDs[TR.chrms1] == 1) * (TR.chrms2 == -1)).sum()
    newGenome = Genome(genomePath=genomeFolder, readChrms=["2",
                                                           "3", "4", "5", "X"])
    TR.updateGenome(newGenome)
    assert  TR.N == t

    # `a` is rebound here but not used again in the visible part of the
    # function
    a = h5dict(filename[1] + "-1M.hm")["heatmap"]
Пример #6
0
def showCmapNew():
    """Show simulated contact maps stacked on Hi-C data, with a CTCF track.

    The Hi-C reference is the "13 13" dataset of the 10k by-chromosome file,
    cut to the low..high window, passed through ``completeIC``, resized with
    ``zoomArray``, clipped at its 99.99th percentile and divided by its mean
    row sum.

    For every contact-map pickle in the file list the function
      * reads ``mu`` from the "Mu<number>_r=" part of the file name and gets
        the forward / reverse site arrays from ``getForwBacv(mu)``,
      * skips maps whose size differs from the resized Hi-C map,
      * shows the pivoted simulated map (flipped, x3) on top of the pivoted
        Hi-C map (x1.5) on a log colour scale,
      * draws the coarse-grained forward / reverse site arrays as vertical
        bars in a second panel that shares the x axis,
      * calls ``plt.show()``.

    Nothing is written to disk.
    """
    plt.figure(figsize=(8, 8))
    low = 60000
    high = 75000

    low20 = low // 10
    high20 = high // 10
    mydict = h5dict("/home/magus/HiC2011/Erez2014/hg19/GM12878_inSitu-all-combined-10k_HighRes.byChr", 'r')

    hicdata = mydict.get_dataset("13 13")[low20:high20, low20:high20]
    hicdata = completeIC(hicdata)
    resolutionMon = 5
    newshape = (1000 * (high - low)) // (600 * resolutionMon)
    print(hicdata.shape, newshape)
    hicdata = zoomArray(hicdata, (newshape, newshape))
    hicdata = np.clip(hicdata, 0, np.percentile(hicdata, 99.99))
    hicdata /= np.mean(np.sum(hicdata, axis=1))

    # Scale once into a separate array: multiplying hicdata in place inside
    # the loop would compound the factor for every additional file.
    hicdata_scaled = hicdata * 1.5

    for fname in ["cmapflagshipLifetime300Mu3_r=8.pkl"]:
        if ("r=8" not in fname) or ("Lifetime" not in fname):
            print("not going", fname)
            continue
        try:
            mu = float(fname.split("_r=")[0].split("Mu")[1])
        except (IndexError, ValueError):
            # the name has no parsable "Mu<number>" part; skip this file
            continue
        forw, rev = getForwBacv(mu)

        # NOTE: pickle.load executes arbitrary code from the file; only use
        # it on contact maps produced locally.
        with open(os.path.join("cmaps", fname), 'rb') as cmap_file:
            arr = pickle.load(cmap_file)

        if arr.shape[0] != hicdata.shape[0]:
            continue
        arr = arr / np.mean(np.sum(arr, axis=1))
        diags = 1000
        print(arr.shape)

        ax = plt.subplot(211)
        turned = pivotHeatmap(arr, diags)[::-1] * 3
        turned2 = pivotHeatmap(hicdata_scaled, diags)
        turned = np.concatenate([turned, turned2], axis=0)
        half_height = (high - low) * diags / len(arr)
        myextent = [low, high, -half_height, half_height]
        # "fall" is not a stock matplotlib colormap name; presumably it is
        # registered elsewhere in the project -- verify
        plt.imshow(np.log(turned + 0.0001), aspect=0.5, cmap="fall",
                   vmax=-4, vmin=-8, extent=myextent, interpolation="none")

        plt.subplot(413, sharex=ax)
        # one x position per 20-site bin; the step of 12 is presumably
        # 20 monomers x 600 bp expressed in kb -- TODO confirm
        xaxis = np.arange(len(forw) // 20) * 12 + low
        forwcg = coarsegrain(forw, 20)
        revcg = coarsegrain(rev, 20)
        # labels give plt.legend() something to show
        plt.vlines(xaxis[forwcg > 0], 0, forwcg[forwcg > 0], color="blue",
                   label="forward CTCF")
        plt.vlines(xaxis[revcg > 0], 0, revcg[revcg > 0], color="green",
                   label="reverse CTCF")
        plt.xlim([low, high])
        plt.title(fname)
        plt.legend()
        plt.show()

    plt.show()
Пример #7
0
def displayHeatmap():
    """Plot simulated extruder contacts against Hi-C data in one heatmap.

    Reads the module-level names ``N``, ``low`` and ``high``.  Thirty
    simulations are dispatched through ``fmap`` and add their counts to a
    shared N x N array.  The binned, clipped and normalised result is shown
    on a log scale, with every entry below the diagonal replaced by the
    correspondingly processed Hi-C value.
    """
    plt.figure(figsize=(5, 5))
    shared_arr = mp.Array(ctypes.c_double, N**2)
    arr = tonumpyarray(shared_arr)
    arr.shape = (N, N)

    def doSim(i):
        # flat (un-reshaped) array obtained from the same shared buffer
        flat_counts = tonumpyarray(shared_arr)
        model = initModel(i)

        # take a random number (5000..9999) of snapshots, 150 steps apart
        snapshots = []
        upper = 10000
        for _ in range(np.random.randint(upper // 2, upper)):
            model.steps(150)
            snapshots.append(model.getSMCs())
        pairs = np.concatenate(snapshots, axis=1)

        # fold each (row, column) pair into one index of the flat array
        flat_index = pairs[0] * N + pairs[1]
        where, how_many = np.unique(flat_index, return_counts=True)

        with shared_arr.get_lock():
            flat_counts[where] += how_many
        print("Finished!")

    setExceptionHook()

    lo_bin, hi_bin = low // 10, high // 10
    store = h5dict(
        "/home/magus/HiC2011/Erez2014/hg19/GM12878_inSitu-all-combined-10k_HighRes.byChr",
        'r')

    hic = store.get_dataset("13 13")[lo_bin:hi_bin, lo_bin:hi_bin]
    hic = completeIC(hic)
    target = (1000 * (high - low)) // (600 * 20)
    print(hic.shape, target)
    hic = zoomArray(hic, (target, target))
    hic = np.clip(hic, 0, np.percentile(hic, 99.99))
    hic /= np.mean(np.sum(hic, axis=1))

    # 30 simulations spread over 20 workers
    fmap(doSim, range(30), n=20)

    sim = coarsegrain(arr, 20)
    sim = np.clip(sim, 0, np.percentile(sim, 99.9))
    sim /= np.mean(np.sum(sim, axis=1))

    # entries with row index > column index come from the Hi-C map
    idx = np.arange(len(sim))
    below_diagonal = idx[:, None] > idx[None, :]
    sim[below_diagonal] = hic[below_diagonal]

    log_sim = np.log(sim + 0.0001)
    plt.imshow(log_sim,
               vmax=np.percentile(log_sim, 99.9),
               extent=[low, high, high, low],
               interpolation="none")
    nicePlot()
Пример #8
0
def plot_logarr_sites(logarr,
                      forw,
                      rev,
                      title='',
                      cmap='viridis',
                      max_percentile=99.9,
                      extend_factor=0.10,
                      coarsegrain_factor=50,
                      plot_CTCF_lines=True,
                      save_plot=None):
    '''Plot the log transformed array of extrusion occupancy with a
    track of the stall sites above the plot.

    The matrix is symmetrized from its upper triangle before plotting.
    This is done on a copy, so the caller's ``logarr`` is left unchanged.

    Arguments:
    logarr: log extrusion occupancy matrix (square numpy array)
    forw: forward array of stall sites
    rev: reverse array of stall sites

    Kwargs:
    title: plot title
    cmap: plot colormap
    max_percentile: threshold colormap at this percentile
    extend_factor: for simulation, how much did the region get extended
        on each side; that margin is cut off the site arrays
    coarsegrain_factor: binning size applied to the site arrays
    plot_CTCF_lines: plot lines on top of the matrix for each CTCF site
    save_plot: if not None, the plot is saved to this file instead of shown
    '''

    # CTCF site information
    # trim and coarsegrain
    trim = extend_factor
    npoints = len(forw)
    orig_size = int(npoints / (1 + (trim * 2)))
    remove_each = int((npoints - orig_size) / 2)
    forwTrim = forw[remove_each:npoints - remove_each]
    forwTrimC = coarsegrain(forwTrim, coarsegrain_factor)
    revTrim = rev[remove_each:npoints - remove_each]
    revTrimC = coarsegrain(revTrim, coarsegrain_factor)
    # bins holding several sites are capped at 1
    forwTrimC[forwTrimC > 1] = 1
    revTrimC[revTrimC > 1] = 1
    forw_sites = np.where(forwTrimC != 0)
    rev_sites = np.where(revTrimC != 0)

    # init plot
    fig, ax0 = plt.subplots(ncols=1)

    # colour limits are taken from the matrix as it was passed in
    use_vmax = np.nanpercentile(logarr, max_percentile)
    use_vmin = np.nanmin(logarr)

    # symmetrize: mirror the upper triangle into the lower one.
    # transpose() returns a view, so the copy is what keeps the writes below
    # from landing in the caller's array.
    n_bins = logarr.shape[0]
    upper = np.triu_indices(n_bins)
    symmetric = logarr.transpose().copy()
    symmetric[upper] = logarr[upper]

    # set up logarr plot
    im0 = ax0.matshow(symmetric, cmap=cmap, vmax=use_vmax, vmin=use_vmin)
    ax0.set_title(title, y=1.08)
    # ctcf site plot should extend 1/4 beyond
    ctcf_plot_size = int(n_bins * 0.25)
    ax0.set_ylim(n_bins - 1, -1 * ctcf_plot_size)
    fig.colorbar(im0, ax=ax0)
    # where on the chart we're plotting
    siteStart = -1 * (ctcf_plot_size / 2)
    siteExpand = (ctcf_plot_size / 2) * 0.9
    # add lines for CTCF sites
    ax0.vlines([forw_sites],
               ymin=siteStart,
               ymax=siteStart - (forwTrimC[forw_sites] * siteExpand),
               color='red')
    ax0.vlines([rev_sites],
               ymin=siteStart,
               ymax=siteStart + (revTrimC[rev_sites] * siteExpand),
               color='blue')
    if plot_CTCF_lines:
        # add lines all the way down the plot
        # NOTE(review): the guides start at a fixed y of -20 while the top of
        # the axis is -ctcf_plot_size; confirm this is intended for matrices
        # of other sizes.
        ax0.vlines([forw_sites],
                   ymin=-20,
                   ymax=[forw_sites],
                   color='magenta',
                   lw=0.5,
                   linestyles='dashed')
        ax0.vlines([rev_sites],
                   ymin=-20,
                   ymax=[rev_sites],
                   color='aqua',
                   lw=0.5,
                   linestyles='dashed')
        ax0.hlines([forw_sites],
                   xmax=n_bins - 1,
                   xmin=[forw_sites],
                   color='magenta',
                   lw=0.5,
                   linestyles='dashed')
        ax0.hlines([rev_sites],
                   xmax=n_bins - 1,
                   xmin=[rev_sites],
                   color='aqua',
                   lw=0.5,
                   linestyles='dashed')

    plt.tight_layout()
    if save_plot is not None:
        plt.savefig(save_plot)
    else:
        plt.show(block=False)
# Script fragment: ax0, logarr, arrDI, extendFactor, forw, rev, arr and
# manyContactAv are defined outside the visible part of this script.
# It shows the log occupancy matrix with the arrDI profile and the stall-site
# track drawn in the strip above the matrix.
im0 = ax0.matshow(logarr, vmax = np.percentile(logarr, 99.9))
# arrDI profile, drawn upwards from a baseline at y = -10 (the y axis is
# inverted by the set_ylim call below)
ax0.plot([i for i in range(len(arrDI))], [(i*-1)-10 for i in arrDI], lw=0.75)
# dashed baseline at y = -10
ax0.plot([i for i in range(len(arrDI))], [-10 for i in arrDI], ls='--', color='black', lw=0.5)
# fig.colorbar(im0, ax=ax0)
ax0.set_title('log extrusion occupancy', y=1.08)
# ax0.vlines([i+0.5 for i in range(65)], ymin=-20, ymax=64.5, linestyles='dashed', lw=0.5, color='grey')
# fig.colorbar(im0, ax=ax0)
ax0.set_ylim(64,-20)

# can also plot some ctcf stuff
# cut the extension (extendFactor on each side) off the site arrays and bin
# them by 50
trim = extendFactor
npoints = len(forw)
orig_size = int(npoints / (1 + (trim * 2)))
remove_each = int((npoints - orig_size) / 2)
forwTrim = forw[remove_each:npoints - remove_each]
forwTrimC = coarsegrain(forwTrim, 50)
revTrim = rev[remove_each:npoints - remove_each]
revTrimC = coarsegrain(revTrim, 50)

# baseline of the site track and the length of a bar for a bin value of 1
siteStart = -10 
siteExpand = 9
# forward sites are drawn in red towards more negative y, reverse sites in
# blue towards more positive y
ax0.vlines([np.where(forwTrimC != 0)], ymin=siteStart, ymax=siteStart - (forwTrimC[np.where(forwTrimC != 0)] * siteExpand), color='red')
ax0.vlines([np.where(revTrimC != 0)], ymin=siteStart, ymax=siteStart + (revTrimC[np.where(revTrimC != 0)] * siteExpand), color='blue')
plt.show(block=False)

# the last argument is a path prefix on the author's machine
doDiCompare(logarr, arr, manyContactAv,'/home/ben/ab_local/directional_model/extrusion_fitting/diCompare/diSites_d')

# we can just export this as a file to use with the normal simulation code on Windows
saveDf = pd.DataFrame({'chrom'      : 'chr21', 
                       'start'      : binStarts[contactDIScaled != 0],
                       'end'        : binEnds[contactDIScaled != 0],
Пример #10
0
def refine_paper(filename, create=True):
    """filename[0] is a list of filenames of incoming files
    filename[1] is a folder for outgoing file"""
    # Overview: Python 2 code (print statements, cPickle, xrange,
    # StandardError).  The function parses every input file, merges and
    # filters the reads, compares the resulting metadata against the pickled
    # "sampleMetadata" file, and then checks that heatmaps saved in different
    # ways agree with each other; every check is a bare assert.
    #
    # NOTE(review): TR is only bound inside the `if create == True:` branch;
    # with create=False the first use below (TR.printMetadata) raises
    # NameError.
    if create == True:
        for onename in filename[0]:
            #Parsing individual files
            if not os.path.exists(onename):
                raise StandardError("path not found: %s" % onename)
            TR = HiCdataset("bla",
                            genome=genomeFolder,
                            enzymeName="HindIII",
                            maximumMoleculeLength=500,
                            inMemory=True)
            print "\nTesting loading new data without rsite information    "
            TR.parseInputData(dictLike=onename, enzymeToFillRsites="HindIII")
            #assert len(TR.DS) == 856143

            #assert len(TR.ufragments) == 634572
            # each parsed input is saved next to its source file
            TR.save(onename + "_parsed.frag")

        #Merging files alltogether, applying filters
        TR = HiCdataset(filename[1] + "_merged.frag",
                        enzymeName="HindIII",
                        genome=genomeFolder,
                        mode="w")
        TR.merge([i + "_parsed.frag" for i in filename[0]])

        # a second, in-memory dataset is created and loaded from the merged
        # file; all filters below run on this one
        TR = HiCdataset("refined",
                        genome=genomeFolder,
                        enzymeName="HindIII",
                        mode="w",
                        inMemory=True)

        print "\nTesting chunking during all tests"
        TR.chunksize = 30000
        #because we do many operations, we disable autoFlush here
        TR.load(filename[1] + "_merged.frag")

        print "\nTesting Rsite filter"
        TR.filterRsiteStart(offset=5)

        #assert len(TR.DS) == 832110

        print "\nTesting duplicate filter"
        TR.filterDuplicates(chunkSize=30000)

        #assert len(TR.DS) == 830275

        print "\nTesting small/large and extreme fragment filter"
        TR.filterLarge()

        #assert len(TR.DS) == 825442
        TR.filterExtreme(cutH=0.005, cutL=0)
        TR.writeFilteringStats()

        #assert len(TR.DS) == 803845

    #-------------------------------------------
    # Metadata check: every key of the pickled reference must match
    # TR.metadata; all mismatches are printed before the ValueError is raised.
    TR.printMetadata(saveTo="metadata")
    import cPickle

    stop = False
    # NOTE(review): the file object returned by open() is never closed
    # explicitly.
    mdata = cPickle.load(open("sampleMetadata"))
    for i in sorted(mdata.keys()):
        if TR.metadata[i] != mdata[i]:
            print "Key {0} is not consistent: should be {1}, is {2}".format(
                i, mdata[i], TR.metadata[i])
            stop = True
    if stop == True:
        print("""------------_ERROR_--------------
        Inconsistent metadata: see above
        ----------------------------------------""")
        raise ValueError("Inconsistent Metadata")

    print "Testing allxall and by-chromosome heatmap counting diagonal twice"

    print "----> saving allxall heatmap"
    TR.saveHeatmap(filename[1] + "-1M.hm", 1000000, countDiagonalReads="twice")
    a = h5dict(filename[1] + "-1M.hm")
    # cut the blocks for chromosome index 1 (cis) and 1 x 2 (trans) out of
    # the all-by-all heatmap
    st, end = TR.genome.chrmStartsBinCont[1], TR.genome.chrmEndsBinCont[1]
    st2, end2 = TR.genome.chrmStartsBinCont[2], TR.genome.chrmEndsBinCont[2]
    chrom1 = a["heatmap"][st:end, st:end]
    chrom12 = a["heatmap"][st:end, st2:end2]
    setExceptionHook()
    print "----> saving by chromosome heatmap"
    # NOTE(review): this writes to the same path that `a` was opened from
    # above -- confirm a["heatmap"] is still readable when it is printed
    # further down.
    TR.saveByChromosomeHeatmap(filename[1] + "-1M.hm",
                               resolution=1000000,
                               includeTrans=True,
                               countDiagonalReads="twice")

    b = h5dict(filename[1] + "-1M.hm")["1 1"]
    bb = h5dict(filename[1] + "-1M.hm")["1 2"]
    # the comparison is on the sum of differences, not element by element
    assert (b - chrom1).sum() == 0
    print "Cis heatmap consistent"
    assert (bb - chrom12).sum() == 0
    print 'Trans heatmap consistent'
    print a["heatmap"][::10, ::10].sum()
    #assert  a["heatmap"][::10, ::10].sum() == 21800
    print "Heatmap sum correct\n"

    #---------------------------------
    print "Testing allxall and by-chromosome heatmap counting diagonal once"

    TR.saveHeatmap(filename[1] + "-1M.hm", 1000000, countDiagonalReads="once")
    Ta = h5dict(filename[1] + "-1M.hm")
    st, end = TR.genome.chrmStartsBinCont[1], TR.genome.chrmEndsBinCont[1]
    st2, end2 = TR.genome.chrmStartsBinCont[2], TR.genome.chrmEndsBinCont[2]
    chrom1 = Ta["heatmap"][st:end, st:end]
    chrom12 = Ta["heatmap"][st:end, st2:end2]
    setExceptionHook()
    print "----> saving by chromosome heatmap"
    TR.saveByChromosomeHeatmap(filename[1] + "-1M-byChr.hm",
                               resolution=1000000,
                               includeTrans=True,
                               countDiagonalReads="once")

    TR.saveHiResHeatmapWithOverlaps(filename[1] + "-1M-highRes.hm",
                                    resolution=50000,
                                    countDiagonalReads="twice")
    TR.saveSuperHighResMapWithOverlaps(filename[1] + "-5k-SuperHighRes.hm",
                                       resolution=5000,
                                       chromosomes=[14],
                                       countDiagonalReads="twice")

    Tb = h5dict(filename[1] + "-1M-byChr.hm")["1 1"]
    Tbb = h5dict(filename[1] + "-1M-byChr.hm")["1 2"]
    assert ((Tb - chrom1) == 0).all()
    assert ((Tbb - chrom12) == 0).all()
    # adding the diagonal of the "once" map to itself must reproduce the
    # "twice" map b loaded earlier
    assert ((Tb + np.diag(np.diag(Tb))) == b).all()
    print "Diagonal counting methods are consistent\n"

    # double the diagonal of the "once" block so it can be compared with the
    # high-resolution map, which was saved with countDiagonalReads="twice"
    newchrom1 = chrom1.copy()
    for i in xrange(len(newchrom1)):
        newchrom1[i, i] = 2 * newchrom1[i, i]

    Tb = h5dict(filename[1] + "-1M-highRes.hm")["1 1"]
    # the two maps only have to agree within a tolerance (total < 1,
    # summed absolute difference after binning by 20 < 500)
    assert np.abs(Tb.sum() - newchrom1.sum()) < 1
    assert np.sum(np.abs(coarsegrain(Tb, 20, True) - newchrom1)) < 500

    #------------------------------
    print "Testing updateGenome method"
    from mirnylib.genome import Genome
    # flags positions 1-4 and 22; presumably the chromosome indices of
    # "2", "3", "4", "5" and "X" in the current genome -- verify
    removeChromIDs = np.array([0, 1, 1, 1, 1] + [0] * 17 + [1] + [0])
    #print ((removeChromIDs[TR.chrms1] == 1) + (removeChromIDs[TR.chrms2] == 1) ).sum()
    # t = reads with both sides on flagged chromosomes, plus reads whose
    # first side is flagged and whose chrms2 entry is -1
    t = ((removeChromIDs[TR.chrms1] == 1) *
         (removeChromIDs[TR.chrms2] == 1)).sum() + (
             (removeChromIDs[TR.chrms1] == 1) * (TR.chrms2 == -1)).sum()
    newGenome = Genome(genomePath=genomeFolder,
                       readChrms=["2", "3", "4", "5", "X"])
    TR.updateGenome(newGenome)
    assert TR.N == t

    # `a` is rebound here but not used again in the visible part of the
    # function
    a = h5dict(filename[1] + "-1M.hm")["heatmap"]