Пример #1
0
    def modelNFR(self, boundary=115):
        """Split the observed fragment sizes into NFR and nucleosome parts.

        An exponential decay is fitted to the observed sizes in the window
        that starts 10 sizes after the peak found below ``boundary`` and ends
        at ``boundary``.  Two attributes are set on the instance:

        * ``self.nfr_fit`` -- observed values below ``boundary`` followed by
          the fitted exponential from ``boundary`` up to ``self.upper``.
        * ``self.nuc_fit`` -- zero below ``boundary`` and observed minus
          ``nfr_fit`` from ``boundary`` on, with non-positive entries replaced
          by a small positive floor.

        boundary -- fragment size at which the observed values stop being
                    used for the NFR model and the fitted curve takes over.
        """
        observed = self.fragmentsizes
        below = observed.get(self.lower, boundary)
        # First occurrence of the largest value below the boundary; the fit
        # window begins 10 sizes after it.
        fit_start = np.where(below == max(below))[0][0] + 10 + self.lower

        def exp_pdf(x, rate, scale):
            # Scaled exponential density with its origin at fit_start.
            return scale * rate * np.exp(-rate * (x - fit_start))

        coeff, _ = optimize.curve_fit(exp_pdf,
                                      np.arange(fit_start, boundary),
                                      observed.get(fit_start, boundary),
                                      p0=(.1, 1))
        extrapolated = exp_pdf(np.arange(boundary, self.upper), *coeff)
        nfr = np.concatenate((below, extrapolated))
        # Exact zeros are replaced by 1% of the smallest non-zero value.
        smallest_nonzero = min(nfr[nfr != 0])
        nfr[nfr == 0] = smallest_nonzero * 0.01
        self.nfr_fit = FragmentSizes(self.lower, self.upper, vals=nfr)

        residual = (observed.get(boundary, self.upper) -
                    self.nfr_fit.get(boundary, self.upper))
        nuc = np.concatenate((np.zeros(boundary - self.lower), residual))
        # Non-positive entries get a floor well below every other value.
        floor = min(min(nfr) * 0.1, min(nuc[nuc > 0]) * 0.001)
        nuc[nuc <= 0] = floor
        self.nuc_fit = FragmentSizes(self.lower, self.upper, vals=nuc)
Пример #2
0
 def test_normByInsertDist(self):
     """Check one matrix entry after normalising by the insert size distribution.

     The expected value is exp(bias 50 positions before the matrix start +
     bias 50 positions after it) multiplied by the insert size value for
     size 101; the matrix entry [1, 0] must be within 1% of it.
     """
     lower, upper = 100, 200
     isizes = FragmentSizes(lower=lower, upper=upper,
                            vals=np.array(range(lower, upper)))
     self.biasmat.normByInsertDist(isizes)
     start = self.biasmat.start
     left_bias = self.biastrack.get(pos=start - 50)
     right_bias = self.biastrack.get(pos=start + 50)
     expected = np.exp(left_bias + right_bias) * isizes.get(size=101)
     deviation = abs(expected - self.biasmat.mat[1, 0])
     self.assertTrue(deviation < 0.01 * expected)
Пример #3
0
 def test_normByInsertDist(self):
     """Normalising by insert sizes must scale entry [1, 0] as expected (1% tolerance)."""
     size_values = np.array(range(100, 200))
     isizes = FragmentSizes(lower=100, upper=200, vals=size_values)
     self.biasmat.normByInsertDist(isizes)

     # Bias track values 50 positions on either side of the matrix start.
     origin = self.biasmat.start
     bias_sum = (self.biastrack.get(pos=origin - 50) +
                 self.biastrack.get(pos=origin + 50))
     correct = np.exp(bias_sum) * isizes.get(size=101)

     observed = self.biasmat.mat[1, 0]
     tolerance = 0.01 * correct
     self.assertTrue(abs(correct - observed) < tolerance)
Пример #4
0
 def setUp(self):
     """Build a three-bin fragment mixture and the occupancy parameters used by the tests."""
     lower, upper = 0, 3
     mixture = FragmentMixDistribution(lower, upper)
     # Hand-written fits: NFR mass sits on the first bin, nucleosome mass on
     # the last one.
     mixture.nfr_fit = FragmentSizes(lower, upper,
                                     vals=np.array([0.5, 0.49, 0.01]))
     mixture.nuc_fit = FragmentSizes(lower, upper,
                                     vals=np.array([0.01, 0.49, 0.5]))
     self.fragment_dist = mixture
     self.params = OccupancyCalcParams(lower, upper, mixture)
Пример #5
0
def run_diff(args, bases=500000):
    """Run differential occupancy calling.

    Reads chromosome sizes from ``args.bam`` and the PWM from ``args.pwm``,
    loads and merges the regions in ``args.bed``, models the NFR
    distribution for the two fragment size files ``args.sizes1`` and
    ``args.sizes2``, then maps ``_occHelper`` over the region sets in a
    worker pool and forwards element 1 of every result to a queue handed to
    a separate ``_writeDiff`` process.  Finally ``<args.out>.occdiff.bed``
    is compressed and indexed with tabix and the uncompressed file removed.

    args  -- parsed command line options; attributes read here: bam, pwm,
             bed, flank, upper, sizes1, sizes2, fasta, nuc_sep, min_occ,
             confidence_interval, cores, out.
    bases -- passed to ``chunks.split`` to size each set of regions.

    NOTE(review): ``fragment_dist`` passed to ``OccupancyParameters`` is not
    assigned in this function (only ``fragment_dist1``/``fragment_dist2``
    are); unless that name exists at module level this raises NameError --
    confirm which distribution was intended.
    """
    chrs = read_chrom_sizes_from_bam(args.bam)
    pwm = PWM.open(args.pwm)
    # NOTE(review): ``args.upper / 2`` is a float under Python 3 -- confirm
    # that ChunkList.read accepts a non-integer min_offset.
    chunks = ChunkList.read(args.bed,
                            chromDict=chrs,
                            min_offset=args.flank + args.upper / 2 +
                            max(pwm.up, pwm.down))
    chunks.merge()
    # NOTE(review): maxQueueSize is computed but never used below.
    maxQueueSize = max(
        2, int(100 * bases / np.mean([chunk.length() for chunk in chunks])))
    # Load both fragment size distributions, restricted to [0, args.upper),
    # and fit the NFR model on each.
    fragment_dist1 = FragmentMixDistribution(0, upper=args.upper)
    fragment_dist1.fragmentsizes = FragmentSizes(
        0, args.upper, vals=FragmentSizes.open(args.sizes1).get(0, args.upper))
    fragment_dist1.modelNFR()
    fragment_dist2 = FragmentMixDistribution(0, upper=args.upper)
    fragment_dist2.fragmentsizes = FragmentSizes(
        0, args.upper, vals=FragmentSizes.open(args.sizes2).get(0, args.upper))
    fragment_dist2.modelNFR()
    # NOTE(review): ``fragment_dist`` is undefined here (see docstring).
    params = OccupancyParameters(fragment_dist,
                                 args.upper,
                                 args.fasta,
                                 args.pwm,
                                 sep=args.nuc_sep,
                                 min_occ=args.min_occ,
                                 flank=args.flank,
                                 bam=args.bam,
                                 ci=args.confidence_interval)
    sets = chunks.split(bases=bases)
    # One core is left free; at least one worker is always created.
    pool1 = mp.Pool(processes=max(1, args.cores - 1))
    # Create (or truncate) the output file before the writer process starts.
    diff_handle = open(args.out + '.occdiff.bed', 'w')
    diff_handle.close()
    diff_queue = mp.JoinableQueue()
    diff_process = mp.Process(target=_writeDiff, args=(diff_queue, args.out))
    diff_process.start()
    # NOTE(review): nuc_dist is allocated but never used in this function.
    nuc_dist = np.zeros(args.upper)
    for j in sets:
        # Every region in the set is paired with the same params object.
        tmp = pool1.map(_occHelper, zip(j, itertools.repeat(params)))
        for result in tmp:
            diff_queue.put(result[1])
    pool1.close()
    pool1.join()
    # 'STOP' is the last item queued before waiting for the writer to exit.
    diff_queue.put('STOP')
    diff_process.join()
    # Compress, delete the plain-text file, then index the compressed one.
    pysam.tabix_compress(args.out + '.occdiff.bed',
                         args.out + '.occdiff.bed.gz',
                         force=True)
    shell_command('rm ' + args.out + '.occdiff.bed')
    pysam.tabix_index(args.out + '.occdiff.bed.gz', preset="bed", force=True)
Пример #6
0
def run_vprocess(args):
    """Post-process a V-plot matrix and write the result plus plots.

    Steps, in order: open ``args.vplot``; trim to ``args.lower``,
    ``args.upper`` and ``args.flank``; symmetrize; optionally normalise by
    the fragment sizes in ``args.sizes``; optionally smooth with
    ``args.smooth``; normalise; optionally write the extra plots; save the
    matrix and its plot under the ``args.out`` prefix.
    """
    prefix = args.out
    vmat = V.VMat.open(args.vplot)

    # Trim and symmetrize.
    vmat.trim(args.lower, args.upper, args.flank)
    vmat.symmetrize()

    # Insert size normalisation, only when a sizes file was supplied.
    sizes_path = args.sizes
    if sizes_path is not None:
        vmat.norm_y(FragmentSizes.open(sizes_path))

    # Smoothing is skipped for a non-positive standard deviation.
    if args.smooth > 0:
        vmat.smooth(sd=args.smooth)

    vmat.norm()

    # Extra plots if requested.
    if args.plot_extra:
        vmat.autoCorr()
        vmat.plot_auto(prefix + '.vplot.Autocorr.eps')
        vmat.converto1d()
        vmat.plot_1d(prefix + '.vplot.InsertionProfile.eps')
        vmat.plot_insertsize(prefix + '.vplot.InsertSizes.eps')

    # Save the matrix and its plot.
    vmat.save(prefix + ".VMat")
    vmat.plot(filename=prefix + ".VMat.eps")
Пример #7
0
def run_vprocess(args):
    """Trim, symmetrize, normalise and plot the V-plot matrix in ``args.vplot``.

    Optional steps are controlled by ``args.sizes`` (fragment size
    normalisation), ``args.smooth`` (smoothing when > 0) and
    ``args.plot_extra`` (autocorrelation, insertion profile and insert size
    plots).  All outputs are written with ``args.out`` as prefix.
    """

    def out_path(suffix):
        # Every output file name is the output prefix plus a fixed suffix.
        return args.out + suffix

    vmat = V.VMat.open(args.vplot)
    vmat.trim(args.lower, args.upper, args.flank)
    vmat.symmetrize()

    if args.sizes is not None:
        # Read in the fragment sizes and normalise by them.
        fragment_sizes = FragmentSizes.open(args.sizes)
        vmat.norm_y(fragment_sizes)

    if args.smooth > 0:
        vmat.smooth(sd=args.smooth)

    vmat.norm()

    if args.plot_extra:
        vmat.autoCorr()
        vmat.plot_auto(out_path('.vplot.Autocorr.eps'))
        vmat.converto1d()
        vmat.plot_1d(out_path('.vplot.InsertionProfile.eps'))
        vmat.plot_insertsize(out_path('.vplot.InsertSizes.eps'))

    vmat.save(out_path(".VMat"))
    vmat.plot(filename=out_path(".VMat.eps"))
Пример #8
0
 def __init__(self, flank, lower, upper, bg, fasta, pwm, sizes, scale):
     """Store the run parameters and load the fragment size distribution.

     The PWM (opened from ``pwm``) and the chromosome sizes (read from
     ``fasta``) are only loaded when ``bg`` is None; the fragment sizes
     are always opened from ``sizes``.
     """
     self.flank, self.lower, self.upper = flank, lower, upper
     self.scale = scale
     self.bg = bg
     self.fasta = fasta
     if bg is None:
         self.pwm = PWM.open(pwm)
         self.chrs = read_chrom_sizes_from_fasta(fasta)
     self.fragmentsizes = FragmentSizes.open(sizes)
Пример #9
0
 def modelNFR(self, boundary = 115):
     """Fit an exponential NFR model and derive the nucleosome remainder.

     The exponential is fitted to the observed sizes from 10 sizes past the
     peak below ``boundary`` up to ``boundary``.  ``self.nfr_fit`` holds the
     observed values below ``boundary`` and the fitted curve above it;
     ``self.nuc_fit`` holds zeros below ``boundary`` and the observed values
     minus ``nfr_fit`` above it.  Zero / non-positive entries in either
     vector are replaced by small positive values.
     """
     sizes = self.fragmentsizes
     head = sizes.get(self.lower, boundary)
     # Index of the first maximum, shifted by 10 and back to size units.
     peak_offset = np.where(head == max(head))[0][0]
     start = peak_offset + 10 + self.lower

     def exp_pdf(x, rate, scale):
         # Exponential density scaled by ``scale``, origin at ``start``.
         shifted = x - start
         return scale * rate * np.exp(-rate * shifted)

     fit_x = np.arange(start, boundary)
     fit_y = sizes.get(start, boundary)
     coeff, _ = optimize.curve_fit(exp_pdf, fit_x, fit_y, p0=(.1, 1))

     tail = exp_pdf(np.arange(boundary, self.upper), *coeff)
     nfr = np.concatenate((head, tail))
     # Zeros become 1% of the smallest non-zero value.
     nfr[nfr == 0] = min(nfr[nfr != 0]) * 0.01
     self.nfr_fit = FragmentSizes(self.lower, self.upper, vals=nfr)

     above = sizes.get(boundary, self.upper) - self.nfr_fit.get(boundary, self.upper)
     nuc = np.concatenate((np.zeros(boundary - self.lower), above))
     replacement = min(min(nfr) * 0.1, min(nuc[nuc > 0]) * 0.001)
     nuc[nuc <= 0] = replacement
     self.nuc_fit = FragmentSizes(self.lower, self.upper, vals=nuc)
Пример #10
0
 def __init__(self, flank, lower, upper, bg, fasta, pwm, sizes, scale):
     """Record the parameters; load PWM/chromosome sizes only without a background.

     flank, lower, upper, scale, bg and fasta are stored unchanged.  When
     ``bg`` is None the PWM is opened from ``pwm`` and chromosome sizes are
     read from ``fasta``.  Fragment sizes are opened from ``sizes`` in
     every case.
     """
     self.flank = flank
     self.lower, self.upper = lower, upper
     self.scale = scale
     self.bg, self.fasta = bg, fasta
     needs_sequence_model = bg is None
     if needs_sequence_model:
         self.pwm = PWM.open(pwm)
         self.chrs = read_chrom_sizes_from_fasta(fasta)
     self.fragmentsizes = FragmentSizes.open(sizes)
Пример #11
0
 def modelNFR(self, boundaries = (35,115)):
     """Model the NFR fragment size distribution with a shifted gamma curve.

     For every candidate offset from 15 up to the start of the fit window a
     scaled gamma density is fitted (brute-force grid followed by
     ``optimize.fmin``) to the observed sizes inside the window; the offset
     with the smallest squared error wins.  Attributes set on the instance:

     * ``self.nfr_fit0`` -- the winning gamma curve over [lower, upper).
     * ``self.nfr_fit``  -- observed values below ``boundaries[1]`` followed
       by ``nfr_fit0`` above it, zeros replaced by 1% of the smallest
       non-zero value.
     * ``self.nuc_fit``  -- zeros below ``boundaries[1]`` and observed minus
       ``nfr_fit`` above it, non-positive entries replaced by a small
       positive floor.

     boundaries -- (start, end) of the fit window; the start is lowered to
                   the position of the observed peak when the peak lies
                   before it.

     Raises ValueError when the fit window starts below 15, because no
     candidate offset can then be evaluated.
     """
     window_end = boundaries[1]
     head = self.fragmentsizes.get(self.lower, window_end)
     # Position of the first maximum below the end of the window.
     b = np.where(head == max(head))[0][0] + self.lower
     # The window may start no later than that peak.
     window_start = min(boundaries[0], b)
     if window_start < 15:
         # Without this check the code below would try to index the integer
         # placeholder in res_param and fail with an unrelated TypeError.
         raise ValueError(
             "cannot fit NFR model: fit window starts at %d but candidate "
             "offsets begin at 15" % window_start)
     x = np.arange(window_start, window_end)
     y = self.fragmentsizes.get(window_start, window_end)

     def gamma_fit(X, o, p):
         """Scaled gamma density; p = (shape k, scale theta, amplitude a), shifted by o."""
         k = p[0]
         theta = p[1]
         a = p[2]
         x_mod = X - o
         res = np.zeros(len(x_mod))
         # For k < 1 the density is unbounded at 0, so that point is left
         # at zero.
         if k >= 1:
             nz = x_mod >= 0
         else:
             nz = x_mod > 0
         res[nz] = a * x_mod[nz]**(k-1) * np.exp(-x_mod[nz]/theta) / (theta**k * gamma(k))
         return res

     n_offsets = window_start + 1
     # np.inf replaces np.float('inf'): the np.float alias no longer exists
     # in current NumPy releases.
     res_score = np.full(n_offsets, np.inf)
     res_param = [None] * n_offsets
     pranges = ((0.01,10),(0.01,150),(0.01,1))
     for i in range(15, n_offsets):
         # Bind the current offset explicitly instead of relying on the
         # closure's late binding.
         f = lambda p, i=i: np.sum((gamma_fit(x, i, p) - y)**2)
         tmpres = optimize.brute(f, pranges, full_output=True,
                                 finish=optimize.fmin)
         res_score[i] = tmpres[1]
         res_param[i] = tmpres[0]
     whichres = np.argmin(res_score)
     best_param = res_param[whichres]
     self.nfr_fit0 = FragmentSizes(self.lower, self.upper,
                                   vals = gamma_fit(np.arange(self.lower, self.upper),
                                                    whichres, best_param))
     nfr = np.concatenate((self.fragmentsizes.get(self.lower, window_end),
                           self.nfr_fit0.get(window_end, self.upper)))
     nfr[nfr==0] = min(nfr[nfr!=0])*0.01
     self.nfr_fit = FragmentSizes(self.lower, self.upper, vals = nfr)
     nuc = np.concatenate((np.zeros(window_end - self.lower),
                           self.fragmentsizes.get(window_end, self.upper) -
                           self.nfr_fit.get(window_end, self.upper)))
     nuc[nuc<=0] = min(min(nfr)*0.1, min(nuc[nuc>0])*0.001)
     self.nuc_fit = FragmentSizes(self.lower, self.upper, vals = nuc)
Пример #12
0
def run_diff(args, bases = 500000):
    """Run differential occupancy calling.

    Loads and merges the regions in ``args.bed``, fits the NFR model on the
    fragment size files ``args.sizes1`` and ``args.sizes2``, maps
    ``_occHelper`` over the region sets in a worker pool and forwards
    element 1 of each result to a queue handed to a separate ``_writeDiff``
    process.  ``<args.out>.occdiff.bed`` is then compressed and indexed
    with tabix and the uncompressed file removed.

    args  -- parsed command line options; attributes read here: bam, pwm,
             bed, flank, upper, sizes1, sizes2, fasta, nuc_sep, min_occ,
             confidence_interval, cores, out.
    bases -- passed to ``chunks.split`` to size each set of regions.

    NOTE(review): ``fragment_dist`` passed to ``OccupancyParameters`` is not
    assigned in this function (only ``fragment_dist1``/``fragment_dist2``
    are); unless that name exists at module level this raises NameError --
    confirm which distribution was intended.
    """
    chrs = read_chrom_sizes_from_bam(args.bam)
    pwm = PWM.open(args.pwm)
    # NOTE(review): ``args.upper/2`` is integer division only under Python 2;
    # under Python 3 min_offset becomes a float -- confirm the target version.
    chunks = ChunkList.read(args.bed, chromDict = chrs, min_offset = args.flank + args.upper/2 + max(pwm.up,pwm.down))
    chunks.merge()
    # NOTE(review): maxQueueSize is computed but never used below.
    maxQueueSize = max(2,int(100 * bases / np.mean([chunk.length() for chunk in chunks])))
    # Load both fragment size distributions, restricted to [0, args.upper),
    # and fit the NFR model on each.
    fragment_dist1 = FragmentMixDistribution(0, upper = args.upper)
    fragment_dist1.fragmentsizes = FragmentSizes(0, args.upper, vals = FragmentSizes.open(args.sizes1).get(0,args.upper))
    fragment_dist1.modelNFR()
    fragment_dist2 = FragmentMixDistribution(0, upper = args.upper)
    fragment_dist2.fragmentsizes = FragmentSizes(0, args.upper, vals = FragmentSizes.open(args.sizes2).get(0,args.upper))
    fragment_dist2.modelNFR()
    # NOTE(review): ``fragment_dist`` is undefined here (see docstring).
    params = OccupancyParameters(fragment_dist, args.upper, args.fasta, args.pwm, sep = args.nuc_sep, min_occ = args.min_occ,
            flank = args.flank, bam = args.bam, ci = args.confidence_interval)
    sets = chunks.split(bases = bases)
    # One core is left free; at least one worker is always created.
    pool1 = mp.Pool(processes = max(1,args.cores-1))
    # Create (or truncate) the output file before the writer process starts.
    diff_handle = open(args.out + '.occdiff.bed','w')
    diff_handle.close()
    diff_queue = mp.JoinableQueue()
    diff_process = mp.Process(target = _writeDiff, args=(diff_queue, args.out))
    diff_process.start()
    # NOTE(review): nuc_dist is allocated but never used in this function.
    nuc_dist = np.zeros(args.upper)
    for j in sets:
        # Every region in the set is paired with the same params object.
        tmp = pool1.map(_occHelper, zip(j,itertools.repeat(params)))
        for result in tmp:
            diff_queue.put(result[1])
    pool1.close()
    pool1.join()
    # 'STOP' is the last item queued before waiting for the writer to exit.
    diff_queue.put('STOP')
    diff_process.join()
    # Compress, delete the plain-text file, then index the compressed one.
    pysam.tabix_compress(args.out + '.occdiff.bed', args.out + '.occdiff.bed.gz',force = True)
    shell_command('rm ' + args.out + '.occdiff.bed')
    pysam.tabix_index(args.out + '.occdiff.bed.gz', preset = "bed", force = True)
Пример #13
0
    def modelNFR(self, boundaries=(35, 115)):
        """Model the NFR fragment size distribution with a shifted gamma curve.

        For every candidate offset from 15 up to the start of the fit window
        a scaled gamma density is fitted (brute-force grid followed by
        ``optimize.fmin``) to the observed sizes inside the window; the
        offset with the smallest squared error wins.  Attributes set on the
        instance:

        * ``self.nfr_fit0`` -- the winning gamma curve over [lower, upper).
        * ``self.nfr_fit``  -- observed values below ``boundaries[1]``
          followed by ``nfr_fit0`` above it, zeros replaced by 1% of the
          smallest non-zero value.
        * ``self.nuc_fit``  -- zeros below ``boundaries[1]`` and observed
          minus ``nfr_fit`` above it, non-positive entries replaced by a
          small positive floor.

        boundaries -- (start, end) of the fit window; the start is lowered
                      to the position of the observed peak when the peak
                      lies before it.

        Raises ValueError when the fit window starts below 15, because no
        candidate offset can then be evaluated.
        """
        window_end = boundaries[1]
        head = self.fragmentsizes.get(self.lower, window_end)
        # Position of the first maximum below the end of the window.
        b = np.where(head == max(head))[0][0] + self.lower
        # The window may start no later than that peak.
        window_start = min(boundaries[0], b)
        if window_start < 15:
            # Without this check the code below would try to index the
            # integer placeholder in res_param and fail with an unrelated
            # TypeError.
            raise ValueError(
                "cannot fit NFR model: fit window starts at %d but candidate "
                "offsets begin at 15" % window_start)
        x = np.arange(window_start, window_end)
        y = self.fragmentsizes.get(window_start, window_end)

        def gamma_fit(X, o, p):
            """Scaled gamma density; p = (shape k, scale theta, amplitude a), shifted by o."""
            k = p[0]
            theta = p[1]
            a = p[2]
            x_mod = X - o
            res = np.zeros(len(x_mod))
            # For k < 1 the density is unbounded at 0, so that point is left
            # at zero.
            if k >= 1:
                nz = x_mod >= 0
            else:
                nz = x_mod > 0
            res[nz] = a * x_mod[nz]**(k - 1) * np.exp(
                -x_mod[nz] / theta) / (theta**k * gamma(k))
            return res

        n_offsets = window_start + 1
        # np.inf replaces np.float('inf'): the np.float alias no longer
        # exists in current NumPy releases.
        res_score = np.full(n_offsets, np.inf)
        res_param = [None] * n_offsets
        pranges = ((0.01, 10), (0.01, 150), (0.01, 1))
        for i in range(15, n_offsets):
            # Bind the current offset explicitly instead of relying on the
            # closure's late binding.
            f = lambda p, i=i: np.sum((gamma_fit(x, i, p) - y)**2)
            tmpres = optimize.brute(f,
                                    pranges,
                                    full_output=True,
                                    finish=optimize.fmin)
            res_score[i] = tmpres[1]
            res_param[i] = tmpres[0]
        whichres = np.argmin(res_score)
        best_param = res_param[whichres]
        self.nfr_fit0 = FragmentSizes(self.lower,
                                      self.upper,
                                      vals=gamma_fit(
                                          np.arange(self.lower, self.upper),
                                          whichres, best_param))
        nfr = np.concatenate((self.fragmentsizes.get(self.lower, window_end),
                              self.nfr_fit0.get(window_end, self.upper)))
        nfr[nfr == 0] = min(nfr[nfr != 0]) * 0.01
        self.nfr_fit = FragmentSizes(self.lower, self.upper, vals=nfr)
        nuc = np.concatenate(
            (np.zeros(window_end - self.lower),
             self.fragmentsizes.get(window_end, self.upper) -
             self.nfr_fit.get(window_end, self.upper)))
        nuc[nuc <= 0] = min(min(nfr) * 0.1, min(nuc[nuc > 0]) * 0.001)
        self.nuc_fit = FragmentSizes(self.lower, self.upper, vals=nuc)
Пример #14
0
def get_sizes(args):
    """Compute the fragment size distribution of ``args.bam`` and save it.

    When ``args.out`` is None it is set to the BAM file's base name with its
    last dot-separated component removed.  Sizes are calculated over the
    merged regions of ``args.bed`` when given, otherwise over the whole BAM
    file, and written to ``<out>.fragmentsizes.txt``.  Unless
    ``args.no_plot`` is set, a plot is written to ``<out>.fragmentsizes.pdf``.
    """
    if args.out is None:
        name_parts = os.path.basename(args.bam).split('.')
        args.out = '.'.join(name_parts[0:-1])

    sizes = FragmentSizes(lower=args.lower, upper=args.upper, atac=args.atac)
    if not args.bed:
        sizes.calculateSizes(args.bam)
    else:
        chunks = ChunkList.read(args.bed)
        chunks.merge()
        sizes.calculateSizes(args.bam, chunks)
    sizes.save(args.out + '.fragmentsizes.txt')

    if args.no_plot:
        return

    # Make the figure.
    fig = plt.figure()
    x_values = list(range(sizes.lower, sizes.upper))
    y_values = sizes.get(sizes.lower, sizes.upper)
    plt.plot(x_values, y_values, label=args.out)
    plt.xlabel("Fragment Size")
    plt.ylabel("Frequency")
    fig.savefig(args.out + '.fragmentsizes.pdf')
    plt.close(fig)
Пример #15
0
def get_sizes(args):
    """Compute the fragment size distribution of ``args.bam`` and save it.

    A missing ``args.out`` defaults to the BAM base name without its last
    dot-separated component.  Sizes come from the merged ``args.bed``
    regions when a BED file is given, otherwise from the whole BAM file.
    Outputs: ``<out>.fragmentsizes.txt`` and, unless ``args.no_plot`` is
    set, ``<out>.fragmentsizes.eps``.
    """
    if args.out is None:
        bam_name = os.path.basename(args.bam)
        args.out = '.'.join(bam_name.split('.')[0:-1])

    sizes = FragmentSizes(lower = args.lower, upper = args.upper, atac = args.atac)
    if not args.bed:
        sizes.calculateSizes(args.bam)
    else:
        chunks = ChunkList.read(args.bed)
        chunks.merge()
        sizes.calculateSizes(args.bam, chunks)
    sizes.save(args.out+'.fragmentsizes.txt')

    if args.no_plot:
        return

    # Make the figure.
    fig = plt.figure()
    size_axis = range(sizes.lower, sizes.upper)
    frequencies = sizes.get(sizes.lower, sizes.upper)
    plt.plot(size_axis, frequencies, label = args.out)
    plt.xlabel("Fragment Size")
    plt.ylabel("Frequency")
    fig.savefig(args.out+'.fragmentsizes.eps')
    plt.close(fig)
Пример #16
0
def run_occ(args):
    """Run occupancy calling.

    Reads the regions in ``args.bed``, pads them by half of
    ``args.nuc_sep`` and merges them, obtains the fragment size
    distribution (from ``args.sizes`` when given, otherwise from
    ``args.bam``), fits the NFR model and maps ``_occHelper`` over the
    region sets in a worker pool.  From every result, element 0 is added to
    the nucleosome size distribution, element 1 goes to the queue of the
    ``_writeOcc`` process and element 2 to the queue of the ``_writePeaks``
    process.

    Files written with ``args.out`` as prefix: ``.occ_fit.eps``,
    ``.fragmentsizes.txt``, ``.occpeaks.bed.gz``, ``.occ.bedgraph.gz``,
    ``.occ.lower_bound.bedgraph.gz``, ``.occ.upper_bound.bedgraph.gz`` (each
    compressed file is tabix-indexed), ``.nuc_dist.txt`` and
    ``.nuc_dist.eps``.

    args -- parsed command line options; attributes read here: fasta, bam,
            pwm, bed, flank, upper, nuc_sep, cores, sizes, out, min_occ,
            confidence_interval, step.
    """
    if args.fasta:
        chrs = read_chrom_sizes_from_fasta(args.fasta)
    else:
        chrs = read_chrom_sizes_from_bam(args.bam)
    pwm = PWM.open(args.pwm)
    # Floor division keeps the offsets integral under Python 2 and Python 3.
    half_sep = args.nuc_sep // 2
    chunks = ChunkList.read(args.bed, chromDict = chrs,
                            min_offset = args.flank + args.upper // 2 + max(pwm.up,pwm.down) + half_sep)
    chunks.slop(chrs, up = half_sep, down = half_sep)
    chunks.merge()
    maxQueueSize = args.cores*10
    fragment_dist = FragmentMixDistribution(0, upper = args.upper)
    if args.sizes is not None:
        tmp = FragmentSizes.open(args.sizes)
        fragment_dist.fragmentsizes = FragmentSizes(0, args.upper, vals = tmp.get(0,args.upper))
    else:
        fragment_dist.getFragmentSizes(args.bam, chunks)
    fragment_dist.modelNFR()
    fragment_dist.plotFits(args.out + '.occ_fit.eps')
    fragment_dist.fragmentsizes.save(args.out + '.fragmentsizes.txt')
    params = OccupancyParameters(fragment_dist, args.upper, args.fasta, args.pwm, sep = args.nuc_sep, min_occ = args.min_occ,
            flank = args.flank, bam = args.bam, ci = args.confidence_interval, step = args.step)
    sets = chunks.split(items = args.cores * 5)
    # One core is left free; at least one worker is always created.
    pool1 = mp.Pool(processes = max(1,args.cores-1))
    # Create (or truncate) the track files before their writer starts.
    for track in ('occ', 'occ.lower_bound', 'occ.upper_bound'):
        with open(args.out + '.' + track + '.bedgraph', 'w'):
            pass
    write_queue = mp.JoinableQueue(maxsize = maxQueueSize)
    write_process = mp.Process(target = _writeOcc, args=(write_queue, args.out))
    write_process.start()
    # Same for the peaks file and its writer.
    with open(args.out + '.occpeaks.bed', 'w'):
        pass
    peaks_queue = mp.JoinableQueue()
    peaks_process = mp.Process(target = _writePeaks, args=(peaks_queue, args.out))
    peaks_process.start()
    nuc_dist = np.zeros(args.upper)
    for j in sets:
        # Every region in the set is paired with the same params object.
        tmp = pool1.map(_occHelper, zip(j,itertools.repeat(params)))
        for result in tmp:
            nuc_dist += result[0]
            write_queue.put(result[1])
            peaks_queue.put(result[2])
    pool1.close()
    pool1.join()
    # 'STOP' is the last item queued before waiting for the writers to exit.
    write_queue.put('STOP')
    peaks_queue.put('STOP')
    write_process.join()
    peaks_process.join()
    # Compress, delete the plain-text file, then index the compressed one.
    pysam.tabix_compress(args.out + '.occpeaks.bed', args.out + '.occpeaks.bed.gz',force = True)
    shell_command('rm ' + args.out + '.occpeaks.bed')
    pysam.tabix_index(args.out + '.occpeaks.bed.gz', preset = "bed", force = True)
    for i in ('occ','occ.lower_bound','occ.upper_bound'):
        pysam.tabix_compress(args.out + '.' + i + '.bedgraph', args.out + '.'+i+'.bedgraph.gz',force = True)
        shell_command('rm ' + args.out + '.' + i + '.bedgraph')
        pysam.tabix_index(args.out + '.' + i + '.bedgraph.gz', preset = "bed", force = True)

    dist_out = FragmentSizes(0, args.upper, vals = nuc_dist)
    dist_out.save(args.out + '.nuc_dist.txt')

    # Call form prints the same text on Python 2 and is valid on Python 3.
    print("Making figure")
    # Make the figure.
    fig = plt.figure()
    plt.plot(list(range(0,args.upper)),dist_out.get(0,args.upper),label = "Nucleosome Distribution")
    plt.xlabel("Fragment Size")
    plt.ylabel("Frequency")
    fig.savefig(args.out+'.nuc_dist.eps')
    plt.close(fig)
Пример #17
0
def run_nuc(args):
    """Run nucleosome calling and write the position and signal tracks.

    Reads the regions in ``args.bed``, pads them by half of
    ``args.nuc_sep`` and merges them, obtains the fragment sizes (from
    ``args.sizes`` when given, otherwise calculated from ``args.bam``) and
    maps ``_nucHelper`` over the region sets in a worker pool.  Each output
    track has its own queue and writer process (taken from
    ``_writeFuncs``); afterwards every track file is compressed, removed
    and tabix-indexed.

    args -- parsed command line options; attributes read here: vmat, fasta,
            bam, pwm, bed, nuc_sep, cores, sizes, occ_track, sd,
            redundant_sep, min_z, min_lr, atac, write_all, out.
    """
    vmat = VMat.open(args.vmat)
    chrs = (read_chrom_sizes_from_fasta(args.fasta)
            if args.fasta else read_chrom_sizes_from_bam(args.bam))
    pwm = PWM.open(args.pwm)

    half_sep = args.nuc_sep // 2
    edge_margin = (vmat.mat.shape[1] + vmat.upper // 2 +
                   max(pwm.up, pwm.down) + half_sep)
    chunks = ChunkList.read(args.bed,
                            chromDict=chrs,
                            min_offset=edge_margin,
                            min_length=args.nuc_sep * 2)
    chunks.slop(chrs, up=half_sep, down=half_sep)
    chunks.merge()
    queue_limit = args.cores * 10

    if args.sizes is None:
        fragment_dist = FragmentSizes(0, upper=vmat.upper)
        fragment_dist.calculateSizes(args.bam, chunks)
    else:
        fragment_dist = FragmentSizes.open(args.sizes)

    params = NucParameters(vmat=vmat,
                           fragmentsizes=fragment_dist,
                           bam=args.bam,
                           fasta=args.fasta,
                           pwm=args.pwm,
                           occ_track=args.occ_track,
                           sd=args.sd,
                           nonredundant_sep=args.nuc_sep,
                           redundant_sep=args.redundant_sep,
                           min_z=args.min_z,
                           min_lr=args.min_lr,
                           atac=args.atac)
    sets = chunks.split(items=args.cores * 5)
    # One core is left free; at least one worker is always created.
    pool1 = mp.Pool(processes=max(1, args.cores - 1))

    outputs = [
        'nucpos', 'nucpos.redundant', 'nucleoatac_signal',
        'nucleoatac_signal.smooth'
    ]
    if args.write_all:
        outputs.extend(['nucleoatac_background', 'nucleoatac_raw'])

    write_queues = {}
    write_processes = {}
    for name in outputs:
        if name in ('nucpos', 'nucpos.redundant', 'nfrpos'):
            suffix = '.bed'
        else:
            suffix = '.bedgraph'
        # Create (or truncate) the file before its writer process starts.
        with open(args.out + '.' + name + suffix, 'w'):
            pass
        write_queues[name] = mp.JoinableQueue(maxsize=queue_limit)
        write_processes[name] = mp.Process(target=_writeFuncs[name],
                                           args=(write_queues[name],
                                                 args.out))
        write_processes[name].start()

    for region_set in sets:
        # Every region in the set is paired with the same params object.
        jobs = list(zip(region_set, itertools.repeat(params)))
        for result in pool1.map(_nucHelper, jobs):
            for name in outputs:
                write_queues[name].put(result[name])
    pool1.close()
    pool1.join()

    # 'STOP' is the last item queued for every writer.
    for name in outputs:
        write_queues[name].put('STOP')

    for name in outputs:
        write_processes[name].join()
        if name in ('nucpos', 'nucpos.redundant'):
            suffix = '.bed'
        else:
            suffix = '.bedgraph'
        plain_path = args.out + '.' + name + suffix
        # Compress, delete the plain-text file, then index the archive.
        pysam.tabix_compress(plain_path, plain_path + '.gz', force=True)
        shell_command('rm ' + plain_path)
        pysam.tabix_index(plain_path + '.gz', preset="bed", force=True)
Пример #18
0
def run_nuc(args):
    """Run nucleosome calling and write the position and signal tracks.

    Reads the regions in ``args.bed``, pads them by half of
    ``args.nuc_sep`` and merges them, obtains the fragment sizes (from
    ``args.sizes`` when given, otherwise calculated from ``args.bam``) and
    maps ``_nucHelper`` over the region sets in a worker pool.  Each output
    track has its own queue and writer process (taken from
    ``_writeFuncs``); afterwards every track file is compressed, removed
    and tabix-indexed.

    args -- parsed command line options; attributes read here: vmat, fasta,
            bam, pwm, bed, nuc_sep, cores, sizes, occ_track, sd,
            redundant_sep, min_z, min_lr, atac, write_all, out.
    """
    vmat = VMat.open(args.vmat)
    if args.fasta:
        chrs = read_chrom_sizes_from_fasta(args.fasta)
    else:
        chrs = read_chrom_sizes_from_bam(args.bam)
    pwm = PWM.open(args.pwm)
    # Floor division keeps the offsets integral under Python 2 and Python 3.
    half_sep = args.nuc_sep // 2
    chunks = ChunkList.read(args.bed, chromDict = chrs,
                            min_offset = vmat.mat.shape[1] + vmat.upper // 2 + max(pwm.up,pwm.down) + half_sep,
                            min_length = args.nuc_sep * 2)
    chunks.slop(chrs, up = half_sep, down = half_sep)
    chunks.merge()
    maxQueueSize = args.cores*10
    if args.sizes is not None:
        fragment_dist = FragmentSizes.open(args.sizes)
    else:
        fragment_dist = FragmentSizes(0, upper = vmat.upper)
        fragment_dist.calculateSizes(args.bam, chunks)
    params = NucParameters(vmat = vmat, fragmentsizes = fragment_dist, bam = args.bam, fasta = args.fasta, pwm = args.pwm,
                           occ_track = args.occ_track,
                           sd = args.sd, nonredundant_sep = args.nuc_sep, redundant_sep = args.redundant_sep,
                           min_z = args.min_z, min_lr = args.min_lr , atac = args.atac)
    sets = chunks.split(items = args.cores*5)
    # One core is left free; at least one worker is always created.
    pool1 = mp.Pool(processes = max(1,args.cores-1))
    if args.write_all:
        outputs = ['nucpos','nucpos.redundant','nucleoatac_signal','nucleoatac_signal.smooth',
                       'nucleoatac_background','nucleoatac_raw']
    else:
        outputs = ['nucpos','nucpos.redundant','nucleoatac_signal','nucleoatac_signal.smooth']
    write_queues = {}
    write_processes = {}
    for i in outputs:
        if i not in ['nucpos','nucpos.redundant','nfrpos']:
            path = args.out + '.'+i+'.bedgraph'
        else:
            path = args.out + '.'+i+'.bed'
        # Create (or truncate) the file before its writer process starts;
        # the context manager closes the handle even if opening the next
        # resource fails.
        with open(path, 'w'):
            pass
        write_queues[i] = mp.JoinableQueue(maxsize = maxQueueSize)
        write_processes[i] = mp.Process(target = _writeFuncs[i], args=(write_queues[i], args.out))
        write_processes[i].start()
    for j in sets:
        # Every region in the set is paired with the same params object.
        tmp = pool1.map(_nucHelper, zip(j,itertools.repeat(params)))
        for result in tmp:
            for i in outputs:
                write_queues[i].put(result[i])
    pool1.close()
    pool1.join()
    # 'STOP' is the last item queued for every writer.
    for i in outputs:
        write_queues[i].put('STOP')
    for i in outputs:
        write_processes[i].join()
        # Compress, delete the plain-text file, then index the archive.
        if i not in ['nucpos','nucpos.redundant']:
            pysam.tabix_compress(args.out + '.' + i + '.bedgraph', args.out +  '.' + i + '.bedgraph.gz',force = True)
            shell_command('rm ' + args.out +  '.' + i + '.bedgraph')
            pysam.tabix_index(args.out +  '.' + i + '.bedgraph.gz', preset = "bed", force = True)
        else:
            pysam.tabix_compress(args.out + '.' + i + '.bed', args.out +  '.' + i + '.bed.gz',force = True)
            shell_command('rm ' + args.out +  '.' + i + '.bed')
            pysam.tabix_index(args.out +  '.' + i + '.bed.gz', preset = "bed", force = True)
Пример #19
0
class FragmentMixDistribution:
    """Insert size distribution modelled as a mix of NFR and nucleosome fragments."""

    def __init__(self,  lower = 0, upper =2000):
        """Store the size range [lower, upper) covered by the distribution."""
        self.lower, self.upper = lower, upper

    def getFragmentSizes(self, bamfile, chunklist = None):
        """Calculate ``self.fragmentsizes`` from ``bamfile``, optionally limited to ``chunklist``."""
        sizes = FragmentSizes(self.lower, self.upper)
        self.fragmentsizes = sizes
        sizes.calculateSizes(bamfile, chunks = chunklist)

    def modelNFR(self, boundary = 115):
        """Fit an exponential NFR model and derive the nucleosome remainder.

        The exponential is fitted to the observed sizes from 10 sizes past
        the peak below ``boundary`` up to ``boundary``.  ``self.nfr_fit``
        holds the observed values below ``boundary`` and the fitted curve
        above it; ``self.nuc_fit`` holds zeros below ``boundary`` and the
        observed values minus ``nfr_fit`` above it.  Zero / non-positive
        entries are replaced by small positive values.
        """
        observed = self.fragmentsizes
        below = observed.get(self.lower, boundary)
        # First occurrence of the largest value below the boundary; the fit
        # window begins 10 sizes after it.
        fit_start = np.where(below == max(below))[0][0] + 10 + self.lower

        def exp_pdf(x, rate, scale):
            # Scaled exponential density with its origin at fit_start.
            return scale * rate * np.exp(-rate * (x - fit_start))

        coeff, _ = optimize.curve_fit(exp_pdf,
                                      np.arange(fit_start, boundary),
                                      observed.get(fit_start, boundary),
                                      p0=(.1, 1))
        extrapolated = exp_pdf(np.arange(boundary, self.upper), *coeff)
        nfr = np.concatenate((below, extrapolated))
        # Exact zeros are replaced by 1% of the smallest non-zero value.
        nfr[nfr == 0] = min(nfr[nfr != 0]) * 0.01
        self.nfr_fit = FragmentSizes(self.lower, self.upper, vals = nfr)

        residual = (observed.get(boundary, self.upper) -
                    self.nfr_fit.get(boundary, self.upper))
        nuc = np.concatenate((np.zeros(boundary - self.lower), residual))
        floor = min(min(nfr) * 0.1, min(nuc[nuc > 0]) * 0.001)
        nuc[nuc <= 0] = floor
        self.nuc_fit = FragmentSizes(self.lower, self.upper, vals = nuc)

    def plotFits(self,filename=None):
        """Plot the observed distribution together with both fits.

        With ``filename`` the figure is saved there and the three curves are
        also written, tab-delimited, to the same name with its extension
        replaced by ``txt``; without it the figure is shown.
        """
        curves = (("fragmentsizes", "Observed"),
                  ("nuc_fit", "Nucleosome Fit"),
                  ("nfr_fit", "NFR Fit"))
        fig = plt.figure()
        for attr, label in curves:
            plt.plot(range(self.lower, self.upper), getattr(self, attr).get(),
                     label = label)
        plt.legend()
        plt.xlabel("Fragment size")
        plt.ylabel("Relative Frequency")
        if not filename:
            fig.show()
            return
        fig.savefig(filename)
        plt.close(fig)
        # Also save the plotted values as text, one row per curve.
        text_name = ".".join(filename.split(".")[:-1] + ['txt'])
        rows = tuple(getattr(self, attr).get() for attr, _ in curves)
        np.savetxt(text_name, np.vstack(rows), delimiter="\t")
Пример #20
0
class FragmentMixDistribution:
    """Insert size distribution split into an NFR part and a nucleosome part."""

    def __init__(self, lower=0, upper=2000):
        """Remember the size range [lower, upper) of the distribution."""
        self.lower, self.upper = lower, upper

    def getFragmentSizes(self, bamfile, chunklist=None):
        """Fill ``self.fragmentsizes`` from ``bamfile``; ``chunklist`` is passed on as ``chunks``."""
        sizes = FragmentSizes(self.lower, self.upper)
        self.fragmentsizes = sizes
        sizes.calculateSizes(bamfile, chunks=chunklist)

    def modelNFR(self, boundary=115):
        """Model the NFR distribution with an exponential and store both fits.

        The fit uses the observed sizes between 10 sizes past the peak found
        below ``boundary`` and ``boundary``.  ``self.nfr_fit`` is the
        observed distribution below ``boundary`` continued by the fitted
        curve; ``self.nuc_fit`` is what remains of the observed distribution
        above ``boundary`` after subtracting ``nfr_fit``.  Zero or
        non-positive entries are replaced by small positive values.
        """
        sizes = self.fragmentsizes
        head = sizes.get(self.lower, boundary)
        # Index of the first maximum, shifted by 10 and back to size units.
        peak_offset = np.where(head == max(head))[0][0]
        start = peak_offset + 10 + self.lower

        def exp_pdf(x, rate, scale):
            # Exponential density scaled by ``scale``, origin at ``start``.
            shifted = x - start
            return scale * rate * np.exp(-rate * shifted)

        fit_x = np.arange(start, boundary)
        fit_y = sizes.get(start, boundary)
        coeff, _ = optimize.curve_fit(exp_pdf, fit_x, fit_y, p0=(.1, 1))

        tail = exp_pdf(np.arange(boundary, self.upper), *coeff)
        nfr = np.concatenate((head, tail))
        # Zeros become 1% of the smallest non-zero value.
        nfr[nfr == 0] = min(nfr[nfr != 0]) * 0.01
        self.nfr_fit = FragmentSizes(self.lower, self.upper, vals=nfr)

        above = (sizes.get(boundary, self.upper) -
                 self.nfr_fit.get(boundary, self.upper))
        nuc = np.concatenate((np.zeros(boundary - self.lower), above))
        replacement = min(min(nfr) * 0.1, min(nuc[nuc > 0]) * 0.001)
        nuc[nuc <= 0] = replacement
        self.nuc_fit = FragmentSizes(self.lower, self.upper, vals=nuc)

    def plotFits(self, filename=None):
        """Plot observed sizes, nucleosome fit and NFR fit.

        If ``filename`` is given the figure is saved to it and the three
        curves are additionally written (tab-delimited, one row each) to the
        same path with the extension replaced by ``txt``.  Otherwise the
        figure is shown.
        """
        series = (("fragmentsizes", "Observed"),
                  ("nuc_fit", "Nucleosome Fit"),
                  ("nfr_fit", "NFR Fit"))
        fig = plt.figure()
        for attr_name, legend_label in series:
            plt.plot(range(self.lower, self.upper),
                     getattr(self, attr_name).get(),
                     label=legend_label)
        plt.legend()
        plt.xlabel("Fragment size")
        plt.ylabel("Relative Frequency")
        if filename:
            fig.savefig(filename)
            plt.close(fig)
            # Also save the text output.
            stem = filename.split(".")[:-1]
            filename2 = ".".join(stem + ['txt'])
            out = np.vstack(
                tuple(getattr(self, attr_name).get()
                      for attr_name, _ in series))
            np.savetxt(filename2, out, delimiter="\t")
        else:
            fig.show()
Пример #21
0
class FragmentMixDistribution:
    """Class for modelling insert size distribution.

    The observed distribution is stored in ``self.fragmentsizes``;
    :meth:`modelNFR` derives ``self.nfr_fit0``, ``self.nfr_fit`` and
    ``self.nuc_fit`` from it, and :meth:`plotFits` plots all of them.
    """

    # Smallest gamma-curve offset tried by modelNFR.
    _MIN_OFFSET = 15

    def __init__(self, lower=0, upper=2000):
        """Store the size range; ``range(lower, upper)`` is the size axis."""
        self.lower = lower
        self.upper = upper

    def getFragmentSizes(self, bamfile, chunklist=None):
        """Create ``self.fragmentsizes`` and calculate sizes from ``bamfile``.

        Args:
            bamfile: Passed through to ``FragmentSizes.calculateSizes``.
            chunklist: Passed through as the ``chunks`` keyword argument.
        """
        self.fragmentsizes = FragmentSizes(self.lower, self.upper)
        self.fragmentsizes.calculateSizes(bamfile, chunks=chunklist)

    def modelNFR(self, boundaries=(35, 115)):
        """Model NFR distribution with gamma distribution.

        The lower boundary is first reduced to the position of the maximum of
        the observed distribution below ``boundaries[1]`` if that position is
        smaller.  For every integer offset from 15 up to the lower boundary, a
        scaled, shifted gamma density with parameters ``(k, theta, a)`` is
        fitted by least squares to the observed values in
        ``[boundaries[0], boundaries[1])`` using a brute-force grid search
        refined by ``optimize.fmin``.  The offset with the smallest squared
        error is kept.

        Sets:
            self.nfr_fit0: the fitted gamma curve over the whole size range.
            self.nfr_fit: observed values below ``boundaries[1]``, fitted
                curve from there on; exact zeros are replaced by 1% of the
                smallest non-zero value.
            self.nuc_fit: zero below ``boundaries[1]``, observed minus
                ``nfr_fit`` above; non-positive entries are replaced by a
                small positive floor.

        Args:
            boundaries: ``(lower, upper)`` sizes delimiting the fitted region.

        Raises:
            ValueError: if the (possibly reduced) lower boundary is below 15,
                in which case no offset can be tried.
        """
        upper_bound = boundaries[1]
        below_upper = self.fragmentsizes.get(self.lower, upper_bound)
        mode = np.where(below_upper == max(below_upper))[0][0] + self.lower
        lower_bound = min(boundaries[0], mode)
        if lower_bound < self._MIN_OFFSET:
            # Without this check no fit is attempted and the code fails later
            # with an unrelated-looking TypeError.
            raise ValueError(
                "lower boundary {0} is below the minimum offset {1}; "
                "no gamma fit can be attempted".format(lower_bound,
                                                       self._MIN_OFFSET))

        x = np.arange(lower_bound, upper_bound)
        y = self.fragmentsizes.get(lower_bound, upper_bound)

        def gamma_fit(X, o, p):
            """Gamma density (shape p[0], scale p[1]) shifted by o, times p[2]."""
            k = p[0]
            theta = p[1]
            a = p[2]
            x_mod = X - o
            res = np.zeros(len(x_mod))
            # For k < 1 the density is unbounded at zero, so exclude x_mod == 0.
            if k >= 1:
                nz = x_mod >= 0
            else:
                nz = x_mod > 0
            res[nz] = (a * x_mod[nz] ** (k - 1) * np.exp(-x_mod[nz] / theta)
                       / (theta ** k * gamma(k)))
            return res

        def squared_error(p, offset):
            """Sum of squared residuals of the gamma curve against y."""
            return np.sum((gamma_fit(x, offset, p) - y) ** 2)

        # Index i holds the result for offset i; untried offsets stay at inf
        # so argmin never selects them.
        res_score = np.full(lower_bound + 1, np.inf)
        res_param = [None] * (lower_bound + 1)
        pranges = ((0.01, 10), (0.01, 150), (0.01, 1))  # (k, theta, a)
        for offset in range(self._MIN_OFFSET, lower_bound + 1):
            fit = optimize.brute(squared_error, pranges, args=(offset,),
                                 full_output=True, finish=optimize.fmin)
            res_score[offset] = fit[1]
            res_param[offset] = fit[0]
        whichres = np.argmin(res_score)

        all_sizes = np.arange(self.lower, self.upper)
        self.nfr_fit0 = FragmentSizes(
            self.lower, self.upper,
            vals=gamma_fit(all_sizes, whichres, res_param[whichres]))

        nfr = np.concatenate((self.fragmentsizes.get(self.lower, upper_bound),
                              self.nfr_fit0.get(upper_bound, self.upper)))
        nfr[nfr == 0] = min(nfr[nfr != 0]) * 0.01
        self.nfr_fit = FragmentSizes(self.lower, self.upper, vals=nfr)

        nuc = np.concatenate((np.zeros(upper_bound - self.lower),
                              self.fragmentsizes.get(upper_bound, self.upper) -
                              self.nfr_fit.get(upper_bound, self.upper)))
        nuc[nuc <= 0] = min(min(nfr) * 0.1, min(nuc[nuc > 0]) * 0.001)
        self.nuc_fit = FragmentSizes(self.lower, self.upper, vals=nuc)

    @staticmethod
    def _text_output_path(filename):
        """Return ``filename`` with its extension replaced by ``.txt``."""
        import os  # local import: only this helper needs it

        # splitext only strips an extension from the final path component, so
        # "plot" becomes "plot.txt" and "out.dir/plot" keeps its directory.
        return os.path.splitext(filename)[0] + ".txt"

    def plotFits(self, filename=None):
        """Plot the observed distribution together with the fitted models.

        Args:
            filename: Optional path for the figure.  When given, the figure is
                saved there and closed, and the observed, nucleosome and NFR
                model curves are also written as tab-delimited rows to the
                same path with its extension replaced by ``.txt``.  When
                falsy, the figure is shown instead of being saved.
        """
        fig = plt.figure()
        sizes = range(self.lower, self.upper)
        observed = self.fragmentsizes.get()
        nuc_model = self.nuc_fit.get()
        nfr_model = self.nfr_fit.get()

        plt.plot(sizes, observed, label="Observed")
        plt.plot(sizes, self.nfr_fit0.get(), label="NFR Fit")
        plt.plot(sizes, nuc_model, label="Nucleosome Model")
        plt.plot(sizes, nfr_model, label="NFR Model")
        plt.legend()
        plt.xlabel("Fragment size")
        plt.ylabel("Relative Frequency")

        if not filename:
            fig.show()
            return

        fig.savefig(filename)
        plt.close(fig)
        # Also save text output.
        np.savetxt(self._text_output_path(filename),
                   np.vstack((observed, nuc_model, nfr_model)),
                   delimiter="\t")
Пример #22
0
 def getFragmentSizes(self, bamfile, chunklist = None):
     """Create ``self.fragmentsizes`` and calculate sizes from ``bamfile``.

     A ``FragmentSizes`` object is built from ``self.lower`` and
     ``self.upper``, stored on the instance, and then asked to calculate
     sizes from ``bamfile``, with ``chunklist`` forwarded as ``chunks``.
     """
     sizes = FragmentSizes(self.lower, self.upper)
     # Store before calculating so the attribute exists even if the
     # calculation raises.
     self.fragmentsizes = sizes
     sizes.calculateSizes(bamfile, chunks = chunklist)