def modelNFR(self, boundary=115):
    """Model NFR distribution with exponential distribution"""
    b = np.where(
        self.fragmentsizes.get(self.lower, boundary) == max(
            self.fragmentsizes.get(self.lower, boundary)))[0][0] + 10 + self.lower

    def exp_pdf(x, *p):  #defines the PDF
        k = p[0]
        a = p[1]
        x = x - b
        return a * k * np.exp(-k * x)

    x = np.array(range(b, boundary))
    p0 = (.1, 1)
    coeff, var_matrix = optimize.curve_fit(exp_pdf, x,
                                           self.fragmentsizes.get(b, boundary),
                                           p0=p0)
    nfr = np.concatenate((self.fragmentsizes.get(self.lower, boundary),
                          exp_pdf(np.array(range(boundary, self.upper)), *coeff)))
    nfr[nfr == 0] = min(nfr[nfr != 0]) * 0.01
    self.nfr_fit = FragmentSizes(self.lower, self.upper, vals=nfr)
    nuc = np.concatenate((np.zeros(boundary - self.lower),
                          self.fragmentsizes.get(boundary, self.upper) -
                          self.nfr_fit.get(boundary, self.upper)))
    nuc[nuc <= 0] = min(min(nfr) * 0.1, min(nuc[nuc > 0]) * 0.001)
    self.nuc_fit = FragmentSizes(self.lower, self.upper, vals=nuc)
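# Illustrative sketch (not part of NucleoATAC): fitting a shifted exponential
# to the short-fragment tail of a synthetic size distribution with
# scipy.optimize.curve_fit, mirroring the approach in modelNFR above.
# All names and numbers here are made up for demonstration.
import numpy as np
from scipy import optimize

rng = np.random.default_rng(0)
sizes = np.arange(40, 115)                       # fragment sizes in the fit window
true_k, true_a = 0.05, 1.0
counts = true_a * true_k * np.exp(-true_k * (sizes - 40))
counts = counts + rng.normal(scale=0.0005, size=sizes.shape)  # add a little noise

def exp_pdf(x, k, a, b=40):
    # shifted exponential decay, as in modelNFR's inner exp_pdf
    return a * k * np.exp(-k * (x - b))

(k_hat, a_hat), _ = optimize.curve_fit(exp_pdf, sizes, counts, p0=(0.1, 1.0))
print("estimated decay rate:", k_hat, "scale:", a_hat)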
def test_normByInsertDist(self):
    """test that normalization by insert distribution works as expected"""
    isizes = FragmentSizes(lower=100, upper=200, vals=np.array(range(100, 200)))
    self.biasmat.normByInsertDist(isizes)
    a1 = self.biastrack.get(pos=self.biasmat.start - 50)
    a2 = self.biastrack.get(pos=self.biasmat.start + 50)
    correct = np.exp(a1 + a2) * isizes.get(size=101)
    self.assertTrue(abs(correct - self.biasmat.mat[1, 0]) < 0.01 * correct)
def setUp(self):
    """setup Test_occupancy class by establishing parameters"""
    self.fragment_dist = FragmentMixDistribution(0, 3)
    self.fragment_dist.nfr_fit = FragmentSizes(0, 3, vals=np.array([0.5, 0.49, 0.01]))
    self.fragment_dist.nuc_fit = FragmentSizes(0, 3, vals=np.array([0.01, 0.49, 0.5]))
    self.params = OccupancyCalcParams(0, 3, self.fragment_dist)
def run_diff(args, bases=500000):
    """run differential occupancy calling """
    chrs = read_chrom_sizes_from_bam(args.bam)
    pwm = PWM.open(args.pwm)
    chunks = ChunkList.read(args.bed, chromDict=chrs,
                            min_offset=args.flank + args.upper / 2 + max(pwm.up, pwm.down))
    chunks.merge()
    maxQueueSize = max(2, int(100 * bases / np.mean([chunk.length() for chunk in chunks])))
    #get fragmentsizes
    fragment_dist1 = FragmentMixDistribution(0, upper=args.upper)
    fragment_dist1.fragmentsizes = FragmentSizes(0, args.upper,
                                                 vals=FragmentSizes.open(args.sizes1).get(0, args.upper))
    fragment_dist1.modelNFR()
    fragment_dist2 = FragmentMixDistribution(0, upper=args.upper)
    fragment_dist2.fragmentsizes = FragmentSizes(0, args.upper,
                                                 vals=FragmentSizes.open(args.sizes2).get(0, args.upper))
    fragment_dist2.modelNFR()
    # NOTE: `fragment_dist` is not defined in this scope; the two distributions
    # built above (fragment_dist1, fragment_dist2) are presumably what was intended.
    params = OccupancyParameters(fragment_dist, args.upper, args.fasta, args.pwm,
                                 sep=args.nuc_sep, min_occ=args.min_occ,
                                 flank=args.flank, bam=args.bam,
                                 ci=args.confidence_interval)
    sets = chunks.split(bases=bases)
    pool1 = mp.Pool(processes=max(1, args.cores - 1))
    diff_handle = open(args.out + '.occdiff.bed', 'w')
    diff_handle.close()
    diff_queue = mp.JoinableQueue()
    diff_process = mp.Process(target=_writeDiff, args=(diff_queue, args.out))
    diff_process.start()
    nuc_dist = np.zeros(args.upper)
    for j in sets:
        tmp = pool1.map(_occHelper, zip(j, itertools.repeat(params)))
        for result in tmp:
            diff_queue.put(result[1])
    pool1.close()
    pool1.join()
    diff_queue.put('STOP')
    diff_process.join()
    pysam.tabix_compress(args.out + '.occdiff.bed', args.out + '.occdiff.bed.gz', force=True)
    shell_command('rm ' + args.out + '.occdiff.bed')
    pysam.tabix_index(args.out + '.occdiff.bed.gz', preset="bed", force=True)
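# Illustrative sketch (assumptions, not NucleoATAC code): the worker/writer
# pattern used by run_diff above -- a multiprocessing.Pool computes results in
# chunks while a dedicated writer process drains a queue and appends to one
# output file, stopping on a 'STOP' sentinel. The file name and helper
# functions here are hypothetical.
import multiprocessing as mp

def _writer(queue, out):
    # single writer process: append each result until the sentinel arrives
    with open(out + '.results.txt', 'a') as handle:
        while True:
            item = queue.get()
            if item == 'STOP':
                queue.task_done()
                break
            handle.write(str(item) + '\n')
            queue.task_done()

def _work(x):
    return x * x  # stand-in for the per-chunk occupancy computation

if __name__ == '__main__':
    queue = mp.JoinableQueue()
    writer = mp.Process(target=_writer, args=(queue, 'demo'))
    writer.start()
    pool = mp.Pool(processes=2)
    for result in pool.map(_work, range(10)):
        queue.put(result)
    pool.close()
    pool.join()
    queue.put('STOP')
    writer.join()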
def run_vprocess(args):
    """process vplot """
    vmat = V.VMat.open(args.vplot)
    #Trim, Symmetrize
    vmat.trim(args.lower, args.upper, args.flank)
    vmat.symmetrize()
    #insert size norm
    if args.sizes is not None:
        #read in fragmentsizes
        nuc_dist = FragmentSizes.open(args.sizes)
        vmat.norm_y(nuc_dist)
    ##Smooth
    if args.smooth > 0:
        vmat.smooth(sd=args.smooth)
    #normalize
    vmat.norm()
    #Make extra plots if requested
    if args.plot_extra:
        vmat.autoCorr()
        vmat.plot_auto(args.out + '.vplot.Autocorr.eps')
        vmat.converto1d()
        vmat.plot_1d(args.out + '.vplot.InsertionProfile.eps')
        vmat.plot_insertsize(args.out + '.vplot.InsertSizes.eps')
    #make plot and save
    vmat.save(args.out + ".VMat")
    vmat.plot(filename=args.out + ".VMat.eps")
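# Illustrative sketch (assumptions, not the VMat API): the kind of operations
# run_vprocess applies to a V-plot matrix -- symmetrizing across the center,
# Gaussian smoothing, and normalizing to sum to one -- shown here on a plain
# NumPy array with scipy.ndimage. Array shapes and values are made up.
import numpy as np
from scipy.ndimage import gaussian_filter

rng = np.random.default_rng(1)
mat = rng.random((20, 41))            # rows: insert sizes, cols: positions about a center

# symmetrize: average the matrix with its left-right mirror image
mat = 0.5 * (mat + mat[:, ::-1])

# smooth with a small Gaussian kernel
mat = gaussian_filter(mat, sigma=1.0)

# normalize so that the matrix sums to 1 (a relative "insertion density")
mat = mat / mat.sum()
print(mat.shape, mat.sum())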
def __init__(self, flank, lower, upper, bg, fasta, pwm, sizes, scale):
    self.flank = flank
    self.lower = lower
    self.upper = upper
    self.scale = scale
    self.bg = bg
    self.fasta = fasta
    if self.bg is None:
        self.pwm = PWM.open(pwm)
        self.chrs = read_chrom_sizes_from_fasta(fasta)
        self.fragmentsizes = FragmentSizes.open(sizes)
def modelNFR(self, boundaries=(35, 115)):
    """Model NFR distribution with gamma distribution"""
    b = np.where(
        self.fragmentsizes.get(self.lower, boundaries[1]) == max(
            self.fragmentsizes.get(self.lower, boundaries[1])))[0][0] + self.lower
    boundaries = (min(boundaries[0], b), boundaries[1])
    x = np.arange(boundaries[0], boundaries[1])
    y = self.fragmentsizes.get(boundaries[0], boundaries[1])

    def gamma_fit(X, o, p):
        k = p[0]
        theta = p[1]
        a = p[2]
        x_mod = X - o
        res = np.zeros(len(x_mod))
        if k >= 1:
            nz = x_mod >= 0
        else:
            nz = x_mod > 0
        res[nz] = a * x_mod[nz]**(k - 1) * np.exp(-x_mod[nz] / theta) / (theta**k * gamma(k))
        return res

    res_score = np.ones(boundaries[0] + 1) * float('inf')  # np.float was removed in NumPy >= 1.24
    res_param = [0 for i in range(boundaries[0] + 1)]
    pranges = ((0.01, 10), (0.01, 150), (0.01, 1))
    for i in range(15, boundaries[0] + 1):
        f = lambda p: np.sum((gamma_fit(x, i, p) - y)**2)
        tmpres = optimize.brute(f, pranges, full_output=True, finish=optimize.fmin)
        res_score[i] = tmpres[1]
        res_param[i] = tmpres[0]
    whichres = np.argmin(res_score)
    res = res_param[whichres]
    self.nfr_fit0 = FragmentSizes(self.lower, self.upper,
                                  vals=gamma_fit(np.arange(self.lower, self.upper),
                                                 whichres, res_param[whichres]))
    nfr = np.concatenate((self.fragmentsizes.get(self.lower, boundaries[1]),
                          self.nfr_fit0.get(boundaries[1], self.upper)))
    nfr[nfr == 0] = min(nfr[nfr != 0]) * 0.01
    self.nfr_fit = FragmentSizes(self.lower, self.upper, vals=nfr)
    nuc = np.concatenate((np.zeros(boundaries[1] - self.lower),
                          self.fragmentsizes.get(boundaries[1], self.upper) -
                          self.nfr_fit.get(boundaries[1], self.upper)))
    nuc[nuc <= 0] = min(min(nfr) * 0.1, min(nuc[nuc > 0]) * 0.001)
    self.nuc_fit = FragmentSizes(self.lower, self.upper, vals=nuc)
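# Illustrative sketch (assumptions, not NucleoATAC code): grid search plus
# local refinement for a shifted gamma density, mirroring the strategy in the
# gamma-based modelNFR above, on synthetic data. Parameter values are made up.
import numpy as np
from scipy import optimize
from scipy.special import gamma as gamma_func

x = np.arange(35, 115, dtype=float)

def shifted_gamma(x, offset, k, theta, a):
    # density is zero left of the offset; standard gamma shape elsewhere
    z = x - offset
    out = np.zeros_like(z)
    pos = z > 0
    out[pos] = a * z[pos]**(k - 1) * np.exp(-z[pos] / theta) / (theta**k * gamma_func(k))
    return out

y = shifted_gamma(x, 20.0, 2.0, 15.0, 1.0)  # synthetic "observed" curve

def sse(params, offset=20.0):
    return np.sum((shifted_gamma(x, offset, *params) - y)**2)

best, score, _, _ = optimize.brute(sse, ((0.5, 5), (1, 50), (0.1, 2)),
                                   full_output=True, finish=optimize.fmin)
print("best (k, theta, a):", best, "sse:", score)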
def get_sizes(args):
    """function to get fragment sizes"""
    if args.out is None:
        args.out = '.'.join(os.path.basename(args.bam).split('.')[0:-1])
    sizes = FragmentSizes(lower=args.lower, upper=args.upper, atac=args.atac)
    if args.bed:
        chunks = ChunkList.read(args.bed)
        chunks.merge()
        sizes.calculateSizes(args.bam, chunks)
    else:
        sizes.calculateSizes(args.bam)
    sizes.save(args.out + '.fragmentsizes.txt')
    if not args.no_plot:
        #make figure
        fig = plt.figure()
        plt.plot(list(range(sizes.lower, sizes.upper)),
                 sizes.get(sizes.lower, sizes.upper), label=args.out)
        plt.xlabel("Fragment Size")
        plt.ylabel("Frequency")
        fig.savefig(args.out + '.fragmentsizes.pdf')
        plt.close(fig)
def get_sizes(args):
    """function to get fragment sizes"""
    if args.out is None:
        args.out = '.'.join(os.path.basename(args.bam).split('.')[0:-1])
    sizes = FragmentSizes(lower=args.lower, upper=args.upper, atac=args.atac)
    if args.bed:
        chunks = ChunkList.read(args.bed)
        chunks.merge()
        sizes.calculateSizes(args.bam, chunks)
    else:
        sizes.calculateSizes(args.bam)
    sizes.save(args.out + '.fragmentsizes.txt')
    if not args.no_plot:
        #make figure
        fig = plt.figure()
        plt.plot(range(sizes.lower, sizes.upper),
                 sizes.get(sizes.lower, sizes.upper), label=args.out)
        plt.xlabel("Fragment Size")
        plt.ylabel("Frequency")
        fig.savefig(args.out + '.fragmentsizes.eps')
        plt.close(fig)
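# Illustrative sketch (assumptions, not FragmentSizes internals): one way to
# tabulate fragment sizes from a paired-end BAM with pysam, similar in spirit
# to what calculateSizes does in get_sizes above. 'example.bam' is a
# hypothetical, coordinate-sorted and indexed BAM.
import numpy as np
import pysam

lower, upper = 0, 1000
counts = np.zeros(upper - lower)

with pysam.AlignmentFile('example.bam', 'rb') as bam:
    for read in bam.fetch():
        # count each properly paired fragment once, from the leftmost mate
        if read.is_proper_pair and not read.is_reverse:
            size = abs(read.template_length)
            if lower <= size < upper:
                counts[size - lower] += 1

freqs = counts / max(counts.sum(), 1)   # relative frequencies, guard against empty BAM
print("most common fragment size:", lower + int(np.argmax(counts)))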
def run_occ(args):
    """run occupancy calling"""
    if args.fasta:
        chrs = read_chrom_sizes_from_fasta(args.fasta)
    else:
        chrs = read_chrom_sizes_from_bam(args.bam)
    pwm = PWM.open(args.pwm)
    chunks = ChunkList.read(args.bed, chromDict=chrs,
                            min_offset=args.flank + args.upper / 2 +
                            max(pwm.up, pwm.down) + args.nuc_sep / 2)
    chunks.slop(chrs, up=args.nuc_sep / 2, down=args.nuc_sep / 2)
    chunks.merge()
    maxQueueSize = args.cores * 10
    fragment_dist = FragmentMixDistribution(0, upper=args.upper)
    if args.sizes is not None:
        tmp = FragmentSizes.open(args.sizes)
        fragment_dist.fragmentsizes = FragmentSizes(0, args.upper, vals=tmp.get(0, args.upper))
    else:
        fragment_dist.getFragmentSizes(args.bam, chunks)
    fragment_dist.modelNFR()
    fragment_dist.plotFits(args.out + '.occ_fit.eps')
    fragment_dist.fragmentsizes.save(args.out + '.fragmentsizes.txt')
    params = OccupancyParameters(fragment_dist, args.upper, args.fasta, args.pwm,
                                 sep=args.nuc_sep, min_occ=args.min_occ, flank=args.flank,
                                 bam=args.bam, ci=args.confidence_interval, step=args.step)
    sets = chunks.split(items=args.cores * 5)
    pool1 = mp.Pool(processes=max(1, args.cores - 1))
    out_handle1 = open(args.out + '.occ.bedgraph', 'w')
    out_handle1.close()
    out_handle2 = open(args.out + '.occ.lower_bound.bedgraph', 'w')
    out_handle2.close()
    out_handle3 = open(args.out + '.occ.upper_bound.bedgraph', 'w')
    out_handle3.close()
    write_queue = mp.JoinableQueue(maxsize=maxQueueSize)
    write_process = mp.Process(target=_writeOcc, args=(write_queue, args.out))
    write_process.start()
    peaks_handle = open(args.out + '.occpeaks.bed', 'w')
    peaks_handle.close()
    peaks_queue = mp.JoinableQueue()
    peaks_process = mp.Process(target=_writePeaks, args=(peaks_queue, args.out))
    peaks_process.start()
    nuc_dist = np.zeros(args.upper)
    for j in sets:
        tmp = pool1.map(_occHelper, zip(j, itertools.repeat(params)))
        for result in tmp:
            nuc_dist += result[0]
            write_queue.put(result[1])
            peaks_queue.put(result[2])
    pool1.close()
    pool1.join()
    write_queue.put('STOP')
    peaks_queue.put('STOP')
    write_process.join()
    peaks_process.join()
    pysam.tabix_compress(args.out + '.occpeaks.bed', args.out + '.occpeaks.bed.gz', force=True)
    shell_command('rm ' + args.out + '.occpeaks.bed')
    pysam.tabix_index(args.out + '.occpeaks.bed.gz', preset="bed", force=True)
    for i in ('occ', 'occ.lower_bound', 'occ.upper_bound'):
        pysam.tabix_compress(args.out + '.' + i + '.bedgraph',
                             args.out + '.' + i + '.bedgraph.gz', force=True)
        shell_command('rm ' + args.out + '.' + i + '.bedgraph')
        pysam.tabix_index(args.out + '.' + i + '.bedgraph.gz', preset="bed", force=True)
    dist_out = FragmentSizes(0, args.upper, vals=nuc_dist)
    dist_out.save(args.out + '.nuc_dist.txt')
    print("Making figure")
    #make figure
    fig = plt.figure()
    plt.plot(range(0, args.upper), dist_out.get(0, args.upper), label="Nucleosome Distribution")
    plt.xlabel("Fragment Size")
    plt.ylabel("Frequency")
    fig.savefig(args.out + '.nuc_dist.eps')
    plt.close(fig)
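# Illustrative sketch (assumptions, not NucleoATAC code): the bgzip + tabix
# idiom used at the end of run_occ to make BED/bedgraph outputs queryable by
# region. The file name here is hypothetical.
import os
import pysam

bed = 'demo.bed'
with open(bed, 'w') as handle:
    handle.write('chr1\t100\t200\tfeature1\n')
    handle.write('chr1\t300\t400\tfeature2\n')

pysam.tabix_compress(bed, bed + '.gz', force=True)        # bgzip-compress the text file
os.remove(bed)                                            # plain-text copy no longer needed
pysam.tabix_index(bed + '.gz', preset='bed', force=True)  # writes demo.bed.gz.tbi

# the indexed file can now be queried by region
tbx = pysam.TabixFile(bed + '.gz')
for row in tbx.fetch('chr1', 150, 350):
    print(row)
tbx.close()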
def run_nuc(args):
    """run nucleosome position calling"""
    vmat = VMat.open(args.vmat)
    if args.fasta:
        chrs = read_chrom_sizes_from_fasta(args.fasta)
    else:
        chrs = read_chrom_sizes_from_bam(args.bam)
    pwm = PWM.open(args.pwm)
    chunks = ChunkList.read(args.bed, chromDict=chrs,
                            min_offset=vmat.mat.shape[1] + vmat.upper // 2 +
                            max(pwm.up, pwm.down) + args.nuc_sep // 2,
                            min_length=args.nuc_sep * 2)
    chunks.slop(chrs, up=args.nuc_sep // 2, down=args.nuc_sep // 2)
    chunks.merge()
    maxQueueSize = args.cores * 10
    if args.sizes is not None:
        fragment_dist = FragmentSizes.open(args.sizes)
    else:
        fragment_dist = FragmentSizes(0, upper=vmat.upper)
        fragment_dist.calculateSizes(args.bam, chunks)
    params = NucParameters(vmat=vmat, fragmentsizes=fragment_dist, bam=args.bam,
                           fasta=args.fasta, pwm=args.pwm,
                           occ_track=args.occ_track,
                           sd=args.sd, nonredundant_sep=args.nuc_sep,
                           redundant_sep=args.redundant_sep,
                           min_z=args.min_z, min_lr=args.min_lr, atac=args.atac)
    sets = chunks.split(items=args.cores * 5)
    pool1 = mp.Pool(processes=max(1, args.cores - 1))
    if args.write_all:
        outputs = ['nucpos', 'nucpos.redundant', 'nucleoatac_signal',
                   'nucleoatac_signal.smooth', 'nucleoatac_background',
                   'nucleoatac_raw']
    else:
        outputs = ['nucpos', 'nucpos.redundant', 'nucleoatac_signal',
                   'nucleoatac_signal.smooth']
    handles = {}
    write_queues = {}
    write_processes = {}
    for i in outputs:
        if i not in ['nucpos', 'nucpos.redundant', 'nfrpos']:
            handles[i] = open(args.out + '.' + i + '.bedgraph', 'w')
        else:
            handles[i] = open(args.out + '.' + i + '.bed', 'w')
        handles[i].close()
        write_queues[i] = mp.JoinableQueue(maxsize=maxQueueSize)
        write_processes[i] = mp.Process(target=_writeFuncs[i], args=(write_queues[i], args.out))
        write_processes[i].start()
    for j in sets:
        tmp = pool1.map(_nucHelper, list(zip(j, itertools.repeat(params))))
        for result in tmp:
            for i in outputs:
                write_queues[i].put(result[i])
    pool1.close()
    pool1.join()
    for i in outputs:
        write_queues[i].put('STOP')
    for i in outputs:
        write_processes[i].join()
        if i not in ['nucpos', 'nucpos.redundant']:
            pysam.tabix_compress(args.out + '.' + i + '.bedgraph',
                                 args.out + '.' + i + '.bedgraph.gz', force=True)
            shell_command('rm ' + args.out + '.' + i + '.bedgraph')
            pysam.tabix_index(args.out + '.' + i + '.bedgraph.gz', preset="bed", force=True)
        else:
            pysam.tabix_compress(args.out + '.' + i + '.bed',
                                 args.out + '.' + i + '.bed.gz', force=True)
            shell_command('rm ' + args.out + '.' + i + '.bed')
            pysam.tabix_index(args.out + '.' + i + '.bed.gz', preset="bed", force=True)
def run_nuc(args):
    """run nucleosome position calling"""
    vmat = VMat.open(args.vmat)
    if args.fasta:
        chrs = read_chrom_sizes_from_fasta(args.fasta)
    else:
        chrs = read_chrom_sizes_from_bam(args.bam)
    pwm = PWM.open(args.pwm)
    chunks = ChunkList.read(args.bed, chromDict=chrs,
                            min_offset=vmat.mat.shape[1] + vmat.upper / 2 +
                            max(pwm.up, pwm.down) + args.nuc_sep / 2,
                            min_length=args.nuc_sep * 2)
    chunks.slop(chrs, up=args.nuc_sep / 2, down=args.nuc_sep / 2)
    chunks.merge()
    maxQueueSize = args.cores * 10
    if args.sizes is not None:
        fragment_dist = FragmentSizes.open(args.sizes)
    else:
        fragment_dist = FragmentSizes(0, upper=vmat.upper)
        fragment_dist.calculateSizes(args.bam, chunks)
    params = NucParameters(vmat=vmat, fragmentsizes=fragment_dist, bam=args.bam,
                           fasta=args.fasta, pwm=args.pwm,
                           occ_track=args.occ_track,
                           sd=args.sd, nonredundant_sep=args.nuc_sep,
                           redundant_sep=args.redundant_sep,
                           min_z=args.min_z, min_lr=args.min_lr, atac=args.atac)
    sets = chunks.split(items=args.cores * 5)
    pool1 = mp.Pool(processes=max(1, args.cores - 1))
    if args.write_all:
        outputs = ['nucpos', 'nucpos.redundant', 'nucleoatac_signal',
                   'nucleoatac_signal.smooth', 'nucleoatac_background',
                   'nucleoatac_raw']
    else:
        outputs = ['nucpos', 'nucpos.redundant', 'nucleoatac_signal',
                   'nucleoatac_signal.smooth']
    handles = {}
    write_queues = {}
    write_processes = {}
    for i in outputs:
        if i not in ['nucpos', 'nucpos.redundant', 'nfrpos']:
            handles[i] = open(args.out + '.' + i + '.bedgraph', 'w')
        else:
            handles[i] = open(args.out + '.' + i + '.bed', 'w')
        handles[i].close()
        write_queues[i] = mp.JoinableQueue(maxsize=maxQueueSize)
        write_processes[i] = mp.Process(target=_writeFuncs[i], args=(write_queues[i], args.out))
        write_processes[i].start()
    for j in sets:
        tmp = pool1.map(_nucHelper, zip(j, itertools.repeat(params)))
        for result in tmp:
            for i in outputs:
                write_queues[i].put(result[i])
    pool1.close()
    pool1.join()
    for i in outputs:
        write_queues[i].put('STOP')
    for i in outputs:
        write_processes[i].join()
        if i not in ['nucpos', 'nucpos.redundant']:
            pysam.tabix_compress(args.out + '.' + i + '.bedgraph',
                                 args.out + '.' + i + '.bedgraph.gz', force=True)
            shell_command('rm ' + args.out + '.' + i + '.bedgraph')
            pysam.tabix_index(args.out + '.' + i + '.bedgraph.gz', preset="bed", force=True)
        else:
            pysam.tabix_compress(args.out + '.' + i + '.bed',
                                 args.out + '.' + i + '.bed.gz', force=True)
            shell_command('rm ' + args.out + '.' + i + '.bed')
            pysam.tabix_index(args.out + '.' + i + '.bed.gz', preset="bed", force=True)
class FragmentMixDistribution:
    """Class for modelling insert size distribution"""
    def __init__(self, lower=0, upper=2000):
        self.lower = lower
        self.upper = upper

    def getFragmentSizes(self, bamfile, chunklist=None):
        self.fragmentsizes = FragmentSizes(self.lower, self.upper)
        self.fragmentsizes.calculateSizes(bamfile, chunks=chunklist)

    def modelNFR(self, boundary=115):
        """Model NFR distribution with exponential distribution"""
        b = np.where(
            self.fragmentsizes.get(self.lower, boundary) == max(
                self.fragmentsizes.get(self.lower, boundary)))[0][0] + 10 + self.lower

        def exp_pdf(x, *p):  #defines the PDF
            k = p[0]
            a = p[1]
            x = x - b
            return a * k * np.exp(-k * x)

        x = np.array(range(b, boundary))
        p0 = (.1, 1)
        coeff, var_matrix = optimize.curve_fit(exp_pdf, x,
                                               self.fragmentsizes.get(b, boundary),
                                               p0=p0)
        nfr = np.concatenate((self.fragmentsizes.get(self.lower, boundary),
                              exp_pdf(np.array(range(boundary, self.upper)), *coeff)))
        nfr[nfr == 0] = min(nfr[nfr != 0]) * 0.01
        self.nfr_fit = FragmentSizes(self.lower, self.upper, vals=nfr)
        nuc = np.concatenate((np.zeros(boundary - self.lower),
                              self.fragmentsizes.get(boundary, self.upper) -
                              self.nfr_fit.get(boundary, self.upper)))
        nuc[nuc <= 0] = min(min(nfr) * 0.1, min(nuc[nuc > 0]) * 0.001)
        self.nuc_fit = FragmentSizes(self.lower, self.upper, vals=nuc)

    def plotFits(self, filename=None):
        """plot the Fits"""
        fig = plt.figure()
        plt.plot(range(self.lower, self.upper), self.fragmentsizes.get(), label="Observed")
        #plt.plot(range(self.lower,self.upper),self.smoothed.get(), label = "Smoothed")
        plt.plot(range(self.lower, self.upper), self.nuc_fit.get(), label="Nucleosome Fit")
        plt.plot(range(self.lower, self.upper), self.nfr_fit.get(), label="NFR Fit")
        plt.legend()
        plt.xlabel("Fragment size")
        plt.ylabel("Relative Frequency")
        if filename:
            fig.savefig(filename)
            plt.close(fig)
            #Also save text output!
            filename2 = ".".join(filename.split(".")[:-1] + ['txt'])
            out = np.vstack((self.fragmentsizes.get(),
                             #self.smoothed.get(),
                             self.nuc_fit.get(),
                             self.nfr_fit.get()))
            np.savetxt(filename2, out, delimiter="\t")
        else:
            fig.show()
class FragmentMixDistribution:
    """Class for modelling insert size distribution"""
    def __init__(self, lower=0, upper=2000):
        self.lower = lower
        self.upper = upper

    def getFragmentSizes(self, bamfile, chunklist=None):
        self.fragmentsizes = FragmentSizes(self.lower, self.upper)
        self.fragmentsizes.calculateSizes(bamfile, chunks=chunklist)

    def modelNFR(self, boundaries=(35, 115)):
        """Model NFR distribution with gamma distribution"""
        b = np.where(
            self.fragmentsizes.get(self.lower, boundaries[1]) == max(
                self.fragmentsizes.get(self.lower, boundaries[1])))[0][0] + self.lower
        boundaries = (min(boundaries[0], b), boundaries[1])
        x = np.arange(boundaries[0], boundaries[1])
        y = self.fragmentsizes.get(boundaries[0], boundaries[1])

        def gamma_fit(X, o, p):
            k = p[0]
            theta = p[1]
            a = p[2]
            x_mod = X - o
            res = np.zeros(len(x_mod))
            if k >= 1:
                nz = x_mod >= 0
            else:
                nz = x_mod > 0
            res[nz] = a * x_mod[nz]**(k - 1) * np.exp(-x_mod[nz] / theta) / (theta**k * gamma(k))
            return res

        res_score = np.ones(boundaries[0] + 1) * float('inf')  # np.float was removed in NumPy >= 1.24
        res_param = [0 for i in range(boundaries[0] + 1)]
        pranges = ((0.01, 10), (0.01, 150), (0.01, 1))
        for i in range(15, boundaries[0] + 1):
            f = lambda p: np.sum((gamma_fit(x, i, p) - y)**2)
            tmpres = optimize.brute(f, pranges, full_output=True, finish=optimize.fmin)
            res_score[i] = tmpres[1]
            res_param[i] = tmpres[0]
        whichres = np.argmin(res_score)
        res = res_param[whichres]
        self.nfr_fit0 = FragmentSizes(self.lower, self.upper,
                                      vals=gamma_fit(np.arange(self.lower, self.upper),
                                                     whichres, res_param[whichres]))
        nfr = np.concatenate((self.fragmentsizes.get(self.lower, boundaries[1]),
                              self.nfr_fit0.get(boundaries[1], self.upper)))
        nfr[nfr == 0] = min(nfr[nfr != 0]) * 0.01
        self.nfr_fit = FragmentSizes(self.lower, self.upper, vals=nfr)
        nuc = np.concatenate((np.zeros(boundaries[1] - self.lower),
                              self.fragmentsizes.get(boundaries[1], self.upper) -
                              self.nfr_fit.get(boundaries[1], self.upper)))
        nuc[nuc <= 0] = min(min(nfr) * 0.1, min(nuc[nuc > 0]) * 0.001)
        self.nuc_fit = FragmentSizes(self.lower, self.upper, vals=nuc)

    def plotFits(self, filename=None):
        """plot the Fits"""
        fig = plt.figure()
        plt.plot(range(self.lower, self.upper), self.fragmentsizes.get(), label="Observed")
        plt.plot(range(self.lower, self.upper), self.nfr_fit0.get(), label="NFR Fit")
        plt.plot(range(self.lower, self.upper), self.nuc_fit.get(), label="Nucleosome Model")
        plt.plot(range(self.lower, self.upper), self.nfr_fit.get(), label="NFR Model")
        plt.legend()
        plt.xlabel("Fragment size")
        plt.ylabel("Relative Frequency")
        if filename:
            fig.savefig(filename)
            plt.close(fig)
            #Also save text output!
            filename2 = ".".join(filename.split(".")[:-1] + ['txt'])
            out = np.vstack((self.fragmentsizes.get(),
                             #self.smoothed.get(),
                             self.nuc_fit.get(),
                             self.nfr_fit.get()))
            np.savetxt(filename2, out, delimiter="\t")
        else:
            fig.show()
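# Illustrative sketch (assumptions, not NucleoATAC code): the decomposition
# idea behind modelNFR -- keep the observed counts below the boundary as the
# "NFR" component, extend it with the fitted decay above the boundary, and
# take what is left over as the nucleosomal component, flooring values at a
# small positive number. All distributions here are synthetic.
import numpy as np

lower, upper, boundary = 0, 400, 115
sizes = np.arange(lower, upper)

# synthetic observed distribution: exponential NFR part + a nucleosomal bump near 200 bp
observed = 0.02 * np.exp(-0.02 * sizes) + 0.005 * np.exp(-0.5 * ((sizes - 200) / 25.0)**2)

# pretend this is the fitted NFR decay evaluated above the boundary
nfr_fit_tail = 0.02 * np.exp(-0.02 * sizes[boundary:])

nfr = np.concatenate((observed[:boundary], nfr_fit_tail))
nfr[nfr == 0] = nfr[nfr != 0].min() * 0.01          # avoid exact zeros

nuc = np.concatenate((np.zeros(boundary - lower), observed[boundary:] - nfr[boundary:]))
nuc[nuc <= 0] = min(nfr.min() * 0.1, nuc[nuc > 0].min() * 0.001)  # positive floor

print("NFR mass:", nfr.sum(), "nucleosome mass:", nuc.sum())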
def getFragmentSizes(self, bamfile, chunklist=None):
    self.fragmentsizes = FragmentSizes(self.lower, self.upper)
    self.fragmentsizes.calculateSizes(bamfile, chunks=chunklist)