Exemplo n.º 1
0
def launch_coveragebed(bamfilenames, bedfilename, legend, outdir, executiongranted):
	"""********************************************************************************
	Task: starts one child process per bam file, each one running the bam's myCoverageBed method.
		The processes are started but not joined here.
	Inputs:
		bamfilenames: list of strings with the paths of the bam files to process.
		bedfilename: string forwarded to myCoverageBed as its first argument.
		legend: list of strings, one per bam. legend[i], with '.bam' replaced by '.bed', names the
			file under <outdir>/data/ that is forwarded to myCoverageBed as its last argument.
		outdir: string with the output directory.
		executiongranted: object forwarded untouched to myCoverageBed.
	Output: a two-element list [processes, coveragefiles] with the started multiprocessing.Process
		objects and, in the same order, the coverage file names built under TMP.
	********************************************************************************"""

	# The pid is embedded in every coverage file name
	suffix = '.'+str(os.getpid())+'.coverage'

	started = []
	outputs = []
	for idx,path in enumerate(bamfilenames):
		target_file = TMP+'/'+os.path.basename(path).replace('.bam',suffix)
		bedgraph = outdir+'/data/'+legend[idx].replace('.bam', '.bed')

		print('Coveragefile = '+target_file)
		handle = bam_file.bam_file(path, 'rb')

		print('Launching coverageBed...')
		worker = multiprocessing.Process(target=handle.myCoverageBed,
										 args=(bedfilename, None, target_file, executiongranted, TMP, bedgraph,))
		worker.start()
		print('	Done.')

		outputs.append(target_file)
		started.append(worker)

	return [started,outputs]
Exemplo n.º 2
0
	def intersectbam(self, bam):
		"""********************************************************************************
		Task: runs intersectBed (-abam <bam.filename> -b <self.filename>) through self.run, redirecting
			its output to TMP/<pid>.intersect.bam, and indexes the resulting file with pysam.
		Inputs:
			bam: Bam_file type. Only its filename attribute is read here.
		Output: a bam_file object opened in "rb" mode on the newly created bam.
		********************************************************************************"""
		# The pid of the current process names the output file
		outpath = ''.join([TMP, '/', str(os.getpid()), '.intersect.bam'])

		command = ''.join([BEDTOOLS, "intersectBed -abam ", bam.filename, " -b ", self.filename, " > ", outpath])
		self.run(command)

		pysam.index(outpath)
		return bam_file.bam_file(outpath,"rb")
Exemplo n.º 3
0
def launch_onoff_reads(bamfilenames, bedfilename, legend, outdir, executiongranted):
	"""********************************************************************************
	Task: starts a single child process running reads_on_target on the first bam, handing it the
		remaining bams and a set of shared-memory objects. The process is not joined here.
	Inputs:
		bamfilenames: list of strings with the paths of the bam files. The first one is opened in 'rb'
			mode and owns the call; the rest are opened and passed along as a list.
		bedfilename, outdir, legend, executiongranted: forwarded untouched to reads_on_target.
	Output: the tuple (process, onoff_status, onduplicates, offduplicates, duplicates_status,
		enrichment, percontarget). Both statuses are multiprocessing.Value('b') objects initialised to
		False; the other four are multiprocessing.Array('f') objects with one slot per bam.
	********************************************************************************"""

	nbams = len(bamfilenames)

	# Shared objects handed to the child process
	onoff_status = multiprocessing.Value('b', False)
	duplicates_status = multiprocessing.Value('b', False)
	enrichment = multiprocessing.Array('f', nbams)
	percontarget = multiprocessing.Array('f', nbams)
	onduplicates = multiprocessing.Array('f', nbams)
	offduplicates = multiprocessing.Array('f', nbams)

	firstbam = bam_file.bam_file(bamfilenames[0], 'rb')
	print('Launching on/off target enrichment calculation...')

	otherbams = [bam_file.bam_file(name) for name in bamfilenames[1:]]
	worker_args = (bedfilename, outdir, otherbams, legend, executiongranted, onoff_status, duplicates_status,
				   onduplicates, offduplicates, enrichment, percontarget, TMP, config.warnontarget,)
	worker = multiprocessing.Process(target=firstbam.reads_on_target, args=worker_args)
	worker.start()

	# The handle is closed in this process right after the child has been started
	firstbam.close()

	return worker,onoff_status,onduplicates,offduplicates,duplicates_status,enrichment,percontarget
def exon_coverage_std(groups, fileoutprefix, bedfilename, legend=None, normalize=True):
    """************************************************************************************************************************************************************
    Task: generates the distribution of coverage standard deviation across exons.
    Inputs:
        groups: list of sublists. Each sublist contains bam filenames of samples related somehow, e.g. samples sequenced in the same run.
        fileoutprefix: String with the directory where the two figures are saved.
        bedfilename: string containing the name of the bed with the regions to analyze.
        legend: list of strings describing each of the groups that will be processed. These descriptions form the legend of the histogram
            and the tick labels of the box plot. If None, no legend is drawn.
        normalize: {True, False} to indicate whether bam files should be normalized to the number of reads of the smallest bam.
    Output: two .png figures are generated: <fileoutprefix>/std_distribution.png with the distributions of coverage standard deviation
        across exons and <fileoutprefix>/std_boxplot.png with a box plot of the log10 of the strictly positive values.
    Raises: ValueError if groups does not contain any bam file.
    ************************************************************************************************************************************************************"""

    minsize = None
    minbamfilename = None
    bamgroups = []
    # Open every bam (indexing it first if needed) and look for the one with fewer reads
    for filelist in groups:
        bamlist = []
        for filename in filelist:
            # Check indexing of the bam file, needed for pysam use
            if(not os.path.isfile(filename+'.bai') and not os.path.isfile(filename.replace('.bam','.bai'))):
                print('WARNING: index not found for '+filename+'. Indexing...')
                pysam.index(filename)
                print('    Done.')

            bam = bam_file.bam_file(filename, 'rb')
            bamlist.append(bam)

            # Find the bam with the minimum number of reads. nreads() is called once per bam.
            nreads = bam.nreads()
            if(minsize is None or nreads < minsize):
                minsize = nreads
                minbamfilename = bam.filename

        bamgroups.append(bamlist)

    # Without bams there is nothing to normalize against nor to plot
    if(minsize is None):
        raise ValueError('No bam files were provided in groups.')

    print('The smaller bam is '+minbamfilename+' and contains '+str(minsize)+' reads.')

    fig = pyplot.figure(figsize=(13,6))
    ax = fig.add_subplot(111)
    boxplot = pyplot.figure()
    axb = boxplot.add_subplot(111)

    rects = []
    colours = ['#ff0000', '#00ff00', '#0000ff', '#cc0011', '#007722', '#110066']
    global_stdsampling = []
    bins = numpy.arange(0, 1, 0.007)

    # Process each group and draw the corresponding histogram in the graph
    for colouridx,filelist in enumerate(bamgroups):
        # An empty group is drawn as an empty distribution instead of reusing data from the previous group
        std_sampling = []

        # NOTE(review): std_sampling is overwritten for every bam, so only the last bam of each group
        # reaches the plots. Kept as in the original; confirm whether per-group aggregation was intended.
        for bam in filelist:
            print('    '+bam.filename)

            # Check whether normalization should be applied
            if(normalize): normalizedbam = bam.normalize(minsize)
            else: normalizedbam = bam

            std_sampling = normalizedbam.region_coverage_std(bedfilename)

        # Colours are reused cyclically when there are more groups than colours
        facecolour = colours[colouridx % len(colours)]
        rects.append(ax.hist(std_sampling, bins, alpha=0.5, facecolor=facecolour)[2])
        std_sampling = numpy.array(std_sampling)
        global_stdsampling.append(list(numpy.log10(std_sampling[(std_sampling>0)])))

    # Add titles and axis labels
    fig.suptitle('Distribution of coverage standard deviations (normalized) across exons', fontsize=14, fontweight='bold')
    ax.set_ylabel('Frequency')
    ax.set_xlabel('Normalized standard deviation')
    ax.set_xlim(0, 1)

    boxplot.suptitle('Distribution of coverage standard deviations (normalized) across exons', fontsize=14, fontweight='bold')
    axb.boxplot(global_stdsampling)

    # Check whether graph legend should be included or not
    if(legend is not None):
        axb.set_xticklabels(legend)

        # Shrink current axis by 20% to leave room for the legend
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])

        # Add graphic legend
        ax.legend( tuple([rect[0] for rect in rects]), tuple(legend), loc="upper left", bbox_to_anchor=(1,1) )

    fig.savefig(fileoutprefix+'/std_distribution.png')
    pyplot.close(fig)
    boxplot.savefig(fileoutprefix+'/std_boxplot.png')
    pyplot.close(boxplot)
    print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
def coverage_saturation_local(bamlist, targets, depthlist, coverage, legend, fileout, executiongranted=None, status=None, slopes=None, tmpdir=None, warnthreshold=1e-5):
    """************************************************************************************************************************************************************
    Task: calculates and draws coverage saturation plots for a list of samples. One process is launched for each sample and depth.
    Inputs:
        bamlist: list of strings with the names of the bams to process.
        targets: list of strings with the names of the beds containing the targets for each run.
        depthlist: list of numbers containing the run depths to test (millions of reads). It is not modified.
        coverage: coverage threshold forwarded to simulated_depth.simulated_depth.
        legend: list of descriptions describing each of the files that will be processed. These descriptions will form the legend of the plot
            and are used to build the job identifiers. If None, the bam base names are used in the job identifiers.
        fileout: String containing the name of the file where the plot will be saved.
        executiongranted: object forwarded untouched to simulated_depth.simulated_depth.
        status: optional object with a 'value' attribute. It is set to True only if every bam in bamlist got a true flag from
            draw_saturation_curve, and to False if no process could be launched.
        slopes: optional indexable container that receives the slopes returned by draw_saturation_curve.
        tmpdir: optional string with the directory for temporary files. If None, the module level TMP is used.
        warnthreshold: forwarded to draw_saturation_curve.
    Outputs: None. Temporary result files are removed after the plot has been drawn.
    ************************************************************************************************************************************************************"""

    # Resolve the temporary directory into a local name. Assigning to TMP here (without a global
    # declaration) would make TMP local and raise UnboundLocalError whenever tmpdir is None.
    if(tmpdir is not None):
        workdir = tmpdir
    else:
        workdir = TMP

    pid = str(os.getpid())
    simulated_depth_processes = []
    result_files = []

    # Work on a sorted copy so that the list owned by the caller is left untouched
    sorteddepths = sorted(depthlist)

    # Launches one process for each sample and depth for calculating % of covered positions
    for i,bam in enumerate(bamlist):

        # Check whether there is an index for current bam
        if(not os.path.isfile(bam+'.bai') and not os.path.isfile(bam.replace('.bam','.bai'))):
            print('WARNING: index not found for '+bam+'. Indexing...')
            pysam.index(bam)

        # Processes are launched for each bam and depth point. If provided depth values are greater than the number of reads in the bam file,
        # the first depth reaching the number of reads is the last one to be launched.
        nreads_bam = bam_file.bam_file(bam).nreads()
        if(len(sorteddepths)>1 and nreads_bam>=(sorteddepths[1]*1000000)):
            for depth in sorteddepths:
                # If a legend is provided, use it to differentiate job ids
                if(legend is not None):
                    jobid = 'coverage_'+pid+'_'+str(depth)+'_'+legend[i].lower()
                else:
                    jobid = 'coverage_'+pid+'_'+str(depth)+'_'+os.path.basename(bam)

                print("Submitting depth "+str(depth)+", file "+bam)

                newprocess = multiprocessing.Process(target=simulated_depth.simulated_depth, args=(bam,targets[i],depth,coverage,workdir+'/'+jobid,executiongranted,
                                                                                                   workdir,))
                simulated_depth_processes.append(newprocess)
                newprocess.start()

                result_files.append(workdir+'/'+jobid)

                # Following depth values are greater than the number of reads in the bam
                if((depth*1000000)>=nreads_bam):
                    break
        else:
            print('WARNING: the number of reads in '+str(bam)+' is '+str(nreads_bam))
            print('    The set of depths provided for coverage saturation calculus is 10e6*'+str(sorteddepths))
            print('    At least two depths equal or lower than the number of mapped reads are required.')

    if(len(simulated_depth_processes)>0):
        # Wait for all the processes to finish
        for process in simulated_depth_processes:
            process.join()
            process.terminate()

        print('Submitting draw saturation curve...')
        slope_status,tmpslopes = draw_saturation_curve.draw_saturation_curve(result_files,'% covered positions',fileout,legend,warnthreshold=warnthreshold)

        if(slopes is not None):
            for i,slope in enumerate(tmpslopes):
                slopes[i] = slope

        # Calculate status flag as an AND among the flags for each bam file: every bam in bamlist must have a true flag
        if(status is not None):
            status.value = (sum(slope_status)==len(bamlist))

        # Remove temporary files
        for afile in result_files: os.remove(afile)
    elif(status is not None):
        # Nothing was launched. status is optional, so it is only updated when provided
        status.value = False
Exemplo n.º 6
0
def ngscat(bamfilenames, originalbedfilename, outdir, reference=None, saturation=False, nthreads=2, extend=None, depthlist='auto', coveragethresholds=[1,5,10,20,30],
		   onefeature=None, tmpdir=None):
	"""********************************************************************************
	Task: main driver. Prepares the output and temporary directories, links/indexes/sorts the bam
		files, launches the requested analyses as child processes, waits for them and, when all
		features are run, generates the report.
	Inputs:
		bamfilenames: list of strings with the paths of the bam files.
		originalbedfilename: string with the path of the bed file with the target regions.
		outdir: string with the output directory. It is created, together with its 'data' and 'img'
			subdirectories, if missing.
		reference: forwarded to launch_gcbias. GC bias is run when a reference is given and all
			features are run, or when onefeature=='gcbias'.
		saturation: {True, False} whether coverage saturation is run when all features are run.
		nthreads: initial value of the semaphore handed to every launcher.
		extend: if not None, the bed is extended through bed_file.extendnoref(extend, ...) into a new
			file under TMP and that file is used instead of the original bed.
		depthlist: depths (millions of reads) for the saturation analysis. With 'auto', evenly spaced
			depths are derived from the largest number of reads among the bams.
		coveragethresholds: list forwarded to launch_covered_positions and to the report. The default
			list is never modified here.
		onefeature: None to run everything, or one of 'saturation', 'specificity', 'coveragefreq',
			'percbases', 'coveragedistr', 'coveragecorr', 'coveragestd', 'gcbias' to run just that one.
			The report is only generated when it is None.
		tmpdir: optional string with an existing directory that replaces the module level TMP.
	Output: None. Exits the interpreter with status 1 if tmpdir does not exist or the user declines
		to reuse an already existing file in the temporary directory.
	********************************************************************************"""

	global TMP

	if(tmpdir is not None):
		if(os.path.isdir(tmpdir) or os.path.islink(tmpdir)):
			TMP = tmpdir
		else:
			print('ERROR: temporary directory '+tmpdir+' does not exist.')
			print('	Exiting')
			sys.exit(1)

	if(not (os.path.isdir(outdir) or os.path.islink(outdir))):
		print('WARNING: '+outdir+' does not exist. Creating directory.')
		os.mkdir(outdir)

	if(not (os.path.isdir(outdir+'/data') or os.path.islink(outdir+'/data'))):
		print('Creating '+outdir+'/data')
		os.mkdir(outdir+'/data')

	if(not (os.path.isdir(outdir+'/img') or os.path.islink(outdir+'/img'))):
		print('Creating '+outdir+'/img')
		os.mkdir(outdir+'/img')

	# Bound up front: it names the extended bed below, which must work even when no bam needs sorting
	pid = str(os.getpid())

	sortedbams = []
	for bamfilename in bamfilenames:
		filelink = TMP+'/'+os.path.basename(bamfilename)
		try:
			os.symlink(bamfilename, filelink)
		except OSError:
			print('WARNING: when trying to create a symbolic link at the temporary directory pointing to '+bamfilename+', a file named '+filelink+' was already found.')
			print('	Probably the temporary and origin directories are the same. The only problem this could cause is that the new index overwrites an existing one.')
			print('	Continue (y/n)?')

			goahead = raw_input()
			if(goahead=='n' or goahead=='N'):
				print('Exiting...')
				sys.exit(1)
			elif(goahead!='y' and goahead!='Y'):
				print('Unknown choice '+goahead)
				print('Exiting...')
				sys.exit(1)

			# Only replace the existing file when the bam does not live in the temporary directory itself
			if(os.path.dirname(bamfilename)!=os.path.dirname(TMP+'/')):
				os.remove(filelink)
				os.symlink(bamfilename, filelink)

		print('Indexing...')
		pysam.index(filelink)
		print('	Done.')

		if(not bam_file.bam_file(filelink).issorted()):
			print('WARNING: '+bamfilename+' is not sorted')
			print('Sorting...')
			# A timestamp gives each sorted bam its own name
			newsortedbam = TMP+'/'+str(time.time())+'.sorted'
			sortedbams.append(newsortedbam+'.bam')
			pysam.sort(filelink, newsortedbam)
			print('Indexing...')
			pysam.index(sortedbams[-1])

			print('	Done.')
		else:
			sortedbams.append(filelink)

	# Which features are run. Evaluated once so that the launch and join sections cannot disagree.
	allfeatures = (onefeature is None)
	run_saturation = (saturation and allfeatures) or onefeature=='saturation'
	run_specificity = allfeatures or onefeature=='specificity'
	run_coveragefreq = allfeatures or onefeature=='coveragefreq'
	run_percbases = allfeatures or onefeature=='percbases'
	run_coveragedistr = allfeatures or onefeature=='coveragedistr'
	run_coveragestd = allfeatures or onefeature=='coveragestd'
	run_gcbias = (reference is not None and allfeatures) or onefeature=='gcbias'

	# The automatic depth list is needed whenever saturation is actually run, including onefeature=='saturation'
	if(run_saturation and depthlist=='auto'):
		maxdepth = max([bam_file.bam_file(bamfilename).nreads() for bamfilename in sortedbams])
		depthlist = numpy.arange(maxdepth/5.0, maxdepth+(maxdepth/5.0)-1, maxdepth/5.0)
		depthlist = depthlist/1000000.0

	legend = [os.path.basename(bamfilename) for bamfilename in bamfilenames]
	executiongranted = multiprocessing.Semaphore(nthreads)

	if(extend is not None):
		# Only the base name is used so that the extended bed is always created directly under TMP
		bedfilename = TMP+'/'+os.path.basename(originalbedfilename).replace('.bed','.'+pid+'.extended.bed')
		bed_file.bed_file(originalbedfilename).extendnoref(extend,bedfilename)
	else:
		bedfilename = originalbedfilename

	# coverageBed is always launched: its processes and coverage files are used by the code below
	# whatever the value of onefeature. (The original guard on onefeature was always true.)
	Pcoveragebeds,coveragefiles = launch_coveragebed(sortedbams, bedfilename, legend, outdir, executiongranted)

	run_coveragecorr = len(coveragefiles)>1 and (allfeatures or onefeature=='coveragecorr')

	if(run_saturation):
		Psaturation,coverage_saturation_status,saturationslopes = launch_coverage_saturation(sortedbams, bedfilename, depthlist, legend, outdir+'/data/', executiongranted)
	else:
		coverage_saturation_status = None
		saturationslopes = None

	if(run_specificity):
		Ponoff_reads,onoff_status,onduplicates,offduplicates,duplicates_status,enrichment,percontarget = launch_onoff_reads(sortedbams, bedfilename, legend, outdir+'/data/', executiongranted)

	# The analyses launched below are only started once every coverageBed process has finished
	for Pcoveragebed in Pcoveragebeds:
		Pcoveragebed.join()
		Pcoveragebed.terminate()

	if(run_specificity):
		Poffclusters = launch_offclusters(glob.glob(outdir+'/data/*.bed'), bedfilename, executiongranted)

	if(run_coveragefreq):
		Pcoveragedistribution,coveragedistribution_status,meancoverage = launch_coverage_distribution(coveragefiles, outdir+'/data/', legend, executiongranted)

	if(run_percbases):
		Pcoveredpositions,coveredpositions_status,coveredbases = launch_covered_positions(coveragefiles, coveragethresholds, outdir+'/data/', legend, executiongranted)

	if(run_coveragedistr):
		Pcoveragethroughtarget,throughtarget_status,lowcovbases = launch_coverage_through_target(coveragefiles, outdir+'/data/', legend, executiongranted)

	if(run_coveragecorr):
		Pcoveragecorr,coveragecorr_status,corr = launch_coveragecorr(coveragefiles, outdir+'/data/coveragecorr.png', legend, executiongranted)
	else:
		coveragecorr_status = None
		corr = None

	if(run_coveragestd):
		Pcoveragestd,coveragestd_status,coveragestd = launch_coverage_std(coveragefiles, outdir+'/data/', legend, executiongranted)

	if(run_gcbias):
		Pgcbias = []
		for i,coveragefile in enumerate(coveragefiles):
			onePgcbias,gcbias_status = launch_gcbias(coveragefile, bedfilename, reference, outdir+'/data/gcbias'+str(i)+'.png', legend[i], executiongranted)
			Pgcbias.append(onePgcbias)
		for onePgcbias in Pgcbias:
			onePgcbias.join()
			onePgcbias.terminate()
	else:
		gcbias_status = None

	# Wait for every launched analysis
	if(run_saturation):
		Psaturation.join()
		Psaturation.terminate()

	if(run_coveragefreq):
		Pcoveragedistribution.join()
		Pcoveragedistribution.terminate()

	if(run_percbases):
		Pcoveredpositions.join()
		Pcoveredpositions.terminate()

	if(run_coveragedistr):
		Pcoveragethroughtarget.join()
		Pcoveragethroughtarget.terminate()

	if(run_coveragecorr):
		Pcoveragecorr.join()
		Pcoveragecorr.terminate()

	if(run_coveragestd):
		Pcoveragestd.join()
		Pcoveragestd.terminate()

	if(run_specificity):
		Ponoff_reads.join()
		Ponoff_reads.terminate()

		Poffclusters.join()
		Poffclusters.terminate()

	# The report needs the results of every feature, so it is only built when all of them were run
	if(allfeatures):
		generate_report(bamfilenames,sortedbams,originalbedfilename,outdir,coveredpositions_status,coveredbases,coverage_saturation_status,saturationslopes,
						onoff_status,
						duplicates_status,onduplicates,offduplicates,coveragedistribution_status,meancoverage,
						coveragecorr_status,corr,throughtarget_status,lowcovbases,coveragestd_status,coveragestd,gcbias_status,enrichment,percontarget,
						reference,nthreads,depthlist,
						coveragethresholds)
Exemplo n.º 7
0
def _read_text_file(path):
	"""Returns the whole content of the text file <path>. The file is closed even if reading fails."""
	with open(path) as fd:
		return fd.read()


def _status_image(status):
	"""Returns the base name of the report icon for a status flag: 'ok' if status.value is true, 'warning' otherwise."""
	if(status.value):
		return 'ok'
	return 'warning'


def generate_report(bamfilenames,sortedbams,bedfilename,outdir,coveredpositions_status,coveredbases,coverage_saturation_status,saturationslopes,onoff_status,
					duplicates_status,onduplicates,offduplicates,coveragedistribution_status,meancoverage,
					coveragecorr_status,corr,throughtarget_status,lowcovbases,coveragestd_status,coveragestd,gcbias_status,enrichment,percontarget,
					reference,nthreads,
					depthlist,
					coveragethresholds):
	"""********************************************************************************
	Task: builds the html report. Copies icons and the style sheet into outdir, fills the
		DATASRC/captureQC.html template by replacing its placeholders and writes
		<outdir>/captureQC.html and <outdir>/data/summary.json.
	Inputs:
		bamfilenames: list of strings with the names of the bams as shown in the report.
		sortedbams: list of strings, parallel to bamfilenames, with the bams used to count reads.
		bedfilename: string with the path of the bed file.
		outdir: string with the output directory. Its 'data' and 'img' subdirectories must exist.
		coveredpositions_status, onoff_status, duplicates_status, coveragedistribution_status,
		throughtarget_status, coveragestd_status: objects with a 'value' attribute. A true value
			selects the 'ok' icon, otherwise 'warning'.
		coverage_saturation_status, coveragecorr_status: same kind of object, or None to leave
			the corresponding section out of the report.
		gcbias_status: None to leave the GC bias section out. Only compared against None here.
		coveredbases, saturationslopes, onduplicates, offduplicates, meancoverage, lowcovbases,
		coveragestd, enrichment, percontarget: sequences indexed by bam position.
		corr: object whose 'value' attribute is shown as the coverage correlation.
		reference, nthreads: values shown in the input parameters section.
		depthlist: sequence of depths shown in the saturation section.
		coveragethresholds: sequence of thresholds. The first one is shown in the summary header.
	Output: None.
	********************************************************************************"""

	for image in ('xls_icon.png', 'txt_icon.png', 'ok.jpg', 'warning.jpg', 'coverage_histogram_example.png'):
		shutil.copy(IMGSRC+'/'+image, outdir+'/img')

	shutil.copy(DATASRC+'/styles.css', outdir)

	hassaturation = (coverage_saturation_status is not None)
	hascorrelation = (coveragecorr_status is not None)

	# ********************************************************* Input parameters ******************************************************************
	if(hassaturation):
		saturationcurve = 'Yes'
	else:
		saturationcurve = 'No'

	# Replacements are applied one after another, so their order is kept as is
	reportcontent = _read_text_file(DATASRC+'/captureQC.html')
	reportcontent = reportcontent.replace('bamfilename', ', '.join(bamfilenames))
	reportcontent = reportcontent.replace('bedfilename', bedfilename)
	reportcontent = reportcontent.replace('reportdate', time.ctime())
	reportcontent = reportcontent.replace('reference', str(reference))
	reportcontent = reportcontent.replace('saturationcurve', saturationcurve)
	reportcontent = reportcontent.replace('nthreads', str(nthreads))
	reportcontent = reportcontent.replace('tmpdir', TMP)

	# ********************************************************* Result summary ******************************************************************

	# Reads are counted once per bam and reused by both the json summary and the html table
	nreads = [bam_file.bam_file(sortedbams[i]).nreads() for i in range(len(bamfilenames))]

	# NOTE(review): one object per bam is written back to back, with no separator and no escaping,
	# so the file is not a valid JSON document for more than one bam. The format is kept unchanged
	# because readers of summary.json are not visible from here.
	jsonentries = []
	for i,bam in enumerate(bamfilenames):
		jsonstr = '{"bamfile":"'+bam+'"'
		jsonstr += ',"nreads":'+str(nreads[i])
		jsonstr += ',"coveredbases":'+str(coveredbases[i])

		if(hassaturation):
			jsonstr += ',"saturationslope":'+str(saturationslopes[i])

		jsonstr += ',"percontarget":'+str(percontarget[i])
		jsonstr += ',"onduplicates":'+str(onduplicates[i])
		jsonstr += ',"offduplicates":'+str(offduplicates[i])
		jsonstr += ',"meancoverage":'+str(meancoverage[i])
		jsonstr += ',"lowcovbases":'+str(lowcovbases[i])

		# A NaN standard deviation is left out of the summary
		if(not math.isnan(coveragestd[i])):
			jsonstr += ',"coveragestd":'+str(coveragestd[i])
		jsonstr += '}'
		jsonentries.append(jsonstr)

	with open(outdir+'/data/summary.json', 'w') as fd:
		fd.write(''.join(jsonentries))

	summaryrows = ''
	for i,bam in enumerate(bamfilenames):
		summaryrows += '<tr>\n'
		summaryrows += '<td class="table-cell"> '+bam+'</td>'
		summaryrows += '<td class="table-cell"> '+str(nreads[i])+' </td>'
		summaryrows += '<td class="table-cell">%.1f'%(coveredbases[i])+'% </td>'

		if(hassaturation):
			summaryrows += '<td class="table-cell">%.1e</td>\n'%saturationslopes[i]

		summaryrows += '<td class="table-cell">%.1f'%(percontarget[i])+'% </td>\n'
		summaryrows += ('<td class="table-cell">ON-%.1f%%'%onduplicates[i])+'; OFF: %.1f'%(offduplicates[i])+'% </td>'
		summaryrows += '<td class="table-cell">%.1fx'%meancoverage[i]+'</td>\n'
		summaryrows += '<td class="table-cell">%d consecutive bases<br>with coverage <= <WARNCOVERAGETHRESHOLD></td>\n'%(lowcovbases[i])

		if(hascorrelation):
			summaryrows += '<td class="table-cell">%.2f</td>\n'%corr.value

		summaryrows += '<td class="table-cell">%.2f</td>\n'%coveragestd[i]
		summaryrows += '</tr>\n'

	# The status placeholders inserted here are resolved by the replacements further below
	summarystatus = '<td class="table-header">Overall status</td>\n'
	summarystatus += '<td class="table-header"></td>\n'
	summarystatus += '<td class="table-header"><a href="#targetbases"><img src="img/<TARGETBASESSTATUS>.jpg" height=23px /></a></td>\n'
	if(hassaturation):
		summarystatus += '<td class="table-header"><a href="#coveragesaturation"><img src="img/<COVERAGESATURATIONSTATUS>.jpg" height=23px /></a></td>\n'
	summarystatus += '<td class="table-header"><a href="#onoff"><img src="img/<ONOFFSTATUS>.jpg" height=23px /></a></td>\n'
	summarystatus += '<td class="table-header"><a href="#dup"><img src="img/<DUPSTATUS>.jpg" height=23px /></a></td>\n'
	summarystatus += '<td class="table-header"><a href="#distribution"><img src="img/<DISTRIBUTIONSTATUS>.jpg" height=23px /></a></td>\n'
	summarystatus += '<td class="table-header"><a href="#coveragethroughtarget"><img src="img/<COVERAGETHROUGHTARGETSTATUS>.jpg" height=23px /></a></td>\n'
	if(hascorrelation):
		summarystatus += '<td class="table-header"><a href="#coveragecorr"><img src="img/<COVERAGECORRSTATUS>.jpg" height=23px /></a></td>\n'
	summarystatus += '<td class="table-header"><a href="#coveragestd"><img src="img/<COVERAGESTDSTATUS>.jpg" height=23px /></a></td>\n'

	reportcontent = reportcontent.replace('<SUMMARYROWS>',summaryrows)
	reportcontent = reportcontent.replace('<SUMMARYSTATUS>',summarystatus)

	if(hassaturation):
		reportcontent = reportcontent.replace('<SUMMARYSATURATION>','<td class="table-header"><a href="#coveragesaturation">Coverage saturation<br>(slope at the end of the curve)</a></td>')
	else:
		reportcontent = reportcontent.replace('<SUMMARYSATURATION>','')

	if(hascorrelation):
		reportcontent = reportcontent.replace('<SUMMARYCOVCORRELATION>','<td class="table-header"><a href="#coveragecorr">Coverage correlation<br>per ROI</a></td>')
	else:
		reportcontent = reportcontent.replace('<SUMMARYCOVCORRELATION>','')

	reportcontent = reportcontent.replace('<SUMMARYCOVERAGETHRS>',str(coveragethresholds[0]))
	reportcontent = reportcontent.replace('<SUMMARYTARGETSIZE>',str(bed_file.bed_file(bedfilename).size()))

	# ********************************************************* Detailed results ******************************************************************
	chromosomeimages = ''
	ontarget_coverage_files = glob.glob(outdir+'/data/*_Ontarget_Coverage.png')
	ontarget_coverage_files.sort()
	for afile in ontarget_coverage_files:
		chromosomeimages += '<a href="data/'+os.path.basename(afile)+'"><img style="width: 33%; float: left;" src="data/'+os.path.basename(afile)+'" /></a>'
	reportcontent = reportcontent.replace('<CHROMOSOMEIMAGES>',chromosomeimages)

	reportcontent = reportcontent.replace('<TARGETBASESSTATUS>',_status_image(coveredpositions_status))
	reportcontent = reportcontent.replace('<WARNBASESCOVERED>',str(config.warnbasescovered))

	percentagestr = '\n<ul>'
	enrichmentstr = '\n<ul>'
	for i,bamfilename in enumerate(bamfilenames):
		percentagestr += '<li>'+bamfilename+': %.1f'%(percontarget[i])+'%</li>\n'
		enrichmentstr += '<li>'+bamfilename+': %.1f'%(enrichment[i])+'</li>\n'
	percentagestr += '</ul>'
	enrichmentstr += '</ul>'
	reportcontent = reportcontent.replace('<PERCENTAGEONTARGET>', percentagestr)
	reportcontent = reportcontent.replace('<ENRICHMENT>', enrichmentstr)

	reportcontent = reportcontent.replace('<WARNONTARGET>', str(config.warnontarget))
	reportcontent = reportcontent.replace('<ONOFFSTATUS>',_status_image(onoff_status))

	duplicates_files = glob.glob(outdir+'/data/duplicates*.png')
	duplicates_files.sort()
	dupimages = ''
	for afile in duplicates_files:
		# No anchor is opened around these images, so no closing </a> is emitted either
		dupimages += '<img style="width: 50%; float: left;" src="data/'+os.path.basename(afile)+'" />'
	reportcontent = reportcontent.replace('<DUPIMAGES>',dupimages)

	reportcontent = reportcontent.replace('<DUPSTATUS>',_status_image(duplicates_status))

	reportcontent = reportcontent.replace('<WARNMEANCOVERAGE>',str(config.warnmeancoverage))
	reportcontent = reportcontent.replace('<DISTRIBUTIONSTATUS>',_status_image(coveragedistribution_status))

	if(hascorrelation):
		coveragecorr_content = _read_text_file(DATASRC+'/coveragecorr_content.html')
		reportcontent = reportcontent.replace('<COVERAGECORRCONTENT>',coveragecorr_content)

		reportcontent = reportcontent.replace('<WARNCOVERAGECORRELATION>',str(config.warncoveragecorrelation))
		reportcontent = reportcontent.replace('<COVERAGECORRSTATUS>',_status_image(coveragecorr_status))
	else:
		reportcontent = reportcontent.replace('<COVERAGECORRCONTENT>','\n')

	reportcontent = reportcontent.replace('<WARNCOVERAGEREGION>',str(config.warncoverageregion))
	reportcontent = reportcontent.replace('<WARNCOVERAGETHRESHOLD>',str(config.warncoveragethreshold))
	reportcontent = reportcontent.replace('<COVERAGETHROUGHTARGETSTATUS>',_status_image(throughtarget_status))

	reportcontent = reportcontent.replace('<WARNSTD>',str(config.warnstd))
	reportcontent = reportcontent.replace('<COVERAGESTDSTATUS>',_status_image(coveragestd_status))

	if(hassaturation):
		saturation_content = _read_text_file(DATASRC+'/saturation_content.html')
		depthtext = 'x10<sup>6</sup>, '.join(map(str,depthlist[:-1]))+'x10<sup>6</sup> and '+str(depthlist[-1])+'x10<sup>6</sup>'
		reportcontent = reportcontent.replace('<SATURATIONCONTENT>',saturation_content)
		reportcontent = reportcontent.replace('<DEPTHLIST>',depthtext)
		reportcontent = reportcontent.replace('depthlist',str(depthlist)[1:-1])
		reportcontent = reportcontent.replace('<WARNSATURATION>',str(config.warnsaturation))
		reportcontent = reportcontent.replace('<COVERAGESATURATIONSTATUS>',_status_image(coverage_saturation_status))
	else:
		reportcontent = reportcontent.replace('<SATURATIONCONTENT>','\n').replace('depthlist','None')

	reportcontent = reportcontent.replace('coveragethrs', ', '.join(map(str, coveragethresholds)))

	if(gcbias_status is not None):
		gcbias_content = _read_text_file(DATASRC+'/gcbias_content.html')
		reportcontent = reportcontent.replace('<GCBIASCONTENT>',gcbias_content)

		gcbiasimages = ''
		for afile in glob.glob(outdir+'/data/gcbias*.png'):
			gcbiasimages += '<img style="width:40%" src="data/'+os.path.basename(afile)+'" />'
		reportcontent = reportcontent.replace('<GCBIASIMAGES>', gcbiasimages)

	else:
		reportcontent = reportcontent.replace('<GCBIASCONTENT>','\n')

	with open(outdir+'/captureQC.html', 'w') as fd:
		fd.write(reportcontent)

	print('Results written at '+outdir)
def coverage_distribution(bams,beds,dirout,labels,normalize):
    """************************************************************************************************************************************************************
    Task: computes the coverage distribution of a set of bam files over their bed (capture) files and stores plots and summary statistics.
    Inputs:
        bams: list of strings with the paths to the bam files. Missing indexes are created with pysam.
        beds: list of strings with the paths to the bed files, one per bam.
        dirout: string with the path to the directory where results will be stored. It is created if it does not exist.
        labels: list of strings with the labels to name each sample (bam) in the graphs and in the spreadsheet.
        normalize: {True,False} to indicate whether each bam is normalized to the number of reads of the smallest bam.
    Output: the plots produced by draw_histogram and draw_boxplot in <dirout>, and <dirout>/Coverage_stats.xls with quartiles,
        maximum, minimum and mean values per sample.
    ************************************************************************************************************************************************************"""

    # Make sure the output directory is there
    if(not os.path.isdir(dirout)):
        print('WARNING: directory '+dirout+' does not exist. Creating...')
        os.mkdir(dirout)

    # Open every bam, indexing it beforehand when no index file is found
    opened = []
    for path in bams:
        indexed = os.path.isfile(path+'.bai') or os.path.isfile(path.replace('.bam','.bai'))
        if(not indexed):
            print('WARNING: index not found for '+path+'. Indexing...')
            pysam.index(path)
            print('    Done.')

        opened.append(bam_file.bam_file(path, 'rb'))

    readcounts = numpy.array([handle.nreads() for handle in opened])
    fewest = readcounts.min()

    print('The smaller bam is '+opened[readcounts.argmin()].filename+' and contains '+str(fewest)+' reads.')

    # Collect the coverage distribution of each sample
    print('Counting covered bases...')
    collected = []
    for idx,handle in enumerate(opened):
        print('    '+handle.filename)

        # Normalization is optional
        if(normalize):
            source = handle.normalize(fewest)
        else:
            source = handle

        collected.append(source.get_coverage_distribution(beds[idx]))

    draw_histogram(collected, labels, dirout)
    draw_boxplot(collected, labels, dirout)

    # Spreadsheet with one header row and one row per sample
    book = xlwt.Workbook()
    sheet = book.add_sheet('Coverage distribution')
    bold = xlwt.easyxf('font: bold on')

    titles = ('Sample', 'Q1', 'Q2', 'Q3', 'Max. coverage', 'Min. coverage', 'Mean coverage')
    for column,title in enumerate(titles):
        sheet.write(0, column, title, bold)

    for idx,dist in enumerate(collected):
        # Compute the statistics of the current distribution
        values = numpy.array(dist)
        quartiles = [numpy.percentile(values, q) for q in (25, 50, 75)]
        highest = numpy.max(values)
        lowest = numpy.min(values)
        average = numpy.average(values)

        row = [labels[idx]] + quartiles + [highest, lowest, average]
        for column,cell in enumerate(row):
            sheet.write(idx+1, column, cell)

    book.save(dirout+'/Coverage_stats.xls')
Exemplo n.º 9
0
def simulated_depth(bam,
                    target,
                    depth,
                    coveragethreshold,
                    fileout,
                    executiongranted=None,
                    tmpdir=None):
    """************************************************************************************************************************************************************
    Task: calculates how many target positions reach a coverage threshold when a bam is limited to a given depth. The coverage itself (and the selection of
        reads) is delegated to bam_file.myCoverageBed.
    Inputs:
        bam: String containing the full path to the bam file. It is opened here through bam_file.bam_file(bam, 'rb').
        target: String containing the full path to the bed file.
        depth: Integer containing the run depth in number of reads (millions). It is multiplied by 10^6 before being passed to myCoverageBed.
        coveragethreshold: Minimum coverage value a position must reach (>=) to be counted as covered.
        fileout: String containing the name of the file where results will be stored.
        executiongranted: Optional object exposing acquire()/release() (presumably a multiprocessing semaphore -- confirm with the caller) used to limit the
            number of simultaneous executions. It is always released before returning, even if an exception is raised.
        tmpdir: Optional string. When provided it REPLACES the module-level TMP directory (global side effect) and is the directory passed to myCoverageBed.
    Output: generates a text file (fileout) with two lines: the bam basename and a tab separated line
        <min(reads in bam, depth*10^6)> <ncovered positions> <%covered positions>. The second line is written without a trailing newline.
    ************************************************************************************************************************************************************"""

    global TMP

    if tmpdir is not None:
        TMP = tmpdir

    if executiongranted is not None:
        executiongranted.acquire()

    # Everything done while holding the execution slot lives inside try/finally so that a failure
    # (unreadable bam, empty target, write error...) can not leave the slot acquired forever.
    try:
        bamobj = bam_file.bam_file(bam, 'rb')
        positions, coverage, chromosomes, processedbed = bamobj.myCoverageBed(
            target, depth * 1000000, tmpdir=TMP)

        print('Loading coverage...')

        npositions = 0
        ncovered_positions = 0
        for chrom in processedbed.chrs:
            # chromosomes[chrom] is used as the (first, last) index of this chromosome within the
            # parallel positions/coverage vectors.
            # NOTE(review): positions/coverage look like a run-length encoding (coverage[k] applies
            # from positions[k] up to positions[k+1]-1) -- confirm against myCoverageBed.
            positionsidx = chromosomes[chrom][0]
            for region in processedbed.chrs[chrom]:
                # Regions are treated as closed intervals [start, end]
                npositions += (region[1] - region[0] + 1)

                # Consume every entry whose successor still starts inside the current region
                while ((positionsidx + 1) <= chromosomes[chrom][1]
                       and positions[positionsidx + 1] <= region[1]):
                    if coverage[positionsidx] >= coveragethreshold:
                        ncovered_positions += (positions[positionsidx + 1] -
                                               positions[positionsidx])
                    positionsidx += 1

                # Last entry of the region: it extends up to the region end
                if coverage[positionsidx] >= coveragethreshold:
                    ncovered_positions += (region[1] - positions[positionsidx] + 1)

                # Move to the first entry of the next region
                positionsidx += 1

        print('Writing results at ' + fileout + ' ...')
        with open(fileout, 'w') as fd:
            fd.write(os.path.basename(bamobj.filename) + '\n')
            fd.write(
                str(min(bamobj.nreads(), depth * 1000000)) + '\t' +
                str(ncovered_positions) + '\t' +
                str(ncovered_positions * 100.0 / npositions))
        print('    Done.')
    finally:
        if executiongranted is not None:
            executiongranted.release()
Exemplo n.º 10
0
def gcbias(filelist, fileoutlist, bedfilelist):
    """************************************************************************************************************************************************************
    Task: draws coverage as a function of gc content
    Input:
        filelist: list of strings, each containing the full path of the bam file to analyze.
        fileoutlist: list of strings, each containing the full path of the file where the corresponding figure will be saved.
        bedfilelist: list of strings, each containing the full path of the bed file with the regions to analyze for the bam at the same index.
    Output: one figure per bam is saved at fileoutlist[i]. It shows a gaussian kernel density estimate of GC content (%) versus coverage per region.
    Notes:
        - 'normalize', TMP, BEDTOOLSPATH, REF, run and region_coverage are not parameters: they are read from the enclosing module.
          NOTE(review): confirm 'normalize' is actually defined at module level.
        - The per-position coverage is written to TMP/<pid>.coverage, which is overwritten for each bam and is not removed afterwards.
    ************************************************************************************************************************************************************"""

    pid = str(os.getpid())

    numpy.random.seed(1)
    bamlist = []

    # Open every bam, indexing it first if needed
    for filename in filelist:
        # Check whether index already exists for the bam file, needed for pysam use
        if not os.path.isfile(filename + '.bai'):
            print('Creating index for ' + filename)
            pysam.index(filename)
            print('\tDone.')

        bamlist.append(bam_file.bam_file(filename))

    sizes = numpy.array([bam.nreads() for bam in bamlist])
    minsize = sizes.min()

    print('The smaller bam is ' + filelist[sizes.argmin()] + ' and contains ' + str(minsize) + ' reads.')

    # Process each file and draw its figure
    for i, bamfile in enumerate(bamlist):

        print('Processing ' + bamfile.filename)
        print('Results will be written at ' + fileoutlist[i])

        # Check whether normalization should be run
        if normalize:
            normalizedbam = bamfile.normalize(minsize)
        else:
            normalizedbam = bamfile

        coveragefile = TMP + '/' + pid + '.coverage'
        print('Calculating coverage per position...')
        run(BEDTOOLSPATH + 'coverageBed -d -abam ' + normalizedbam.filename + ' -b ' + bedfilelist[i] + ' > ' + coveragefile)

        # region_coverage is expected to return a dict keyed by (chromosome, start, end), the same
        # keys that are built below for gccontent
        coverage = region_coverage(coveragefile)

        print('Calculating nt content...')
        bedfd = pybedtools.BedTool(bedfilelist[i])
        pybedtools._bedtools_installed = True
        pybedtools.set_bedtools_path(BEDTOOLSPATH)
        ntcontent = bedfd.nucleotide_content(REF)

        # Each entry in ntcontent is parsed to extract the gc content of each region. The GC fraction
        # is read from the 8th column counting from the end and converted to a percentage.
        gccontent = {}
        for entry in ntcontent:
            fields = entry.fields
            gccontent[(fields[0], int(fields[1]), int(fields[2]))] = float(fields[-8]) * 100
        print('\tDone.')

        region_ids = list(coverage.keys())
        coveragearray = numpy.array([coverage[region] for region in region_ids])
        gccontentarray = numpy.array([gccontent[region] for region in region_ids])  # Percentages

        xmin = gccontentarray.min()
        xmax = gccontentarray.max()
        ymin = coveragearray.min()
        ymax = coveragearray.max()

        # Perform a kernel density estimator on the results, evaluated on a 100x100 grid
        X, Y = mgrid[xmin:xmax:100j, ymin:ymax:100j]
        positions = c_[X.ravel(), Y.ravel()]
        values = c_[gccontentarray, coveragearray]
        kernel = stats.gaussian_kde(values.T)
        Z = reshape(kernel(positions.T).T, X.T.shape)

        # Only one figure is created per bam (an extra, never used nor closed, figure was created
        # here before). The extent matches the limits of the grid the density was evaluated on, so
        # the image is not stretched along the x axis.
        fig = pyplot.figure(figsize=(6, 6))
        ax = fig.add_subplot(111)
        sc = ax.imshow(rot90(Z), cmap=cm.gist_earth_r, extent=[xmin, xmax, ymin, ymax], aspect="auto")
        cbar = fig.colorbar(sc, ticks=[numpy.min(Z), numpy.max(Z)])
        cbar.ax.set_yticklabels(['Low', 'High'])
        cbar.set_label('Density')
        ax.set_xlabel('GC content (%)')
        ax.set_ylabel('Mean coverage')
        fig.savefig(fileoutlist[i])
        matplotlib.pyplot.close(fig)

    print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
Exemplo n.º 11
0
def gcbias(filelist, fileoutlist, bedfilelist):
    """************************************************************************************************************************************************************
    Task: draws coverage as a function of gc content
    Input:
        filelist: list of strings, each containing the full path of the bam file to analyze.
        fileoutlist: list of strings, each containing the full path of the file where the corresponding figure will be saved.
        bedfilelist: list of strings, each containing the full path of the bed file with the regions to analyze for the bam at the same index.
    Output: one figure per bam is saved at fileoutlist[i]. It shows a gaussian kernel density estimate of GC content (%) versus coverage per region.
    Notes:
        - 'normalize', TMP, BEDTOOLSPATH, REF, run and region_coverage are not parameters: they are read from the enclosing module.
          NOTE(review): confirm 'normalize' is actually defined at module level.
        - The per-position coverage is written to TMP/<pid>.coverage, which is overwritten for each bam and is not removed afterwards.
    ************************************************************************************************************************************************************"""

    pid = str(os.getpid())

    numpy.random.seed(1)
    bamlist = []

    # Open every bam, indexing it first if needed
    for filename in filelist:
        # Check whether index already exists for the bam file, needed for pysam use
        if not os.path.isfile(filename + '.bai'):
            print('Creating index for ' + filename)
            pysam.index(filename)
            print('\tDone.')

        bamlist.append(bam_file.bam_file(filename))

    sizes = numpy.array([bam.nreads() for bam in bamlist])
    minsize = sizes.min()

    print('The smaller bam is ' + filelist[sizes.argmin()] +
          ' and contains ' + str(minsize) + ' reads.')

    # Process each file and draw its figure
    for i, bamfile in enumerate(bamlist):

        print('Processing ' + bamfile.filename)
        print('Results will be written at ' + fileoutlist[i])

        # Check whether normalization should be run
        if normalize:
            normalizedbam = bamfile.normalize(minsize)
        else:
            normalizedbam = bamfile

        coveragefile = TMP + '/' + pid + '.coverage'
        print('Calculating coverage per position...')
        run(BEDTOOLSPATH + 'coverageBed -d -abam ' + normalizedbam.filename +
            ' -b ' + bedfilelist[i] + ' > ' + coveragefile)

        # region_coverage is expected to return a dict keyed by (chromosome, start, end), the same
        # keys that are built below for gccontent
        coverage = region_coverage(coveragefile)

        print('Calculating nt content...')
        bedfd = pybedtools.BedTool(bedfilelist[i])
        pybedtools._bedtools_installed = True
        pybedtools.set_bedtools_path(BEDTOOLSPATH)
        ntcontent = bedfd.nucleotide_content(REF)

        # Each entry in ntcontent is parsed to extract the gc content of each region. The GC fraction
        # is read from the 8th column counting from the end and converted to a percentage.
        gccontent = {}
        for entry in ntcontent:
            fields = entry.fields
            region_key = (fields[0], int(fields[1]), int(fields[2]))
            gccontent[region_key] = float(fields[-8]) * 100
        print('\tDone.')

        region_ids = list(coverage.keys())
        coveragearray = numpy.array([coverage[region] for region in region_ids])
        gccontentarray = numpy.array([gccontent[region]
                                      for region in region_ids])  # Percentages

        xmin = gccontentarray.min()
        xmax = gccontentarray.max()
        ymin = coveragearray.min()
        ymax = coveragearray.max()

        # Perform a kernel density estimator on the results, evaluated on a 100x100 grid
        X, Y = mgrid[xmin:xmax:100j, ymin:ymax:100j]
        positions = c_[X.ravel(), Y.ravel()]
        values = c_[gccontentarray, coveragearray]
        kernel = stats.gaussian_kde(values.T)
        Z = reshape(kernel(positions.T).T, X.T.shape)

        # Only one figure is created per bam (an extra, never used nor closed, figure was created
        # here before). The extent matches the limits of the grid the density was evaluated on, so
        # the image is not stretched along the x axis.
        fig = pyplot.figure(figsize=(6, 6))
        ax = fig.add_subplot(111)
        sc = ax.imshow(
            rot90(Z),
            cmap=cm.gist_earth_r,
            extent=[xmin, xmax, ymin, ymax],
            aspect="auto"
        )
        cbar = fig.colorbar(sc, ticks=[numpy.min(Z), numpy.max(Z)])
        cbar.ax.set_yticklabels(['Low', 'High'])
        cbar.set_label('Density')
        ax.set_xlabel('GC content (%)')
        ax.set_ylabel('Mean coverage')
        fig.savefig(fileoutlist[i])
        matplotlib.pyplot.close(fig)

    print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
def simulated_depth(bam, target, depth, coveragethreshold, fileout, executiongranted=None, tmpdir=None):
    """************************************************************************************************************************************************************
    Task: calculates how many target positions reach a coverage threshold when a bam is limited to a given depth. The coverage itself (and the selection of
        reads) is delegated to bam_file.myCoverageBed.
    Inputs:
        bam: String containing the full path to the bam file. It is opened here through bam_file.bam_file(bam, 'rb').
        target: String containing the full path to the bed file.
        depth: Integer containing the run depth in number of reads (millions). It is multiplied by 10^6 before being passed to myCoverageBed.
        coveragethreshold: Minimum coverage value a position must reach (>=) to be counted as covered.
        fileout: String containing the name of the file where results will be stored.
        executiongranted: Optional object exposing acquire()/release() (presumably a multiprocessing semaphore -- confirm with the caller) used to limit the
            number of simultaneous executions. It is always released before returning, even if an exception is raised.
        tmpdir: Optional string. When provided it REPLACES the module-level TMP directory (global side effect) and is the directory passed to myCoverageBed.
    Output: generates a text file (fileout) with two lines: the bam basename and a tab separated line
        <min(reads in bam, depth*10^6)> <ncovered positions> <%covered positions>. The second line is written without a trailing newline.
    ************************************************************************************************************************************************************"""

    global TMP

    if tmpdir is not None:
        TMP = tmpdir

    if executiongranted is not None:
        executiongranted.acquire()

    # Everything done while holding the execution slot lives inside try/finally so that a failure
    # (unreadable bam, empty target, write error...) can not leave the slot acquired forever.
    try:
        bamobj = bam_file.bam_file(bam, 'rb')
        positions, coverage, chromosomes, processedbed = bamobj.myCoverageBed(target, depth*1000000, tmpdir=TMP)

        print('Loading coverage...')

        npositions = 0
        ncovered_positions = 0
        for chrom in processedbed.chrs:
            # chromosomes[chrom] is used as the (first, last) index of this chromosome within the
            # parallel positions/coverage vectors.
            # NOTE(review): positions/coverage look like a run-length encoding (coverage[k] applies
            # from positions[k] up to positions[k+1]-1) -- confirm against myCoverageBed.
            positionsidx = chromosomes[chrom][0]
            lastidx = chromosomes[chrom][1]
            for region in processedbed.chrs[chrom]:
                # Regions are treated as closed intervals [start, end]
                npositions += (region[1]-region[0]+1)

                # Consume every entry whose successor still starts inside the current region
                while (positionsidx+1) <= lastidx and positions[positionsidx+1] <= region[1]:
                    if coverage[positionsidx] >= coveragethreshold:
                        ncovered_positions += (positions[positionsidx+1]-positions[positionsidx])
                    positionsidx += 1

                # Last entry of the region: it extends up to the region end
                if coverage[positionsidx] >= coveragethreshold:
                    ncovered_positions += (region[1]-positions[positionsidx]+1)

                # Move to the first entry of the next region
                positionsidx += 1

        print('Writing results at '+fileout+' ...')
        with open(fileout, 'w') as fd:
            fd.write(os.path.basename(bamobj.filename)+'\n')
            fd.write(str(min(bamobj.nreads(), depth*1000000))+'\t'+str(ncovered_positions)+'\t'+str(ncovered_positions*100.0/npositions))
        print('    Done.')
    finally:
        if executiongranted is not None:
            executiongranted.release()
Exemplo n.º 13
0
def coverage_saturation_local(bamlist,
                              targets,
                              depthlist,
                              coverage,
                              legend,
                              fileout,
                              executiongranted=None,
                              status=None,
                              slopes=None,
                              tmpdir=None,
                              warnthreshold=1e-5):
    """************************************************************************************************************************************************************
    Task: calculates and draws coverage saturation plots for a list of samples. One process (multiprocessing.Process) is launched per sample and depth.
    Inputs:
        bamlist: list of strings with the names of the bams to process.
        targets: list of strings with the names of the beds containing the targets for each run.
        depthlist: list of integers containing the run depths to test (millions of reads). It is not modified; a sorted copy is used.
        coverage: coverage threshold forwarded to simulated_depth to decide whether a position is covered.
        legend: list of descriptions describing each of the files that will be processed, or None. These descriptions will form the legend of the plot and
            are also used to build the names of the temporary result files.
        fileout: String containing the name of the file where the plot will be saved.
        executiongranted: optional object forwarded to each simulated_depth process to limit simultaneous executions.
        status: optional object with a 'value' attribute. It is set to True only when the flag returned by draw_saturation_curve is set for every bam in
            bamlist, and to False when no process could be launched.
        slopes: optional indexable object where the slopes returned by draw_saturation_curve are copied.
        tmpdir: optional string with the directory for temporary files. When None, the module-level TMP is used.
        warnthreshold: value forwarded to draw_saturation_curve.
    Outputs: the saturation plot is generated by draw_saturation_curve; 'status' and 'slopes' are updated in place when provided.
    ************************************************************************************************************************************************************"""

    # The temporary directory is resolved into a local name. Assigning to TMP inside this function
    # (without declaring it global) would turn TMP into a local variable and any call with
    # tmpdir=None would fail with UnboundLocalError as soon as TMP is read.
    if tmpdir is not None:
        workdir = tmpdir
    else:
        workdir = TMP

    pid = str(os.getpid())
    simulated_depth_processes = []
    result_files = []

    # Work on a sorted copy so the list owned by the caller is left untouched
    sorteddepths = sorted(depthlist)

    # Launches one process for each sample and depth for calculating % of covered positions
    for i, bam in enumerate(bamlist):

        # Check whether there is an index for current bam
        if (not os.path.isfile(bam + '.bai')
                and not os.path.isfile(bam.replace('.bam', '.bai'))):
            print('WARNING: index not found for ' + bam + '. Indexing...')
            pysam.index(bam)

        # Processes are launched for each bam and depth point. If provided depth values are greater than the number of reads in the bam file, the
        # maximum depth value to be used will be the first one that reaches the number of reads in the bam and no more processes will be launched.
        nreads_bam = bam_file.bam_file(bam).nreads()

        # At least two depth points not exceeding the number of reads are needed
        if len(sorteddepths) > 1 and nreads_bam >= (sorteddepths[1] * 1000000):
            for depth in sorteddepths:
                # If a legend is provided, use it to differentiate job ids
                if legend is not None:
                    jobid = 'coverage_' + pid + '_' + str(depth) + '_' + legend[i].lower()
                else:
                    jobid = 'coverage_' + pid + '_' + str(depth) + '_' + os.path.basename(bam)

                print("Submitting depth " + str(depth) + ", file " + bam)

                result_file = workdir + '/' + jobid
                newprocess = multiprocessing.Process(
                    target=simulated_depth.simulated_depth,
                    args=(
                        bam,
                        targets[i],
                        depth,
                        coverage,
                        result_file,
                        executiongranted,
                        workdir,
                    ))
                simulated_depth_processes.append(newprocess)
                newprocess.start()
                result_files.append(result_file)

                # Following depth values are greater than the number of reads in the bam: stop here
                if (depth * 1000000) >= nreads_bam:
                    break
        else:
            print('WARNING: the number of reads in ' + str(bam) + ' is ' + str(nreads_bam))
            print('    The set of depths provided for coverage saturation calculus is 10e6*' + str(sorteddepths))
            print('    At least two depths equal or lower than the number of mapped reads are required.')

    if len(simulated_depth_processes) > 0:
        # Wait for all the processess to finish
        for process in simulated_depth_processes:
            process.join()
            process.terminate()

        print('Submitting draw saturation curve...')
        slope_status, tmpslopes = draw_saturation_curve.draw_saturation_curve(
            result_files,
            '% covered positions',
            fileout,
            legend,
            warnthreshold=warnthreshold)

        if slopes is not None:
            for i, slope in enumerate(tmpslopes):
                slopes[i] = slope

        # The global flag is set only when the per-file flags add up to the number of bams, i.e.
        # when every bam was processed and its flag is set (an AND among the flags)
        if status is not None:
            status.value = (sum(slope_status) == len(bamlist))

        # Remove temporary files
        for afile in result_files:
            os.remove(afile)
    elif status is not None:
        # Nothing could be launched; 'status' is optional so it is only updated when provided
        status.value = False
def target_coverage(filelist, targetfiles, coveragelist, graph_legend, outprefix, xticklabels=None, normalize=False):
    """************************************************************************************************************************************************************
    Task: counts, for each bam, the number of target positions covered at different coverage thresholds and reports them as a spreadsheet and a bar plot.
    Inputs:
        filelist: list of strings with the paths of the bam files to be processed. Missing .bai indexes are created.
        targetfiles: list of strings with the bed file to use for the bam at the same index of filelist.
        coveragelist: list of values with coverage thresholds to use.
        graph_legend: list of descriptions describing each of the files that will be processed, or None. These descriptions are used as row identifiers in the
            spreadsheet (file basenames are used when None) and are passed to draw_graph_wreplicates as the legend of the bar plot.
        outprefix: string with the prefix (directory plus optional file name prefix) prepended to the names of the output files. Its directory part is
            created when it does not exist; a prefix without directory part writes to the current directory.
        xticklabels: list of strings with labels for the ticks in the x axis. Defaults to '>=<threshold>x' labels.
        normalize: boolean to indicate whether bam files should be normalized (bam.normalize) to the number of reads of the smallest bam.
    Output: a summary .xls file and a bar plot depicting coverage vs. %covered-positions. Files will be saved as
        <outprefix>coverage_summary.xls, <outprefix>covered_positions.png
    ************************************************************************************************************************************************************"""

    numpy.random.seed(1)
    bamlist = []

    # Open every bam, indexing it first if needed
    for filename in filelist:
        # Check whether index already exists for the bam file, needed for pysam use
        if not os.path.isfile(filename + ".bai"):
            print("Creating index for " + filename)
            pysam.index(filename)
            print("    Done.")

        bamlist.append(bam_file.bam_file(filename))

    sizes = numpy.array([bam.nreads() for bam in bamlist])
    minsize = sizes.min()

    print("The smaller bam is " + filelist[sizes.argmin()] + " and contains " + str(minsize) + " reads.")

    # Process each file and store counting results
    print("Counting covered bases...")
    covered_positions = []
    ntotal_positions = []
    for i, bam in enumerate(bamlist):
        print("    " + bam.filename)

        # Check whether normalization should be run
        if normalize:
            normalizedbam = bam.normalize(minsize)
        else:
            normalizedbam = bam

        ntotal_positions_tmp, covered_positions_per_depth = normalizedbam.target_coverage(coveragelist, targetfiles[i])
        covered_positions.append(covered_positions_per_depth)
        ntotal_positions.append(ntotal_positions_tmp)

    # Initialize the workbook and sheet
    wb = xlwt.Workbook()
    ws = wb.add_sheet("Bases")

    # Create header font
    header_style = xlwt.easyxf("font: bold on")

    # Two columns per threshold: absolute count and percentage
    for i, cov in enumerate(coveragelist):
        ws.write(0, i * 2 + 1, "Coverage >=" + str(cov) + "x", style=header_style)
        ws.write(0, i * 2 + 2, "%", style=header_style)

    # Write count and percentage of covered positions in each file for each coverage threshold
    for i, value_list in enumerate(covered_positions):
        # Use graph legend elements for row identifiers
        if graph_legend is not None:
            row_label = graph_legend[i]
        else:
            row_label = os.path.basename(filelist[i])
        ws.write(i + 1, 0, row_label, style=header_style)

        for j, value in enumerate(value_list):
            percentage = value * 100.0 / ntotal_positions[i]
            ws.write(i + 1, j * 2 + 1, value)
            ws.write(i + 1, j * 2 + 2, percentage)

            # Counts are replaced in place by percentages, which is what the bar plot receives
            value_list[j] = percentage

    # Check whether the output directory is already created. An empty directory part means "current
    # directory" and must not be passed to mkdir; makedirs also copes with missing parent directories.
    outdir = os.path.dirname(outprefix)
    if outdir and not os.path.isdir(outdir):
        print("WARNING: directory " + outdir + " not found. Creating new directory.")
        os.makedirs(outdir)

    # If x labels are not provided, generate ad hoc labels
    if xticklabels is None:
        xticklabels = [">=" + str(cov) + "x" for cov in coveragelist]

    # Save .xls file and generate the bar plot.
    wb.save(outprefix + "coverage_summary.xls")
    draw_graph_wreplicates(
        outprefix + "covered_positions.png",
        covered_positions,
        xticklabels,
        "Coverage threshold",
        "% covered positions",
        graph_legend,
    )
def exon_coverage_std(groups, fileoutprefix, bedfilename, legend=None, normalize=True):
    """************************************************************************************************************************************************************
    Task: generates the distribution of coverage standard deviation across exons.
    Inputs:
        groups: list of sublists. Each sublist contains bam filenames of samples related somehow, e.g. samples sequenced in the same run.
        fileoutprefix: String containing the directory where figures are saved ('/std_distribution.png' and '/std_boxplot.png' are appended to it).
        bedfilename: string containing the name of the bed with the regions to analyze.
        legend: list of strings containing descriptions describing each of the groups that will be processed. These descriptions will form the legend of the
            histogram and the tick labels of the box plot.
        normalize: {True, False} to indicate whether bam files should be normalized (bam.normalize) to the number of reads of the smallest bam.
    Output: two .png figures are generated. One containing the distributions of coverage standard deviation across exons (one histogram per group)
        and a box plot of the log10 of the positive values of such distributions.
    Notes:
        - The values returned by region_coverage_std for ALL the bams of a group are pooled in the histogram/box of that group.
        - Six colours are available; they are reused cyclically when there are more than six groups.
    ************************************************************************************************************************************************************"""

    minsize = 1000000000000000
    minbamfilename = None
    bamgroups = []

    # Open every bam of every group and find the one with the minimum number of reads
    for filelist in groups:
        bamlist = []
        for filename in filelist:
            # Check indexing of the bam file, needed for pysam use
            if not os.path.isfile(filename + ".bai") and not os.path.isfile(filename.replace(".bam", ".bai")):
                print("WARNING: index not found for " + filename + ". Indexing...")
                pysam.index(filename)
                print("    Done.")

            bam = bam_file.bam_file(filename, "rb")
            bamlist.append(bam)

            nreads = bam.nreads()
            if nreads < minsize:
                minsize = nreads
                minbamfilename = bam.filename

        bamgroups.append(bamlist)

    print("The smaller bam is " + minbamfilename + " and contains " + str(minsize) + " reads.")

    fig = pyplot.figure(figsize=(13, 6))
    ax = fig.add_subplot(111)
    boxplot = pyplot.figure()
    axb = boxplot.add_subplot(111)

    rects = []
    colours = ["#ff0000", "#00ff00", "#0000ff", "#cc0011", "#007722", "#110066"]
    global_stdsampling = []
    bins = numpy.arange(0, 1, 0.007)

    # Process each group and draw the corresponding histogram in the graph
    for colouridx, bamlist in enumerate(bamgroups):
        # Pool the values of every bam in the group. Keeping only the result of the last bam would
        # discard the (expensive) work done for the rest of the samples of the group.
        group_std = []
        for bam in bamlist:
            print("    " + bam.filename)

            # Check whether normalization should be applied
            if normalize:
                normalizedbam = bam.normalize(minsize)
            else:
                normalizedbam = bam

            group_std.extend(normalizedbam.region_coverage_std(bedfilename))

        # hist returns (counts, bin edges, patches); the patches are kept to build the legend
        facecolor = colours[colouridx % len(colours)]
        rects.append(ax.hist(group_std, bins, alpha=0.5, facecolor=facecolor)[2])

        # The box plot works on log10, so non-positive values are discarded
        group_std = numpy.array(group_std)
        global_stdsampling.append(list(numpy.log10(group_std[(group_std > 0)])))

    fig.suptitle(
        "Distribution of coverage standard deviations (normalized) across exons", fontsize=14, fontweight="bold"
    )
    ax.set_ylabel("Frequency")
    ax.set_xlabel("Normalized standard deviation")
    ax.set_xlim(0, 1)

    boxplot.suptitle(
        "Distribution of coverage standard deviations (normalized) across exons", fontsize=14, fontweight="bold"
    )
    axb.boxplot(global_stdsampling)

    # Check whether graph legend should be included or not
    if legend is not None:
        axb.set_xticklabels(legend)

        # Shrink current axis by 20% to leave room for the legend at the right
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])

        # Add graphic legend
        ax.legend(tuple([rect[0] for rect in rects]), tuple(legend), loc="upper left", bbox_to_anchor=(1, 1))

    fig.savefig(fileoutprefix + "/std_distribution.png")
    matplotlib.pyplot.close(fig)
    boxplot.savefig(fileoutprefix + "/std_boxplot.png")
    matplotlib.pyplot.close(boxplot)
    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
def coverage_distribution(bams, beds, dirout, labels, normalize):
    """************************************************************************************************************************************************************
    Task: calculates coverage distribution for a set of bam and bed (capture) files.
    Inputs:
        bams: list of strings with the paths to the bam files. Must contain at least one file.
        beds: list of strings with the paths to the bed files. beds[i] is used together with bams[i].
        dirout: string containing the full path to the directory where results will be stored. It is created (including missing parent
            directories) when it does not exist.
        labels: list of strings with the labels to name each sample (bam) in the graph. labels[i] names the i-th row of the xls file.
        normalize: {True,False} to indicate whether normalization should be applied. When True, bam.normalize(<reads in the smallest bam>)
            is called on each bam before counting.
    Output: <dirout>/Coverage_stats.xls with quartiles, mean, maximum and minimum values. draw_histogram and draw_boxplot are also called with
        <dirout>; they are expected to write <dirout>/Coverage_histo.png and <dirout>/Coverage_boxp.png (file names not verifiable from this function).
    Raises:
        ValueError: if bams is empty.
    ************************************************************************************************************************************************************"""

    # Check output directory exists. In case not, create it together with any missing parent directory
    if not os.path.isdir(dirout):
        print('WARNING: directory ' + dirout + ' does not exist. Creating...')
        os.makedirs(dirout)

    # Indexes each bam file and creates the corresponding bam_file objects
    bamlist = []
    for bamfilename in bams:
        # Check indexing of the bam file, needed for pysam use. Both <name>.bam.bai and <name>.bai are accepted.
        # Only the final extension is swapped: a plain str.replace('.bam', '.bai') would return the bam path itself
        # when it contains no '.bam' (hiding a missing index) and would also rewrite '.bam' inside directory names.
        alternative_index = os.path.splitext(bamfilename)[0] + '.bai'
        if (not os.path.isfile(bamfilename + '.bai')
                and not os.path.isfile(alternative_index)):
            print('WARNING: index not found for ' + bamfilename + '. Indexing...')
            pysam.index(bamfilename)
            print('    Done.')

        bamlist.append(bam_file.bam_file(bamfilename, 'rb'))

    # Fail with a clear message instead of numpy's "zero-size array" error raised by sizes.min()
    if not bamlist:
        raise ValueError('coverage_distribution requires at least one bam file')

    # Number of reads of the smallest bam; passed to bam.normalize() when normalization is requested
    sizes = numpy.array([bam.nreads() for bam in bamlist])
    minsize = sizes.min()

    print('The smaller bam is ' + bamlist[sizes.argmin()].filename + ' and contains ' + str(minsize) + ' reads.')

    # Process each file and store counting results
    print('Counting covered bases...')
    distributions = []
    for i, bam in enumerate(bamlist):
        print('    ' + bam.filename)

        # Check whether normalization should be applied
        normalizedbam = bam.normalize(minsize) if normalize else bam

        distributions.append(normalizedbam.get_coverage_distribution(beds[i]))

    draw_histogram(distributions, labels, dirout)
    draw_boxplot(distributions, labels, dirout)

    _write_coverage_stats(distributions, labels, dirout + '/Coverage_stats.xls')


def _write_coverage_stats(distributions, labels, xlsfilename):
    """************************************************************************************************************************************************************
    Task: writes one row per sample with the quartiles, maximum, minimum and mean of its coverage distribution.
    Inputs:
        distributions: list of coverage distributions, one per sample. Each one is converted with numpy.array() before computing stats.
        labels: list of strings; labels[i] is written in the first column of the row of distributions[i].
        xlsfilename: string containing the path of the xls file to be written.
    Output: the workbook saved at xlsfilename, with a single sheet named 'Coverage distribution'.
    ************************************************************************************************************************************************************"""

    # Initialize the workbook and sheet
    wb = xlwt.Workbook()
    ws = wb.add_sheet('Coverage distribution')

    # Header row, written in bold
    header_style = xlwt.easyxf('font: bold on')
    headers = ('Sample', 'Q1', 'Q2', 'Q3', 'Max. coverage', 'Min. coverage', 'Mean coverage')
    for column, title in enumerate(headers):
        ws.write(0, column, title, header_style)

    # Calculate distribution stats for each of the bams
    for i, dist in enumerate(distributions):
        ndist = numpy.array(dist)

        # Same column order as the header row above
        row = (
            labels[i],
            numpy.percentile(ndist, 25),
            numpy.percentile(ndist, 50),
            numpy.percentile(ndist, 75),
            numpy.max(ndist),
            numpy.min(ndist),
            numpy.average(ndist),
        )
        for column, value in enumerate(row):
            ws.write(i + 1, column, value)

    wb.save(xlsfilename)