Exemplo n.º 1
0
def check_parameters(options, parser):
    """Validate the command line options, exiting with status 1 on the first problem found.

    Input:
        options: parsed options object. The attributes read here are bams, bed, out, reference,
            saturation, nthreads, depthlist, coveragethresholds, feature and tmp.
        parser: option parser. Only its print_help() method is used, when the number of command
            line arguments is out of range.
    Output: True when every check passes. Otherwise an error message is printed and sys.exit(1)
        is called. When the output directory seems to contain previous results the user is asked
        interactively whether to continue.
    """

    availablefeatures = ['percbases', 'saturation', 'specificity', 'coveragefreq', 'coveragedistr',
                         'coveragestd', 'gcbias', 'coveragecorr']

    # Byte values regarded as "text": a few control characters plus everything from 0x20 upwards.
    # A chunk is considered binary when it contains at least one byte outside this set.
    textchars = bytes(bytearray([7, 8, 9, 10, 12, 13, 27] + list(range(0x20, 0x100))))

    def fail(*lines):
        # Print every line of the error message and stop the program.
        for line in lines:
            print(line)
        sys.exit(1)

    def is_file(path):
        return os.path.isfile(path) or os.path.islink(path)

    def is_dir(path):
        return os.path.isdir(path) or os.path.islink(path)

    def is_binary_string(data):
        return bool(data.translate(None, textchars))

    def is_float_list(text):
        # True when "text" is a comma separated list in which every item parses as a float.
        try:
            for value in text.split(','):
                float(value)
        except ValueError:
            return False
        return True

    # Check number of arguments
    if len(sys.argv) < 7:
        parser.print_help()
        fail('ERROR: --bams, --bed and --out parameters are required.')

    if len(sys.argv) > 21:
        parser.print_help()
        fail('ERROR: too many parameters. Please, check that there are no spaces between commas within the "depthlist" or "coveragethrs" arguments.')

    # --bams: one or two existing, binary files with .bam extension
    try:
        bamlist = options.bams.split(',')
    except AttributeError:
        fail('ERROR: at least one bam file is required. Please, input a comma separated list. E.g.: --bams /home/user/bam1.sorted.bam,/home/user/bam2.sorted.bam')

    if len(bamlist) > 2:
        fail('ERROR: please make sure that no more than two bam files are provided. Please, input a comma separated list. E.g.: --bams /home/user/bam1.sorted.bam,/home/user/bam2.sorted.bam')

    for bam in bamlist:
        if not is_file(bam):
            fail('ERROR: '+bam+' does not exist.')

        if not bam.endswith('.bam'):
            fail('ERROR: '+bam+' must have .bam extension. Please, make sure that the bam file is appropriately formatted.')

        # Read the first bytes in binary mode and close the file right away.
        with open(bam, 'rb') as bamfd:
            header = bamfd.read(3)
        if not is_binary_string(header):
            fail('ERROR: '+bam+' must be a binary file. Please, make sure that the bam file is appropriately formatted.')

    # --bed: a missing value makes os.path raise TypeError (not AttributeError), so both are caught
    try:
        bedexists = is_file(options.bed)
    except (AttributeError, TypeError):
        fail('ERROR: the --bed file is a required parameter. Please, provide one bed file indicating target regions to analyze.')
    if not bedexists:
        fail('ERROR: '+options.bed+' does not exist.')

    err = bed_file.bed_file(options.bed).checkformat()
    if err != '':
        fail('ERROR: incorrect bed file format.', '\t'+err)

    # --out: the parent directory must exist
    try:
        outparent = os.path.dirname(options.out)
    except (AttributeError, TypeError):
        fail('ERROR: the --out parameter is required. Please, provide full path to an existing directory where results can be saved.')
    if not is_dir(outparent):
        fail('ERROR: '+outparent+' does not exist.')

    # Ask for confirmation before writing over what looks like a previous execution
    datadir = options.out+'/data'
    if is_dir(options.out) and is_dir(datadir) and len(glob.glob(datadir+'/*_Ontarget_Coverage.png')) > 0:
        print('WARNING: '+options.out+' directory seems to contain previous NGScat results. Saving results of current execution in this directory may cause incorrect report generation.')
        print('Continue with current setting? (y/n)')

        proceed = raw_input().lower()
        while proceed != 'y' and proceed != 'n':
            proceed = raw_input().lower()

        if proceed == 'n':
            sys.exit(1)

    if options.reference is not None and not is_file(options.reference):
        fail('ERROR: '+options.reference+' does not exist.')

    if options.saturation != 'y' and options.saturation != 'n':
        fail('ERROR: incorrect value for --saturation parameters. Please indicate "y" or "n".')

    try:
        int(options.nthreads)
    except (ValueError, TypeError):
        fail('ERROR: invalid value for --nthreads option. Please, provide an integer value. Note that the application will launch as many processess as it needs between 1 and nthreads.')

    if options.depthlist != 'auto' and not is_float_list(options.depthlist):
        fail('ERROR: invalid values for --depthlist option. Please, provide a comma separated list of values without leaving spaces, e.g.: 1,2,10,20')

    if not is_float_list(options.coveragethresholds):
        fail('ERROR: invalid values for --coveragethrs option. Please, provide a comma separated list of values without leaving spaces, e.g.: 1,2,10,20')

    if options.feature is not None and options.feature.lower() not in availablefeatures:
        # The list shown to the user is generated from availablefeatures so that it cannot go out of date
        fail('ERROR: '+options.feature+' not available. Please, check that the selected feature is one of the following: '
             + ', '.join(["'"+feature+"'" for feature in availablefeatures]))

    if not is_dir(options.tmp):
        fail('ERROR: '+options.tmp+' does not exist.')

    return True
Exemplo n.º 2
0
def ngscat(bamfilenames, originalbedfilename, outdir, reference=None, saturation=False, nthreads=2, extend=None, depthlist='auto', coveragethresholds=[1,5,10,20,30],
           onefeature=None, tmpdir=None):
    """Run the quality control pipeline on a set of bam files and, when all features are computed, build the report.

    Input:
        bamfilenames: list of paths to the bam files to analyze.
        originalbedfilename: path to the bed file with the target regions.
        outdir: directory where results are saved. It is created, together with its "data" and "img"
            subdirectories, if it does not exist.
        reference: reference file handed to the gc bias step. The step is skipped when None, unless
            onefeature=='gcbias'.
        saturation: whether the coverage saturation curve is calculated.
        nthreads: initial value of the semaphore that is handed to every launched process.
        extend: if not None, the bed file is extended with this value (bed_file.extendnoref) and the
            extended copy, written in the temporary directory, is used instead of the original.
        depthlist: 'auto' or the list of depths for the saturation curve. With 'auto' and saturation
            enabled it is derived from the largest number of reads among the bam files.
        coveragethresholds: list of coverage thresholds. The default list is never modified in this function.
        onefeature: None to calculate every feature or the name of the single feature to calculate.
            The report is only generated when it is None.
        tmpdir: existing directory for temporary files. It replaces the global TMP when provided.
    Output: result files are written at outdir. The program exits with status 1 if tmpdir does not exist
        or if the user declines to continue when a link to a bam file can not be created.
    """

    global TMP

    def is_dir(path):
        return os.path.isdir(path) or os.path.islink(path)

    def wait_for(process):
        # Waits until the process finishes and then releases it
        process.join()
        process.terminate()

    if tmpdir is not None:
        if is_dir(tmpdir):
            TMP = tmpdir
        else:
            print('ERROR: temporary directory '+tmpdir+' does not exist.')
            print('\tExiting')
            sys.exit(1)

    if not is_dir(outdir):
        print('WARNING: '+outdir+' does not exist. Creating directory.')
        os.mkdir(outdir)

    for subdir in ('/data', '/img'):
        if not is_dir(outdir+subdir):
            print('Creating '+outdir+subdir)
            os.mkdir(outdir+subdir)

    # Tag used to build unique temporary file names. It is initialized here because the extended bed
    # file name below needs it even when no bam file has to be sorted.
    pid = str(time.time())

    sortedbams = []
    for bamfilename in bamfilenames:
        filelink = TMP+'/'+os.path.basename(bamfilename)
        try:
            os.symlink(bamfilename, filelink)
        except OSError:
            print('WARNING: when trying to create a symbolic link at the temporary directory pointing to '+bamfilename+', a file named '+filelink+' was already found.')
            print('\tProbably the temporary and origin directories are the same. The only problem this could cause is that the new index overwrites an existing one.')
            print('\tContinue (y/n)?')

            goahead = raw_input()
            if goahead == 'n' or goahead == 'N':
                print('Exiting...')
                sys.exit(1)
            elif goahead != 'y' and goahead != 'Y':
                print('Unknown choice '+goahead)
                print('Exiting...')
                sys.exit(1)

            # The existing file is only replaced when it is not the bam file itself
            if os.path.dirname(bamfilename) != os.path.dirname(TMP+'/'):
                os.remove(filelink)
                os.symlink(bamfilename, filelink)

        print('Indexing...')
        pysam.index(filelink)
        print('\tDone.')

        if not bam_file.bam_file(filelink).issorted():
            print('WARNING: '+bamfilename+' is not sorted')
            print('Sorting...')
            pid = str(time.time())
            newsortedbam = TMP+'/'+pid+'.sorted'
            sortedbams.append(newsortedbam+'.bam')
            pysam.sort(filelink, newsortedbam)
            print('Indexing...')
            pysam.index(sortedbams[-1])

            print('\tDone.')
        else:
            sortedbams.append(filelink)

    if saturation and depthlist == 'auto':
        # Five evenly spaced depths up to the largest number of reads, expressed in millions of reads
        maxdepth = max([bam_file.bam_file(bamfilename).nreads() for bamfilename in sortedbams])
        depthlist = numpy.arange(maxdepth/5.0, maxdepth+(maxdepth/5.0)-1, maxdepth/5.0)
        depthlist = depthlist/1000000.0

    legend = [os.path.basename(bamfilename) for bamfilename in bamfilenames]
    executiongranted = multiprocessing.Semaphore(nthreads)

    if extend is not None:
        # Only the base name is used so that the extended copy is always created right inside TMP,
        # no matter the directory of the original bed file
        bedfilename = TMP+'/'+os.path.basename(originalbedfilename).replace('.bed','.'+pid+'.extended.bed')
        bed_file.bed_file(originalbedfilename).extendnoref(extend,bedfilename)
    else:
        bedfilename = originalbedfilename

    runall = onefeature is None
    runsaturation = (saturation and runall) or onefeature == 'saturation'
    runspecificity = runall or onefeature == 'specificity'

    # NOTE(review): this call used to be guarded by
    #     onefeature==None or onefeature<>'saturation' or onefeature<>'specificity'
    # which is always true, so coverage files have always been calculated for every feature. An "and"
    # was presumably intended, but the code below needs Pcoveragebeds and coveragefiles to be defined,
    # so the effective behaviour is kept. Confirm before restricting it.
    Pcoveragebeds,coveragefiles = launch_coveragebed(sortedbams, bedfilename, legend, outdir, executiongranted)

    if runsaturation:
        Psaturation,coverage_saturation_status,saturationslopes = launch_coverage_saturation(sortedbams, bedfilename, depthlist, legend, outdir+'/data/', executiongranted)
    else:
        coverage_saturation_status = None
        saturationslopes = None

    if runspecificity:
        Ponoff_reads,onoff_status,onduplicates,offduplicates,duplicates_status,enrichment,percontarget = launch_onoff_reads(sortedbams, bedfilename, legend, outdir+'/data/', executiongranted)

    # Coverage files must be complete before launching the processes that read them
    for Pcoveragebed in Pcoveragebeds:
        wait_for(Pcoveragebed)

    if runspecificity:
        Poffclusters = launch_offclusters(glob.glob(outdir+'/data/*.bed'), bedfilename, executiongranted)

    if runall or onefeature == 'coveragefreq':
        Pcoveragedistribution,coveragedistribution_status,meancoverage = launch_coverage_distribution(coveragefiles, outdir+'/data/', legend, executiongranted)

    if runall or onefeature == 'percbases':
        Pcoveredpositions,coveredpositions_status,coveredbases = launch_covered_positions(coveragefiles, coveragethresholds, outdir+'/data/', legend, executiongranted)

    if runall or onefeature == 'coveragedistr':
        Pcoveragethroughtarget,throughtarget_status,lowcovbases = launch_coverage_through_target(coveragefiles, outdir+'/data/', legend, executiongranted)

    # Coverage correlation only makes sense when there is more than one coverage file
    runcoveragecorr = len(coveragefiles) > 1 and (runall or onefeature == 'coveragecorr')
    if runcoveragecorr:
        Pcoveragecorr,coveragecorr_status,corr = launch_coveragecorr(coveragefiles, outdir+'/data/coveragecorr.png', legend, executiongranted)
    else:
        coveragecorr_status = None
        corr = None

    if runall or onefeature == 'coveragestd':
        Pcoveragestd,coveragestd_status,coveragestd = launch_coverage_std(coveragefiles, outdir+'/data/', legend, executiongranted)

    # Initialized beforehand so that it is also defined when there are no coverage files to process
    gcbias_status = None
    if (reference is not None and runall) or onefeature == 'gcbias':
        Pgcbias = []
        for i,coveragefile in enumerate(coveragefiles):
            onePgcbias,gcbias_status = launch_gcbias(coveragefile, bedfilename, reference, outdir+'/data/gcbias'+str(i)+'.png', legend[i], executiongranted)
            Pgcbias.append(onePgcbias)
        for onePgcbias in Pgcbias:
            wait_for(onePgcbias)

    # Wait for the processes that are still running
    if runsaturation:
        wait_for(Psaturation)

    if runall or onefeature == 'coveragefreq':
        wait_for(Pcoveragedistribution)

    if runall or onefeature == 'percbases':
        wait_for(Pcoveredpositions)

    if runall or onefeature == 'coveragedistr':
        wait_for(Pcoveragethroughtarget)

    if runcoveragecorr:
        wait_for(Pcoveragecorr)

    if runall or onefeature == 'coveragestd':
        wait_for(Pcoveragestd)

    if runspecificity:
        wait_for(Ponoff_reads)
        wait_for(Poffclusters)

    # The report needs the results of every feature, so it is only built for complete executions
    if runall:
        generate_report(bamfilenames,sortedbams,originalbedfilename,outdir,coveredpositions_status,coveredbases,coverage_saturation_status,saturationslopes,
                        onoff_status,
                        duplicates_status,onduplicates,offduplicates,coveragedistribution_status,meancoverage,
                        coveragecorr_status,corr,throughtarget_status,lowcovbases,coveragestd_status,coveragestd,gcbias_status,enrichment,percontarget,
                        reference,nthreads,depthlist,
                        coveragethresholds)
Exemplo n.º 3
0
def generate_report(bamfilenames,sortedbams,bedfilename,outdir,coveredpositions_status,coveredbases,coverage_saturation_status,saturationslopes,onoff_status,
                    duplicates_status,onduplicates,offduplicates,coveragedistribution_status,meancoverage,
                    coveragecorr_status,corr,throughtarget_status,lowcovbases,coveragestd_status,coveragestd,gcbias_status,enrichment,percontarget,
                    reference,nthreads,
                    depthlist,
                    coveragethresholds):
    """Build the html report (outdir/captureQC.html) and the summary file (outdir/data/summary.json).

    The report is produced by replacing placeholders in the captureQC.html template found at DATASRC.
    Replacements are applied in a fixed order because some inserted fragments contain placeholders
    that are filled in by later replacements.

    Input:
        bamfilenames, sortedbams: original bam file names and the files actually analyzed, in the same order.
        bedfilename: target file shown in the report and used to obtain the target size.
        outdir: output directory. Its "data" and "img" subdirectories must already exist.
        *_status: objects whose "value" attribute tells whether the corresponding check passed.
            coverage_saturation_status, coveragecorr_status and gcbias_status may be None, in which case
            the corresponding section is left out of the report.
        coveredbases, saturationslopes, onduplicates, offduplicates, meancoverage, lowcovbases, coveragestd,
        enrichment, percontarget: sequences indexed by bam file position.
        corr: object whose "value" attribute contains the coverage correlation.
        reference, nthreads, depthlist, coveragethresholds: execution parameters shown in the report.
    Output: files are written at outdir and a final message is printed.
    """

    global TMP

    def read_text(path):
        # Returns the whole content of a file, making sure that it is closed
        with open(path) as fd:
            return fd.read()

    def icon(status):
        # Base name of the image that represents a status
        return 'ok' if status.value else 'warning'

    hassaturation = coverage_saturation_status is not None
    hascoveragecorr = coveragecorr_status is not None

    for image in ('xls_icon.png', 'txt_icon.png', 'ok.jpg', 'warning.jpg', 'coverage_histogram_example.png'):
        shutil.copy(IMGSRC+'/'+image, outdir+'/img')

    shutil.copy(DATASRC+'/styles.css', outdir)

    # ********************************************************* Input parameters ******************************************************************
    if hassaturation:
        saturationcurve = 'Yes'
    else:
        saturationcurve = 'No'

    reportcontent = read_text(DATASRC+'/captureQC.html')
    for placeholder,value in (('bamfilename', ', '.join(bamfilenames)),
                              ('bedfilename', bedfilename),
                              ('reportdate', time.ctime()),
                              ('reference', str(reference)),
                              ('saturationcurve', saturationcurve),
                              ('nthreads', str(nthreads)),
                              ('tmpdir', TMP)):
        reportcontent = reportcontent.replace(placeholder, value)

    # ********************************************************* Result summary ******************************************************************

    # Number of reads is needed both for the json file and for the summary table: calculate it just once
    nreads = [bam_file.bam_file(sortedbams[i]).nreads() for i in range(len(bamfilenames))]

    # NOTE(review): with more than one bam file the objects are written one after another without any
    # separator, which is not a valid json document. The format is kept as it is because the programs
    # that read summary.json are not visible from here.
    jsonobjects = []
    for i,bam in enumerate(bamfilenames):
        fields = ['"bamfile":"'+bam+'"',
                  '"nreads":'+str(nreads[i]),
                  '"coveredbases":'+str(coveredbases[i])]

        if hassaturation:
            fields.append('"saturationslope":'+str(saturationslopes[i]))

        fields.append('"percontarget":'+str(percontarget[i]))
        fields.append('"onduplicates":'+str(onduplicates[i]))
        fields.append('"offduplicates":'+str(offduplicates[i]))
        fields.append('"meancoverage":'+str(meancoverage[i]))
        fields.append('"lowcovbases":'+str(lowcovbases[i]))

        # nan can not be represented in json, so the field is left out
        if not math.isnan(coveragestd[i]):
            fields.append('"coveragestd":'+str(coveragestd[i]))

        jsonobjects.append('{'+','.join(fields)+'}')

    with open(outdir+'/data/summary.json', 'w') as fd:
        fd.write(''.join(jsonobjects))

    rows = []
    for i,bam in enumerate(bamfilenames):
        rows.append('<tr>\n')
        rows.append('<td class="table-cell"> '+bam+'</td>')
        rows.append('<td class="table-cell"> '+str(nreads[i])+' </td>')
        rows.append('<td class="table-cell">%.1f'%(coveredbases[i])+'% </td>')

        if hassaturation:
            rows.append('<td class="table-cell">%.1e</td>\n'%saturationslopes[i])

        rows.append('<td class="table-cell">%.1f'%(percontarget[i])+'% </td>\n')
        rows.append(('<td class="table-cell">ON-%.1f%%'%onduplicates[i])+'; OFF: %.1f'%(offduplicates[i])+'% </td>')
        rows.append('<td class="table-cell">%.1fx'%meancoverage[i]+'</td>\n')
        # <WARNCOVERAGETHRESHOLD> is filled in below, together with the rest of the report
        rows.append('<td class="table-cell">%d consecutive bases<br>with coverage <= <WARNCOVERAGETHRESHOLD></td>\n'%(lowcovbases[i]))

        if hascoveragecorr:
            rows.append('<td class="table-cell">%.2f</td>\n'%corr.value)

        rows.append('<td class="table-cell">%.2f</td>\n'%coveragestd[i])
        rows.append('</tr>\n')
    summaryrows = ''.join(rows)

    statuscells = ['<td class="table-header">Overall status</td>\n',
                   '<td class="table-header"></td>\n',
                   '<td class="table-header"><a href="#targetbases"><img src="img/<TARGETBASESSTATUS>.jpg" height=23px /></a></td>\n']
    if hassaturation:
        statuscells.append('<td class="table-header"><a href="#coveragesaturation"><img src="img/<COVERAGESATURATIONSTATUS>.jpg" height=23px /></a></td>\n')
    statuscells.append('<td class="table-header"><a href="#onoff"><img src="img/<ONOFFSTATUS>.jpg" height=23px /></a></td>\n')
    statuscells.append('<td class="table-header"><a href="#dup"><img src="img/<DUPSTATUS>.jpg" height=23px /></a></td>\n')
    statuscells.append('<td class="table-header"><a href="#distribution"><img src="img/<DISTRIBUTIONSTATUS>.jpg" height=23px /></a></td>\n')
    statuscells.append('<td class="table-header"><a href="#coveragethroughtarget"><img src="img/<COVERAGETHROUGHTARGETSTATUS>.jpg" height=23px /></a></td>\n')
    if hascoveragecorr:
        statuscells.append('<td class="table-header"><a href="#coveragecorr"><img src="img/<COVERAGECORRSTATUS>.jpg" height=23px /></a></td>\n')
    statuscells.append('<td class="table-header"><a href="#coveragestd"><img src="img/<COVERAGESTDSTATUS>.jpg" height=23px /></a></td>\n')
    summarystatus = ''.join(statuscells)

    reportcontent = reportcontent.replace('<SUMMARYROWS>',summaryrows)
    reportcontent = reportcontent.replace('<SUMMARYSTATUS>',summarystatus)

    if hassaturation:
        reportcontent = reportcontent.replace('<SUMMARYSATURATION>','<td class="table-header"><a href="#coveragesaturation">Coverage saturation<br>(slope at the end of the curve)</a></td>')
    else:
        reportcontent = reportcontent.replace('<SUMMARYSATURATION>','')

    if hascoveragecorr:
        reportcontent = reportcontent.replace('<SUMMARYCOVCORRELATION>','<td class="table-header"><a href="#coveragecorr">Coverage correlation<br>per ROI</a></td>')
    else:
        reportcontent = reportcontent.replace('<SUMMARYCOVCORRELATION>','')

    reportcontent = reportcontent.replace('<SUMMARYCOVERAGETHRS>',str(coveragethresholds[0]))
    reportcontent = reportcontent.replace('<SUMMARYTARGETSIZE>',str(bed_file.bed_file(bedfilename).size()))

    # ********************************************************* Detailed results ******************************************************************
    chromosomeimages = ''
    for afile in sorted(glob.glob(outdir+'/data/*_Ontarget_Coverage.png')):
        chromosomeimages += '<a href="data/'+os.path.basename(afile)+'"><img style="width: 33%; float: left;" src="data/'+os.path.basename(afile)+'" /></a>'
    reportcontent = reportcontent.replace('<CHROMOSOMEIMAGES>',chromosomeimages)

    reportcontent = reportcontent.replace('<TARGETBASESSTATUS>',icon(coveredpositions_status))
    reportcontent = reportcontent.replace('<WARNBASESCOVERED>',str(config.warnbasescovered))

    percentagestr = '\n<ul>'
    enrichmentstr = '\n<ul>'
    for i,bamfilename in enumerate(bamfilenames):
        percentagestr += '<li>'+bamfilename+': %.1f'%(percontarget[i])+'%</li>\n'
        enrichmentstr += '<li>'+bamfilename+': %.1f'%(enrichment[i])+'</li>\n'
    percentagestr += '</ul>'
    enrichmentstr += '</ul>'
    reportcontent = reportcontent.replace('<PERCENTAGEONTARGET>', percentagestr)
    reportcontent = reportcontent.replace('<ENRICHMENT>', enrichmentstr)

    reportcontent = reportcontent.replace('<WARNONTARGET>', str(config.warnontarget))
    reportcontent = reportcontent.replace('<ONOFFSTATUS>',icon(onoff_status))

    # These images are not links: the closing </a> that used to follow each <img> had no opening tag
    dupimages = ''
    for afile in sorted(glob.glob(outdir+'/data/duplicates*.png')):
        dupimages += '<img style="width: 50%; float: left;" src="data/'+os.path.basename(afile)+'" />'
    reportcontent = reportcontent.replace('<DUPIMAGES>',dupimages)
    reportcontent = reportcontent.replace('<DUPSTATUS>',icon(duplicates_status))

    reportcontent = reportcontent.replace('<WARNMEANCOVERAGE>',str(config.warnmeancoverage))
    reportcontent = reportcontent.replace('<DISTRIBUTIONSTATUS>',icon(coveragedistribution_status))

    if hascoveragecorr:
        coveragecorr_content = read_text(DATASRC+'/coveragecorr_content.html')
        reportcontent = reportcontent.replace('<COVERAGECORRCONTENT>',coveragecorr_content)

        reportcontent = reportcontent.replace('<WARNCOVERAGECORRELATION>',str(config.warncoveragecorrelation))
        reportcontent = reportcontent.replace('<COVERAGECORRSTATUS>',icon(coveragecorr_status))
    else:
        reportcontent = reportcontent.replace('<COVERAGECORRCONTENT>','\n')

    reportcontent = reportcontent.replace('<WARNCOVERAGEREGION>',str(config.warncoverageregion))
    reportcontent = reportcontent.replace('<WARNCOVERAGETHRESHOLD>',str(config.warncoveragethreshold))
    reportcontent = reportcontent.replace('<COVERAGETHROUGHTARGETSTATUS>',icon(throughtarget_status))

    reportcontent = reportcontent.replace('<WARNSTD>',str(config.warnstd))
    reportcontent = reportcontent.replace('<COVERAGESTDSTATUS>',icon(coveragestd_status))

    if hassaturation:
        saturation_content = read_text(DATASRC+'/saturation_content.html')
        # E.g.: 1x10<sup>6</sup>, 2x10<sup>6</sup> and 3x10<sup>6</sup>
        depthtext = 'x10<sup>6</sup>, '.join(map(str,depthlist[:-1]))+'x10<sup>6</sup> and '+str(depthlist[-1])+'x10<sup>6</sup>'
        reportcontent = reportcontent.replace('<SATURATIONCONTENT>',saturation_content).replace('<DEPTHLIST>',depthtext).replace('depthlist',str(depthlist)[1:-1])
        reportcontent = reportcontent.replace('<WARNSATURATION>',str(config.warnsaturation))
        reportcontent = reportcontent.replace('<COVERAGESATURATIONSTATUS>',icon(coverage_saturation_status))
    else:
        reportcontent = reportcontent.replace('<SATURATIONCONTENT>','\n').replace('depthlist','None')

    reportcontent = reportcontent.replace('coveragethrs', ', '.join(map(str, coveragethresholds)))

    if gcbias_status is not None:
        gcbias_content = read_text(DATASRC+'/gcbias_content.html')
        reportcontent = reportcontent.replace('<GCBIASCONTENT>',gcbias_content)

        gcbiasimages = ''
        for afile in glob.glob(outdir+'/data/gcbias*.png'):
            gcbiasimages += '<img style="width:40%" src="data/'+os.path.basename(afile)+'" />'
        reportcontent = reportcontent.replace('<GCBIASIMAGES>', gcbiasimages)
    else:
        reportcontent = reportcontent.replace('<GCBIASCONTENT>','\n')

    with open(outdir+'/captureQC.html', 'w') as fd:
        fd.write(reportcontent)

    print('Results written at '+outdir)
Exemplo n.º 4
0
def _gccontent_from_fasta(reference, chromosomes, targetbed):
    """Reads a fasta file and returns the dictionary of gc content values that measureGCbias reports
    for each sequence whose name is in "chromosomes"."""

    gccontent = {}
    storeSequence = False
    currentChromosome = ''
    sequencelines = []  # Lines of the current sequence. They are joined only once, when the sequence ends

    with open(reference, 'r') as fastaFile:
        for line in fastaFile:
            if line.startswith('>'):  # A new sequence starts
                if storeSequence:  # The previous sequence was one of the requested ones
                    gccontent.update(measureGCbias(''.join(sequencelines), currentChromosome, targetbed))
                    storeSequence = False
                # Header format: >1 dna:chromosome chromosome:GRCh37:1:1:249250621:1
                currentChromosome = re.split(' +', line)[0].split('>')[1].strip()
                if currentChromosome in chromosomes:
                    storeSequence = True
                sequencelines = []
            elif storeSequence and '>' not in line:
                sequencelines.append(line.rstrip())

    if storeSequence:  # For the last sequence in the file
        gccontent.update(measureGCbias(''.join(sequencelines), currentChromosome, targetbed))

    return gccontent


def gcbias_lite(coveragefile, bedfilename, reference, fileout, graphtitle=None, executiongranted=None, status=None, bedTools=False):
    """************************************************************************************************************************************************************
    Task: draws coverage as a function of gc content. IMPROVED VERSION of gcbias that avoids the use of bedtools (pybedtools)
    Input:
        coveragefile: string containing the full path of the bam.coverage file to analyze. This file has been built according to 1-base format
        bedfilename: target file -> assumes original-standard bed file
        reference: fasta file with reference genome
        fileout: string containing the full path of the file where the resulting figure will be saved.
        graphtitle: title of the figure. Titles longer than 25 characters are truncated. No title is set when None.
        executiongranted: semaphore-like object. If provided, it is acquired before starting and it is always released
            before leaving, also when the function ends because of an error.
        status: if provided, its "value" attribute is set to True when the mean gc content is within [45,55] and to
            False otherwise.
        bedTools: whether pybedtools are used instead of the own method
    Output: an image file will be created named "fileout" where a graph that compares gc content and mean coverage will be saved.
        Nothing is drawn if less than two regions are found in the coverage file.
    ************************************************************************************************************************************************************"""

    if executiongranted is not None:
        executiongranted.acquire()

    try:
        coverage = region_coverage(coveragefile)  # Calculate mean coverage per region

        if len(coverage) <= 1:
            print('WARNING: only one region found in the bed file. Skipping GC bias calculation.')
            return

        # Load BED file -> since coverage information is in 1-base format, BED format must be transformed to 1-base
        bed = bed_file.bed_file(bedfilename)
        sortedBed = bed.my_sort_bed()  # Sort bed avoiding bedtools
        nonOverlappingBed = sortedBed.non_overlapping_exons(1)  # Base 1!!! This generates a BED file in base 1 (Non-standard BED)
        finalBed = nonOverlappingBed.my_sort_bed()  # BED file in base 1 (Non-standard BED)

        if not bedTools:  # Own method
            # Chromosomes to be examined (the ones contained in the target file)
            chromosomes = set([regionkey[0] for regionkey in coverage])

            finalBed.load_custom(-1)  # Load chromosome and positions in base 1

            gccontent = _gccontent_from_fasta(reference, chromosomes, finalBed)
            region_ids = list(coverage.keys())

            if len(gccontent) == 0:
                print('ERROR: G+C content values can not be calculated. Probably the provided reference file '+reference+' does not match with ')
                print('\tthe target file '+bedfilename+'. That is, sequences of regions in the target file are probably not included within the')
                print('\treference file.')
                sys.exit(1)

        else:
            print('Calculating nt content by means of pybedtools...')
            bedfd = pybedtools.BedTool(finalBed.filename)
            bedfd = bedfd.remove_invalid()  # Remove negative coordinates or features with length=0, which do not work with bedtools
            pybedtools._bedtools_installed = True
            pybedtools.set_bedtools_path(BEDTOOLSPATH)
            ntcontent = bedfd.nucleotide_content(reference)

            # Each entry in ntcontent is parsed to extract the gc content of each exon
            # gccontent keys in dictionary: chromosome, exon init, exon end
            gccontent = {}
            for entry in ntcontent:
                gccontent[(entry.fields[0], int(entry.fields[1]), int(entry.fields[2]))] = float(entry.fields[-8])*100
            print('\tDone.')

            # Pybedtools does not work with regions with zero length -> remove them (there are a few of them)
            region_ids = [regionkey for regionkey in coverage.keys() if regionkey[1] != regionkey[2]]

        coveragearray = numpy.array([coverage[region] for region in region_ids])
        gccontentarray = numpy.array([gccontent[region] for region in region_ids])

        xmin = gccontentarray.min()
        xmax = gccontentarray.max()
        ymin = coveragearray.min()
        ymax = coveragearray.max()

        # Perform a kernel density estimator on the results
        X, Y = mgrid[xmin:xmax:100j, ymin:ymax:100j]
        positions = c_[X.ravel(), Y.ravel()]
        values = c_[gccontentarray, coveragearray]
        kernel = stats.kde.gaussian_kde(values.T)
        Z = reshape(kernel(positions.T).T, X.T.shape)

        fig = pyplot.figure(figsize=(6,6))
        ax = fig.add_subplot(111)
        # NOTE(review): the density grid spans [xmin,xmax] but the right limit of the image is fixed at 100.
        # This is kept as in the original code; confirm that it is intended.
        sc = ax.imshow(rot90(Z), cmap=cm.gist_earth_r, extent=[xmin, 100, ymin, ymax], aspect="auto")
        cbar = fig.colorbar(sc, ticks=[numpy.min(Z), numpy.max(Z)])
        cbar.ax.set_yticklabels(['Low','High'])
        cbar.set_label('Density')
        ax.set_xlabel('GC content (%)')
        ax.set_ylabel('Mean coverage')

        # graphtitle is optional: the figure is left without title when it is not provided
        if graphtitle is not None:
            if len(graphtitle) > 25:
                ax.set_title(graphtitle[:25]+'...')
            else:
                ax.set_title(graphtitle)

        fig.savefig(fileout)
        matplotlib.pyplot.close(fig)

        if status is not None:
            meanvalue = gccontentarray.mean()
            status.value = (meanvalue >= 45 and meanvalue <= 55)

    finally:
        # Always give the execution slot back, otherwise processes waiting for it could block forever
        if executiongranted is not None:
            executiongranted.release()
Exemplo n.º 5
0
def _gccontent_own_method(coverage, bedfilename, reference):
    """Task: computes the GC content of the target regions by reading the reference FASTA directly (no bedtools).
    Inputs:
        coverage: per-region mean coverage as returned by region_coverage(). The first item of each key is
            used as the chromosome name.
        bedfilename: target file -> assumes original-standard bed file
        reference: fasta file with reference genome
    Output: dictionary built by merging what measureGCbias() returns for every FASTA sequence whose name
        appears among the chromosomes of "coverage". Empty if no sequence name matches.
    """

    # Only the chromosomes present in the coverage keys need to be loaded from the FASTA file
    targetChromosomes = set(regionKey[0] for regionKey in coverage.keys())

    # Coverage information is in 1-base format, so the BED file must be transformed to base 1
    bed = bed_file.bed_file(bedfilename)
    sortedBed = bed.my_sort_bed()  # Sort bed avoiding bedtools
    nonOverlappingBed = sortedBed.non_overlapping_exons(1)  # Base 1!!! (Non-standard BED)
    finalBed = nonOverlappingBed.my_sort_bed()  # BED file in base 1 (Non-standard BED)
    finalBed.load_custom(-1)  # Load chromosome and positions in base 1

    gccontent = {}
    currentChromosome = ''
    wholeChromosome = ''
    storeSequence = False

    # The context manager guarantees the FASTA handle is closed even if measureGCbias() fails
    with open(reference, 'r') as fastaFile:
        for line in fastaFile:
            if line.startswith('>'):
                # A new sequence starts: flush the previous one if it was being stored
                if storeSequence:
                    gccontent.update(
                        measureGCbias(wholeChromosome, currentChromosome,
                                      finalBed))

                # Header format: >1 dna:chromosome chromosome:GRCh37:1:1:249250621:1
                currentChromosome = re.split(' +', line)[0]
                currentChromosome = currentChromosome.split('>')[1].strip()

                # Keep the sequence only if this chromosome is among the target ones
                storeSequence = currentChromosome in targetChromosomes
                wholeChromosome = ''
            elif storeSequence and '>' not in line:
                # Remove trailing whitespace ('\n') and append to the current sequence
                wholeChromosome += line.rstrip()

        # The last sequence in the file is not followed by another header
        if storeSequence:
            gccontent.update(
                measureGCbias(wholeChromosome, currentChromosome, finalBed))

    return gccontent


def _gccontent_pybedtools(coverage, bedfilename, reference):
    """Task: computes the GC content of the target regions by means of pybedtools (nucleotide_content).
    Inputs:
        coverage: per-region mean coverage as returned by region_coverage(). Keys are indexed as
            (chromosome, init, end).
        bedfilename: target file -> assumes original-standard bed file
        reference: fasta file with reference genome
    Output: tuple (gccontent, region_ids).
        gccontent: dictionary keyed by (chromosome, exon init, exon end) with the value of field -8 of each
            nucleotide_content entry multiplied by 100.
        region_ids: keys of "coverage" whose init and end differ (pybedtools does not work with regions
            with zero length).
    """

    print('Calculating nt content by means of pybedtools...')
    bed = bed_file.bed_file(bedfilename)
    sortedBed = bed.my_sort_bed()  # Sort bed avoiding bedtools
    nonOverlappingBed = sortedBed.non_overlapping_exons(1)  # base one!!!
    finalBed = nonOverlappingBed.my_sort_bed()  # BED file in base 1
    bedfd = pybedtools.BedTool(finalBed.filename)
    # Remove negative coordinates or features with length=0, which do not work with bedtools
    bedfd = bedfd.remove_invalid()
    pybedtools._bedtools_installed = True
    pybedtools.set_bedtools_path(BEDTOOLSPATH)
    ntcontent = bedfd.nucleotide_content(reference)

    # Each entry in ntcontent is parsed to extract the gc content of each exon
    gccontent = {}
    for entry in ntcontent:
        fields = entry.fields
        regionKey = (fields[0], int(fields[1]), int(fields[2]))
        gccontent[regionKey] = float(fields[-8]) * 100
    print('\tDone.')

    # Zero-length regions were dropped by remove_invalid(), so they must not be looked up later
    region_ids = [
        regionKey for regionKey in coverage.keys()
        if regionKey[1] != regionKey[2]
    ]

    return gccontent, region_ids


def gcbias_lite(coveragefile,
                bedfilename,
                reference,
                fileout,
                graphtitle=None,
                executiongranted=None,
                status=None,
                bedTools=False):
    """************************************************************************************************************************************************************
    Task: draws coverage as a function of gc content. IMPROVED VERSION of gcbias that avoids the use of bedtools (pybedtools)
    Inputs:
        coveragefile: string containing the full path of the bam.coverage file to analyze. This file has been built according to 1-base format
        bedfilename: target file -> assumes original-standard bed file
        reference: fasta file with reference genome
        fileout: string containing the full path of the file where the resulting figure will be saved.
        graphtitle: optional title of the figure. Titles longer than 25 characters are truncated and
            followed by '...'. If None, no title is drawn.
        executiongranted: optional object providing acquire()/release(). It is acquired on entry and is
            always released on exit, even if the calculation fails.
        status: optional object whose "value" attribute is set to True when the mean GC content of the
            analyzed regions lies within [45,55], False otherwise.
        bedTools: whether pybedtools are used instead of the own method
    Output: a figure will be saved at "fileout" showing the estimated density of regions as a function of gc
        content and mean coverage. If one region or none is found, a warning is printed and no figure is created.
        The process exits with status 1 if the own method cannot calculate any G+C content value.
    ************************************************************************************************************************************************************"""

    if executiongranted is not None:
        executiongranted.acquire()

    try:
        coverage = region_coverage(
            coveragefile)  # Calculate mean coverage per region

        if len(coverage) <= 1:
            print(
                'WARNING: only one region found in the bed file. Skipping GC bias calculation.'
            )
            return

        if not bedTools:  # Own method
            gccontent = _gccontent_own_method(coverage, bedfilename,
                                              reference)
            region_ids = coverage.keys()

            if len(gccontent) == 0:
                print(
                    'ERROR: G+C content values can not be calculated. Probably the provided reference file '
                    + reference + ' does not match with ')
                print(
                    '\tthe target file ' + bedfilename +
                    '. That is, sequences of regions in the target file are probably not included within the'
                )
                print('\treference file.')
                sys.exit(1)
        else:
            gccontent, region_ids = _gccontent_pybedtools(
                coverage, bedfilename, reference)

        coveragearray = numpy.array(
            [coverage[regionId] for regionId in region_ids])
        # The pybedtools branch stores percentages (value * 100) and the status check below compares
        # against 45 and 55; presumably measureGCbias() uses the same scale -- TODO confirm
        gccontentarray = numpy.array(
            [gccontent[regionId] for regionId in region_ids])

        xmin = gccontentarray.min()
        xmax = gccontentarray.max()
        ymin = coveragearray.min()
        ymax = coveragearray.max()

        # Perform a kernel density estimator on the results
        X, Y = mgrid[xmin:xmax:100j, ymin:ymax:100j]
        positions = c_[X.ravel(), Y.ravel()]
        values = c_[gccontentarray, coveragearray]
        kernel = stats.gaussian_kde(values.T)
        Z = reshape(kernel(positions.T).T, X.T.shape)

        fig = pyplot.figure(figsize=(6, 6))
        try:
            ax = fig.add_subplot(111)
            # NOTE(review): the right edge of the extent is hard-coded to 100 although the density grid
            # only spans [xmin, xmax] -- confirm whether xmax was intended here
            sc = ax.imshow(
                rot90(Z),
                cmap=cm.gist_earth_r,
                extent=[xmin, 100, ymin, ymax],
                aspect="auto")
            cbar = fig.colorbar(sc, ticks=[numpy.min(Z), numpy.max(Z)])
            cbar.ax.set_yticklabels(['Low', 'High'])
            cbar.set_label('Density')
            ax.set_xlabel('GC content (%)')
            ax.set_ylabel('Mean coverage')

            # graphtitle defaults to None: len(None) would raise, so the title is simply skipped
            if graphtitle is not None:
                if len(graphtitle) > 25:
                    ax.set_title(graphtitle[:25] + '...')
                else:
                    ax.set_title(graphtitle)

            fig.savefig(fileout)
        finally:
            # Release the figure even if drawing or saving fails
            matplotlib.pyplot.close(fig)

        if status is not None:
            meanvalue = gccontentarray.mean()
            status.value = (meanvalue >= 45 and meanvalue <= 55)

    finally:
        # Always hand the execution slot back, otherwise processes waiting on it would block forever
        if executiongranted is not None:
            executiongranted.release()
Exemplo n.º 6
0
    def getOffTarget(self,offset,coverageThreshold,target,outfile,tmpdir=None):
        """************************************************************************************************************************************************************
        Task: selects off-tareget(+offset) regions with a coverage >  coverageThreshold
        Inputs:       
            offset: integer indicating the number of bases to extend the target.
            coverageThreshold: integer indicating the coverage threshold to select the region
            target: ROIs bed file
        Ouputs: a new bedgraph file will be created containing selected regions.
        ************************************************************************************************************************************************************"""
              
        pid = str(os.getpid())
        tmpbed = tmpdir+'/'+pid+'.extended.bed'
        
        bed=bed_file.bed_file(target)
        extendedBed=bed.extendnoref(offset,tmpbed)
        sortedBed=extendedBed.my_sort_bed()
        nonOverlappingBed=sortedBed.non_overlapping_exons(-1) # Base 0, it is a standard BED
        finalBed=nonOverlappingBed.my_sort_bed() # BED file in base 0
        finalBed.load_custom(-1) # Load chromosome and positions in base 0                 
        bed_region=finalBed.get_region()
        bed_index=0 #index to control bed_region position
        
        
        fd=file(self.filename)
        header=fd.readline()
        reading=True #boolean to control while loop
        chr_found=False
        batch_n=1
        fdw=file(outfile,'w')
        
        while reading:
            batch,fd=self.get_batch(fd, 10000000)
#            print batch_n
            batch_n=batch_n+1
            
            if batch==[]:
                reading=False
            else:
                for line in batch:
                    aline=line.replace('\n','').split(' ')
                    #new region 
                    r=region.region(aline[0],aline[1],aline[2],aline[3])
                    search_open=True
 
                    while search_open:
                        type_overlap=r.overlap_type(bed_region[bed_index])

                        
                        if type_overlap==0: #bed region comes before bedgraph region
                            search_open=True
                            
                            if bed_index+1<len(bed_region) and (chr_found==False or (chr_found==True and r.chrom==bed_region[bed_index].chrom)):
                                bed_index=bed_index+1
                            elif r.value>=coverageThreshold:
                                search_open=False 
                                for region_selected in r-bed_region[bed_index]:
                                    fdw.write(str(region_selected))
                            else:
                                search_open=False 
                                                               
                        
                        elif type_overlap==-1: #bed region comes after bedgraph region
                            search_open=False
                            chr_found=True
                            if r.value>=coverageThreshold:
                                for region_selected in r-bed_region[bed_index]:
                                    fdw.write(str(region_selected))
                                
                        else:
                            search_open=False
                            chr_found=True
                            if r.value>=coverageThreshold:
                                for region_selected in r-bed_region[bed_index]:
                                    fdw.write(str(region_selected))
        fd.close()