def check_parameters(options, parser):
    """Validate the command line options before the analysis is launched.

    Task: checks that the required files/directories exist and that the values of the remaining
          options can be parsed. At the first problem found an error message is printed and the
          program exits with status 1.
    Inputs:
        options: object holding the parsed options. Attributes read here: bams, bed, out, reference,
                 saturation, nthreads, depthlist, coveragethresholds, feature and tmp.
        parser: option parser. Only its print_help() method is used.
    Outputs:
        True when every check passes. Otherwise sys.exit(1) is called. If the output directory seems
        to contain previous results, the user is asked interactively whether to continue.
    """

    availablefeatures = ['percbases', 'saturation', 'specificity', 'coveragefreq', 'coveragedistr',
                         'coveragestd', 'gcbias', 'coveragecorr']

    # Byte values regarded as "text". A header containing any other byte is considered binary.
    textbytes = set([7, 8, 9, 10, 12, 13, 27] + list(range(0x20, 0x100)))

    def is_binary_string(data):
        return any(byte not in textbytes for byte in bytearray(data))

    def is_file(path):
        return os.path.isfile(path) or os.path.islink(path)

    def is_dir(path):
        return os.path.isdir(path) or os.path.islink(path)

    def fail(*messages):
        for message in messages:
            print(message)
        sys.exit(1)

    # Check number of arguments
    if len(sys.argv) < 7:
        parser.print_help()
        fail('ERROR: --bams, --bed and --out parameters are required.')

    if len(sys.argv) > 21:
        parser.print_help()
        fail('ERROR: too many parameters. Please, check that there are no spaces between commas within the "depthlist" or "coveragethrs" arguments.')

    # --bams: one or two existing binary files with .bam extension
    bams = getattr(options, 'bams', None)
    if bams is None:
        fail('ERROR: at least one bam file is required. Please, input a comma separated list. E.g.: --bams /home/user/bam1.sorted.bam,/home/user/bam2.sorted.bam')

    bamlist = bams.split(',')
    if len(bamlist) > 2:
        fail('ERROR: please make sure that no more than two bam files are provided. Please, input a comma separated list. E.g.: --bams /home/user/bam1.sorted.bam,/home/user/bam2.sorted.bam')

    for bam in bamlist:
        if not is_file(bam):
            fail('ERROR: '+bam+' does not exist.')

        if not bam.endswith('.bam'):
            fail('ERROR: '+bam+' must have .bam extension. Please, make sure that the bam file is appropriately formatted.')

        # Only the first three bytes are inspected. The file is opened in binary mode and closed right away.
        with open(bam, 'rb') as fd:
            header = fd.read(3)
        if not is_binary_string(header):
            fail('ERROR: '+bam+' must be a binary file. Please, make sure that the bam file is appropriately formatted.')

    # --bed: required. Checked explicitly against None because os.path.isfile(None) does not raise AttributeError.
    bed = getattr(options, 'bed', None)
    if bed is None:
        fail('ERROR: the --bed file is a required parameter. Please, provide one bed file indicating target regions to analyze.')
    if not is_file(bed):
        fail('ERROR: '+bed+' does not exist.')

    err = bed_file.bed_file(bed).checkformat()
    if err != '':
        fail('ERROR: incorrect bed file format.', ' '+err)

    # --out: required. Its parent directory must exist.
    out = getattr(options, 'out', None)
    if out is None:
        fail('ERROR: the --out parameter is required. Please, provide full path to an existing directory where results can be saved.')

    outparent = os.path.dirname(out)
    if not is_dir(outparent):
        fail('ERROR: '+outparent+' does not exist.')

    if is_dir(out) and is_dir(out+'/data') and len(glob.glob(out+'/data/*_Ontarget_Coverage.png')) > 0:
        print('WARNING: '+out+' directory seems to contain previous NGScat results. Saving results of current execution in this directory may cause incorrect report generation.')
        print('Continue with current setting? (y/n)')

        proceed = raw_input().lower()
        while proceed != 'y' and proceed != 'n':
            proceed = raw_input().lower()

        if proceed == 'n':
            sys.exit(1)

    if options.reference is not None and not is_file(options.reference):
        fail('ERROR: '+options.reference+' does not exist.')

    if options.saturation != 'y' and options.saturation != 'n':
        fail('ERROR: incorrect value for --saturation parameters. Please indicate "y" or "n".')

    # The parsed values are not needed here. The conversions are only attempted to detect invalid input.
    try:
        int(options.nthreads)
    except ValueError:
        fail('ERROR: invalid value for --nthreads option. Please, provide an integer value. Note that the application will launch as many processess as it needs between 1 and nthreads.')

    if options.depthlist != 'auto':
        try:
            [float(value) for value in options.depthlist.split(',')]
        except ValueError:
            fail('ERROR: invalid values for --depthlist option. Please, provide a comma separated list of values without leaving spaces, e.g.: 1,2,10,20')

    try:
        [float(value) for value in options.coveragethresholds.split(',')]
    except ValueError:
        fail('ERROR: invalid values for --coveragethrs option. Please, provide a comma separated list of values without leaving spaces, e.g.: 1,2,10,20')

    if options.feature is not None and options.feature.lower() not in availablefeatures:
        # The list shown to the user is built from availablefeatures so that it cannot get out of date
        fail('ERROR: '+options.feature+' not available. Please, check that the selected feature is one of the following: '
             + ', '.join(["'"+feature+"'" for feature in availablefeatures]))

    if not is_dir(options.tmp):
        fail('ERROR: '+options.tmp+' does not exist.')

    return True
def ngscat(bamfilenames, originalbedfilename, outdir, reference=None, saturation=False, nthreads=2, extend=None, depthlist='auto',
           coveragethresholds=[1,5,10,20,30], onefeature=None, tmpdir=None):
    """Run the coverage analyses on one or more bam files and, when all of them are run, build the report.

    Task: links, indexes and (if needed) sorts the bam files at the temporary directory, launches the
          analyses through the launch_* functions, waits for them and calls generate_report().
    Inputs:
        bamfilenames: list of strings with the paths of the bam files.
        originalbedfilename: string with the path of the bed file containing the target regions.
        outdir: string with the output directory. It is created, together with its data/ and img/
                subdirectories, when missing.
        reference: fasta file passed to the gc bias analysis, or None.
        saturation: whether the coverage saturation analysis is run when no single feature is selected.
        nthreads: initial value of the semaphore handed to every launched analysis.
        extend: if not None, it is passed to bed_file.extendnoref() to build an extended copy of the bed
                file, which is then used instead of the original one.
        depthlist: 'auto' or the list of depths handed to the saturation analysis. When 'auto' and
                   saturation is requested, five equally spaced values up to the largest read count are used.
        coveragethresholds: thresholds handed to the covered positions analysis (not modified here).
        onefeature: None to run everything, or the name of the single analysis to run. The report is
                    only generated when it is None.
        tmpdir: if not None, existing directory that replaces the global TMP.
    Outputs:
        None. Results are written under outdir. sys.exit(1) is called if tmpdir does not exist or the
        user declines to continue when a link cannot be created.
    """

    global TMP

    if tmpdir is not None:
        if os.path.isdir(tmpdir) or os.path.islink(tmpdir):
            TMP = tmpdir
        else:
            print('ERROR: temporary directory '+tmpdir+' does not exist.')
            print(' Exiting')
            sys.exit(1)

    if not (os.path.isdir(outdir) or os.path.islink(outdir)):
        print('WARNING: '+outdir+' does not exist. Creating directory.')
        os.mkdir(outdir)

    if not (os.path.isdir(outdir+'/data') or os.path.islink(outdir+'/data')):
        print('Creating '+outdir+'/data')
        os.mkdir(outdir+'/data')

    if not (os.path.isdir(outdir+'/img') or os.path.islink(outdir+'/img')):
        print('Creating '+outdir+'/img')
        os.mkdir(outdir+'/img')

    # Identifier used to name temporary files. It is bound here so that it exists even when no bam
    # file needs to be sorted (it used to be undefined in that case, breaking the 'extend' option).
    pid = str(time.time())

    sortedbams = []
    for bamfilename in bamfilenames:
        filelink = TMP+'/'+os.path.basename(bamfilename)

        try:
            os.symlink(bamfilename, filelink)
        except OSError:
            print('WARNING: when trying to create a symbolic link at the temporary directory pointing to '+bamfilename+', a file named '+filelink+' was already found.')
            print(' Probably the temporary and origin directories are the same. The only problem this could cause is that the new index overwrites an existing one.')
            print(' Continue (y/n)?')

            goahead = raw_input()
            if goahead == 'n' or goahead == 'N':
                print('Exiting...')
                sys.exit(1)
            elif goahead != 'y' and goahead != 'Y':
                print('Unknown choice '+goahead)
                print('Exiting...')
                sys.exit(1)

            # The existing file is only replaced when it is not the bam file itself
            if os.path.dirname(bamfilename) != os.path.dirname(TMP+'/'):
                os.remove(filelink)
                os.symlink(bamfilename, filelink)

        print('Indexing...')
        pysam.index(filelink)
        print(' Done.')

        if not bam_file.bam_file(filelink).issorted():
            print('WARNING: '+bamfilename+' is not sorted')
            print('Sorting...')
            pid = str(time.time())
            newsortedbam = TMP+'/'+pid+'.sorted'
            sortedbams.append(newsortedbam+'.bam')
            pysam.sort(filelink, newsortedbam)
            print('Indexing...')
            pysam.index(sortedbams[-1])
            print(' Done.')
        else:
            sortedbams.append(filelink)

    if saturation and depthlist == 'auto':
        maxdepth = max([bam_file.bam_file(bamfilename).nreads() for bamfilename in sortedbams])
        depthlist = numpy.arange(maxdepth/5.0, maxdepth+(maxdepth/5.0)-1, maxdepth/5.0)
        depthlist = depthlist/1000000.0  # Expressed in millions of reads

    legend = [os.path.basename(bamfilename) for bamfilename in bamfilenames]
    executiongranted = multiprocessing.Semaphore(nthreads)

    if extend is not None:
        # Only the file name is appended to TMP. Appending the whole original path would point to
        # a subdirectory of TMP that most probably does not exist.
        bedfilename = TMP+'/'+os.path.basename(originalbedfilename).replace('.bed', '.'+pid+'.extended.bed')
        bed_file.bed_file(originalbedfilename).extendnoref(extend, bedfilename)
    else:
        bedfilename = originalbedfilename

    run_all = onefeature is None
    do_saturation = (saturation and run_all) or onefeature == 'saturation'
    do_specificity = run_all or onefeature == 'specificity'
    do_coveragefreq = run_all or onefeature == 'coveragefreq'
    do_percbases = run_all or onefeature == 'percbases'
    do_coveragedistr = run_all or onefeature == 'coveragedistr'
    do_coveragestd = run_all or onefeature == 'coveragestd'
    do_gcbias = (reference is not None and run_all) or onefeature == 'gcbias'

    # NOTE(review): this call used to be guarded by
    # "onefeature==None or onefeature<>'saturation' or onefeature<>'specificity'", which is always
    # true. It is kept unconditional because Pcoveragebeds and coveragefiles are used below whatever
    # feature is selected. Confirm whether skipping it for 'saturation'/'specificity' was intended.
    Pcoveragebeds,coveragefiles = launch_coveragebed(sortedbams, bedfilename, legend, outdir, executiongranted)

    if do_saturation:
        Psaturation,coverage_saturation_status,saturationslopes = launch_coverage_saturation(sortedbams, bedfilename, depthlist, legend, outdir+'/data/', executiongranted)
    else:
        coverage_saturation_status = None
        saturationslopes = None

    if do_specificity:
        Ponoff_reads,onoff_status,onduplicates,offduplicates,duplicates_status,enrichment,percontarget = launch_onoff_reads(sortedbams, bedfilename, legend, outdir+'/data/', executiongranted)

    # The analyses launched below read the coverage files, so the processes building them must be finished
    for Pcoveragebed in Pcoveragebeds:
        Pcoveragebed.join()
        Pcoveragebed.terminate()

    if do_specificity:
        Poffclusters = launch_offclusters(glob.glob(outdir+'/data/*.bed'), bedfilename, executiongranted)

    if do_coveragefreq:
        Pcoveragedistribution,coveragedistribution_status,meancoverage = launch_coverage_distribution(coveragefiles, outdir+'/data/', legend, executiongranted)

    if do_percbases:
        Pcoveredpositions,coveredpositions_status,coveredbases = launch_covered_positions(coveragefiles, coveragethresholds, outdir+'/data/', legend, executiongranted)

    if do_coveragedistr:
        Pcoveragethroughtarget,throughtarget_status,lowcovbases = launch_coverage_through_target(coveragefiles, outdir+'/data/', legend, executiongranted)

    # Coverage correlation needs at least two samples
    do_coveragecorr = len(coveragefiles) > 1 and (run_all or onefeature == 'coveragecorr')
    if do_coveragecorr:
        Pcoveragecorr,coveragecorr_status,corr = launch_coveragecorr(coveragefiles, outdir+'/data/coveragecorr.png', legend, executiongranted)
    else:
        coveragecorr_status = None
        corr = None

    if do_coveragestd:
        Pcoveragestd,coveragestd_status,coveragestd = launch_coverage_std(coveragefiles, outdir+'/data/', legend, executiongranted)

    if do_gcbias:
        Pgcbias = []
        for i,coveragefile in enumerate(coveragefiles):
            onePgcbias,gcbias_status = launch_gcbias(coveragefile, bedfilename, reference, outdir+'/data/gcbias'+str(i)+'.png', legend[i], executiongranted)
            Pgcbias.append(onePgcbias)

        for onePgcbias in Pgcbias:
            onePgcbias.join()
            onePgcbias.terminate()
    else:
        gcbias_status = None

    # Wait for the remaining analyses
    if do_saturation:
        Psaturation.join()
        Psaturation.terminate()

    if do_coveragefreq:
        Pcoveragedistribution.join()
        Pcoveragedistribution.terminate()

    if do_percbases:
        Pcoveredpositions.join()
        Pcoveredpositions.terminate()

    if do_coveragedistr:
        Pcoveragethroughtarget.join()
        Pcoveragethroughtarget.terminate()

    if do_coveragecorr:
        Pcoveragecorr.join()
        Pcoveragecorr.terminate()

    if do_coveragestd:
        Pcoveragestd.join()
        Pcoveragestd.terminate()

    if do_specificity:
        Ponoff_reads.join()
        Ponoff_reads.terminate()
        Poffclusters.join()
        Poffclusters.terminate()

    # The report needs the results of every analysis, so it is only built when all of them were run
    if run_all:
        generate_report(bamfilenames,sortedbams,originalbedfilename,outdir,coveredpositions_status,coveredbases,coverage_saturation_status,saturationslopes,
                        onoff_status, duplicates_status,onduplicates,offduplicates,coveragedistribution_status,meancoverage,
                        coveragecorr_status,corr,throughtarget_status,lowcovbases,coveragestd_status,coveragestd,gcbias_status,enrichment,percontarget,
                        reference,nthreads,depthlist, coveragethresholds)
def generate_report(bamfilenames,sortedbams,bedfilename,outdir,coveredpositions_status,coveredbases,coverage_saturation_status,saturationslopes,onoff_status,
                    duplicates_status,onduplicates,offduplicates,coveragedistribution_status,meancoverage,
                    coveragecorr_status,corr,throughtarget_status,lowcovbases,coveragestd_status,coveragestd,gcbias_status,enrichment,percontarget,
                    reference,nthreads, depthlist, coveragethresholds):
    """Build the html report and the json summary from the results of the analyses.

    Task: copies icons and style sheet to outdir, fills the captureQC.html template found at DATASRC
          by replacing its placeholders and writes outdir/captureQC.html and outdir/data/summary.json.
    Inputs:
        bamfilenames, sortedbams: lists with the original bam files and the sorted ones (same order).
        bedfilename: string with the path of the target bed file.
        outdir: output directory. Its data/ and img/ subdirectories must already exist.
        *_status: objects whose "value" attribute tells whether the check passed ("ok" icon) or not
                  ("warning" icon). coverage_saturation_status, coveragecorr_status and gcbias_status
                  may be None, in which case the corresponding section is left out.
        coveredbases, saturationslopes, onduplicates, offduplicates, meancoverage, lowcovbases,
        coveragestd, enrichment, percontarget: sequences indexed like bamfilenames.
        corr: object whose "value" attribute is the coverage correlation (read only when
              coveragecorr_status is not None).
        reference, nthreads, depthlist, coveragethresholds: parameters of the run shown in the report.
    Outputs:
        None. Files are written under outdir.
    """

    global TMP

    def read_template(name):
        # Returns the whole content of a file stored at DATASRC
        with open(DATASRC+'/'+name) as fd:
            return fd.read()

    def icon(check):
        # Base name of the image that represents the outcome of a check
        if check.value:
            return 'ok'
        return 'warning'

    for image in ('xls_icon.png', 'txt_icon.png', 'ok.jpg', 'warning.jpg', 'coverage_histogram_example.png'):
        shutil.copy(IMGSRC+'/'+image, outdir+'/img')
    shutil.copy(DATASRC+'/styles.css', outdir)

    with_saturation = coverage_saturation_status is not None
    with_coveragecorr = coveragecorr_status is not None

    # ********************************************************* Input parameters ******************************************************************
    if with_saturation:
        saturationcurve = 'Yes'
    else:
        saturationcurve = 'No'

    # NOTE: bare words are replaced in the template, so the order of these replacements matters
    reportcontent = read_template('captureQC.html')
    reportcontent = reportcontent.replace('bamfilename', ', '.join(bamfilenames))
    reportcontent = reportcontent.replace('bedfilename', bedfilename)
    reportcontent = reportcontent.replace('reportdate', time.ctime())
    reportcontent = reportcontent.replace('reference', str(reference))
    reportcontent = reportcontent.replace('saturationcurve', saturationcurve)
    reportcontent = reportcontent.replace('nthreads', str(nthreads))
    reportcontent = reportcontent.replace('tmpdir', TMP)

    # ********************************************************* Result summary ******************************************************************
    # Read counts are needed both for the json and the html summaries. They are calculated only once.
    nreads = [bam_file.bam_file(sortedbams[i]).nreads() for i in range(len(bamfilenames))]

    # NOTE(review): one json object per bam file is written without any separator or enclosing list,
    # so the file is not valid json when two bam files are analyzed. The format is kept as it is
    # because programs reading this file are not visible from here.
    jsonparts = []
    for i,bam in enumerate(bamfilenames):
        jsonparts.append('{"bamfile":"'+bam+'"')
        jsonparts.append(',"nreads":'+str(nreads[i]))
        jsonparts.append(',"coveredbases":'+str(coveredbases[i]))
        if with_saturation:
            jsonparts.append(',"saturationslope":'+str(saturationslopes[i]))
        jsonparts.append(',"percontarget":'+str(percontarget[i]))
        jsonparts.append(',"onduplicates":'+str(onduplicates[i]))
        jsonparts.append(',"offduplicates":'+str(offduplicates[i]))
        jsonparts.append(',"meancoverage":'+str(meancoverage[i]))
        jsonparts.append(',"lowcovbases":'+str(lowcovbases[i]))
        # nan is left out of the summary
        if not math.isnan(coveragestd[i]):
            jsonparts.append(',"coveragestd":'+str(coveragestd[i]))
        jsonparts.append('}')

    with open(outdir+'/data/summary.json', 'w') as fd:
        fd.write(''.join(jsonparts))

    rows = []
    for i,bam in enumerate(bamfilenames):
        rows.append('<tr>\n')
        rows.append('<td class="table-cell"> '+bam+'</td>')
        rows.append('<td class="table-cell"> '+str(nreads[i])+' </td>')
        rows.append('<td class="table-cell">%.1f'%(coveredbases[i])+'% </td>')
        if with_saturation:
            rows.append('<td class="table-cell">%.1e</td>\n'%saturationslopes[i])
        rows.append('<td class="table-cell">%.1f'%(percontarget[i])+'% </td>\n')
        rows.append(('<td class="table-cell">ON-%.1f%%'%onduplicates[i])+'; OFF: %.1f'%(offduplicates[i])+'% </td>')
        rows.append('<td class="table-cell">%.1fx'%meancoverage[i]+'</td>\n')
        rows.append('<td class="table-cell">%d consecutive bases<br>with coverage <= <WARNCOVERAGETHRESHOLD></td>\n'%(lowcovbases[i]))
        if with_coveragecorr:
            rows.append('<td class="table-cell">%.2f</td>\n'%corr.value)
        rows.append('<td class="table-cell">%.2f</td>\n'%coveragestd[i])
        rows.append('</tr>\n')
    summaryrows = ''.join(rows)

    summarystatus = '<td class="table-header">Overall status</td>\n'
    summarystatus += '<td class="table-header"></td>\n'
    summarystatus += '<td class="table-header"><a href="#targetbases"><img src="img/<TARGETBASESSTATUS>.jpg" height=23px /></a></td>\n'
    if with_saturation:
        summarystatus += '<td class="table-header"><a href="#coveragesaturation"><img src="img/<COVERAGESATURATIONSTATUS>.jpg" height=23px /></a></td>\n'
    summarystatus += '<td class="table-header"><a href="#onoff"><img src="img/<ONOFFSTATUS>.jpg" height=23px /></a></td>\n'
    summarystatus += '<td class="table-header"><a href="#dup"><img src="img/<DUPSTATUS>.jpg" height=23px /></a></td>\n'
    summarystatus += '<td class="table-header"><a href="#distribution"><img src="img/<DISTRIBUTIONSTATUS>.jpg" height=23px /></a></td>\n'
    summarystatus += '<td class="table-header"><a href="#coveragethroughtarget"><img src="img/<COVERAGETHROUGHTARGETSTATUS>.jpg" height=23px /></a></td>\n'
    if with_coveragecorr:
        summarystatus += '<td class="table-header"><a href="#coveragecorr"><img src="img/<COVERAGECORRSTATUS>.jpg" height=23px /></a></td>\n'
    summarystatus += '<td class="table-header"><a href="#coveragestd"><img src="img/<COVERAGESTDSTATUS>.jpg" height=23px /></a></td>\n'

    reportcontent = reportcontent.replace('<SUMMARYROWS>',summaryrows)
    reportcontent = reportcontent.replace('<SUMMARYSTATUS>',summarystatus)

    if with_saturation:
        reportcontent = reportcontent.replace('<SUMMARYSATURATION>','<td class="table-header"><a href="#coveragesaturation">Coverage saturation<br>(slope at the end of the curve)</a></td>')
    else:
        reportcontent = reportcontent.replace('<SUMMARYSATURATION>','')

    if with_coveragecorr:
        reportcontent = reportcontent.replace('<SUMMARYCOVCORRELATION>','<td class="table-header"><a href="#coveragecorr">Coverage correlation<br>per ROI</a></td>')
    else:
        reportcontent = reportcontent.replace('<SUMMARYCOVCORRELATION>','')

    reportcontent = reportcontent.replace('<SUMMARYCOVERAGETHRS>',str(coveragethresholds[0]))
    reportcontent = reportcontent.replace('<SUMMARYTARGETSIZE>',str(bed_file.bed_file(bedfilename).size()))

    # ********************************************************* Detailed results ******************************************************************
    chromosomeimages = ''
    for afile in sorted(glob.glob(outdir+'/data/*_Ontarget_Coverage.png')):
        chromosomeimages += '<a href="data/'+os.path.basename(afile)+'"><img style="width: 33%; float: left;" src="data/'+os.path.basename(afile)+'" /></a>'
    reportcontent = reportcontent.replace('<CHROMOSOMEIMAGES>',chromosomeimages)

    reportcontent = reportcontent.replace('<TARGETBASESSTATUS>',icon(coveredpositions_status))
    reportcontent = reportcontent.replace('<WARNBASESCOVERED>',str(config.warnbasescovered))

    percentagestr = '\n<ul>'
    enrichmentstr = '\n<ul>'
    for i,bamfilename in enumerate(bamfilenames):
        percentagestr += '<li>'+bamfilename+': %.1f'%(percontarget[i])+'%</li>\n'
        enrichmentstr += '<li>'+bamfilename+': %.1f'%(enrichment[i])+'</li>\n'
    percentagestr += '</ul>'
    enrichmentstr += '</ul>'

    reportcontent = reportcontent.replace('<PERCENTAGEONTARGET>', percentagestr)
    reportcontent = reportcontent.replace('<ENRICHMENT>', enrichmentstr)
    reportcontent = reportcontent.replace('<WARNONTARGET>', str(config.warnontarget))
    reportcontent = reportcontent.replace('<ONOFFSTATUS>',icon(onoff_status))

    # NOTE(review): each image is followed by a closing </a> with no opening tag generated here.
    # It is kept because the template, which might open the link, is not visible from here.
    dupimages = ''
    for afile in sorted(glob.glob(outdir+'/data/duplicates*.png')):
        dupimages += '<img style="width: 50%; float: left;" src="data/'+os.path.basename(afile)+'" /></a>'
    reportcontent = reportcontent.replace('<DUPIMAGES>',dupimages)
    reportcontent = reportcontent.replace('<DUPSTATUS>',icon(duplicates_status))

    reportcontent = reportcontent.replace('<WARNMEANCOVERAGE>',str(config.warnmeancoverage))
    reportcontent = reportcontent.replace('<DISTRIBUTIONSTATUS>',icon(coveragedistribution_status))

    if with_coveragecorr:
        reportcontent = reportcontent.replace('<COVERAGECORRCONTENT>',read_template('coveragecorr_content.html'))
        reportcontent = reportcontent.replace('<WARNCOVERAGECORRELATION>',str(config.warncoveragecorrelation))
        reportcontent = reportcontent.replace('<COVERAGECORRSTATUS>',icon(coveragecorr_status))
    else:
        reportcontent = reportcontent.replace('<COVERAGECORRCONTENT>','\n')

    reportcontent = reportcontent.replace('<WARNCOVERAGEREGION>',str(config.warncoverageregion))
    reportcontent = reportcontent.replace('<WARNCOVERAGETHRESHOLD>',str(config.warncoveragethreshold))
    reportcontent = reportcontent.replace('<COVERAGETHROUGHTARGETSTATUS>',icon(throughtarget_status))

    reportcontent = reportcontent.replace('<WARNSTD>',str(config.warnstd))
    reportcontent = reportcontent.replace('<COVERAGESTDSTATUS>',icon(coveragestd_status))

    if with_saturation:
        # Depths are shown as "a x10^6, b x10^6 and c x10^6"
        depthstr = 'x10<sup>6</sup>, '.join(map(str,depthlist[:-1]))+'x10<sup>6</sup> and '+str(depthlist[-1])+'x10<sup>6</sup>'
        reportcontent = reportcontent.replace('<SATURATIONCONTENT>',read_template('saturation_content.html'))
        reportcontent = reportcontent.replace('<DEPTHLIST>',depthstr)
        reportcontent = reportcontent.replace('depthlist',str(depthlist)[1:-1])
        reportcontent = reportcontent.replace('<WARNSATURATION>',str(config.warnsaturation))
        reportcontent = reportcontent.replace('<COVERAGESATURATIONSTATUS>',icon(coverage_saturation_status))
    else:
        reportcontent = reportcontent.replace('<SATURATIONCONTENT>','\n').replace('depthlist','None')

    reportcontent = reportcontent.replace('coveragethrs', ', '.join(map(str, coveragethresholds)))

    if gcbias_status is not None:
        reportcontent = reportcontent.replace('<GCBIASCONTENT>',read_template('gcbias_content.html'))
        gcbiasimages = ''
        # Sorted, as the other image lists, so that the order of the samples does not depend on the file system
        for afile in sorted(glob.glob(outdir+'/data/gcbias*.png')):
            gcbiasimages += '<img style="width:40%" src="data/'+os.path.basename(afile)+'" />'
        reportcontent = reportcontent.replace('<GCBIASIMAGES>', gcbiasimages)
    else:
        reportcontent = reportcontent.replace('<GCBIASCONTENT>','\n')

    with open(outdir+'/captureQC.html', 'w') as fd:
        fd.write(reportcontent)

    print('Results written at '+outdir)
def gcbias_lite(coveragefile, bedfilename, reference, fileout, graphtitle=None, executiongranted=None, status=None, bedTools=False):
    """Draw coverage as a function of gc content.

    Task: draws coverage as a function of gc content. IMPROVED VERSION of gcbias that avoids the use of
          bedtools (pybedtools) unless bedTools is True.
    Inputs:
        coveragefile: string containing the full path of the bam.coverage file to analyze. This file has
                      been built according to 1-base format.
        bedfilename: target file -> assumes original-standard bed file.
        reference: fasta file with reference genome.
        fileout: string containing the full path of the file where the resulting figure will be saved.
        graphtitle: title of the graph, or None to leave the graph untitled. Only its first 25
                    characters are shown.
        executiongranted: semaphore-like object (acquire/release) used to limit concurrency, or None.
        status: object whose "value" attribute is set to whether the mean gc content lies within
                [45,55], or None.
        bedTools: whether pybedtools are used instead of the own method.
    Outputs:
        a file named "fileout" will be created where a graph that compares gc content and mean coverage
        is saved. If gc content can not be calculated with the own method, sys.exit(1) is called.
    """

    if executiongranted is not None:
        executiongranted.acquire()

    # try/finally guarantees that the semaphore is released even if the analysis fails or exits
    try:
        coverage = region_coverage(coveragefile)  # Calculate mean coverage per region

        if len(coverage) > 1:
            if not bedTools:  # Own method
                # Chromosomes to be examined (the ones contained in the target file)
                chromosomes = set([currentKey[0] for currentKey in coverage.keys()])

                # Load BED file -> since coverage information is in 1-base format, BED format must be transformed to 1-base
                bed = bed_file.bed_file(bedfilename)
                sortedBed = bed.my_sort_bed()  # Sort bed avoiding bedtools
                nonOverlappingBed = sortedBed.non_overlapping_exons(1)  # Base 1!!! This generates a BED file in base 1 (Non-standard BED)
                finalBed = nonOverlappingBed.my_sort_bed()  # BED file in base 1 (Non-standard BED)
                finalBed.load_custom(-1)  # Load chromosome and positions in base 1....(finalBed is in base 1 -> Non-standard BED)

                # Load FASTA file
                gccontent = {}
                storeSequence = False
                currentChromosome = ''
                sequenceChunks = []  # Lines of the chromosome being read. Joined once per chromosome.
                with open(reference, 'r') as fastaFile:
                    for line in fastaFile:
                        if line.startswith('>'):  # New chromosome starts
                            if storeSequence:  # A chromosome has been read: measure its gc content
                                gccontent.update(measureGCbias(''.join(sequenceChunks), currentChromosome, finalBed))
                                storeSequence = False

                            # Format: >1 dna:chromosome chromosome:GRCh37:1:1:249250621:1
                            currentChromosome = re.split(' +', line)[0]
                            currentChromosome = currentChromosome.split('>')[1].strip()

                            # Only chromosomes present in the target file are stored
                            if currentChromosome in chromosomes:
                                storeSequence = True
                                sequenceChunks = []
                        elif not re.search('>', line) and storeSequence:
                            sequenceChunks.append(line.rstrip())

                    if storeSequence:  # For the last chromosome
                        gccontent.update(measureGCbias(''.join(sequenceChunks), currentChromosome, finalBed))

                region_ids = coverage.keys()

                if len(gccontent) == 0:
                    print('ERROR: G+C content values can not be calculated. Probably the provided reference file '+reference+' does not match with ')
                    print(' the target file '+bedfilename+'. That is, sequences of regions in the target file are probably not included within the')
                    print(' reference file.')
                    sys.exit(1)
            else:
                print('Calculating nt content by means of pybedtools...')
                bed = bed_file.bed_file(bedfilename)
                sortedBed = bed.my_sort_bed()  # Sort bed avoiding bedtools
                nonOverlappingBed = sortedBed.non_overlapping_exons(1)  # base one!!!
                finalBed = nonOverlappingBed.my_sort_bed()  # BED file in base 1
                bedfd = pybedtools.BedTool(finalBed.filename)
                bedfd = bedfd.remove_invalid()  # Remove negative coordinates or features with length=0, which do not work with bedtools
                pybedtools._bedtools_installed = True
                pybedtools.set_bedtools_path(BEDTOOLSPATH)
                ntcontent = bedfd.nucleotide_content(reference)

                # Each entry in ntcontent is parsed to extract the gc content of each exon.
                # gccontent keys: chromosome, exon init, exon end
                gccontent = {}
                for entry in ntcontent:
                    gccontent[(entry.fields[0], int(entry.fields[1]), int(entry.fields[2]))] = float(entry.fields[-8])*100
                print(' Done.')

                # Pybedtools does not work with regions with zero length -> remove them (there are a few of them)
                region_ids = [currentKey for currentKey in coverage.keys() if currentKey[1] != currentKey[2]]

            coveragearray = numpy.array([coverage[region_id] for region_id in region_ids])
            gccontentarray = numpy.array([gccontent[region_id] for region_id in region_ids])

            xmin = gccontentarray.min()
            xmax = gccontentarray.max()
            ymin = coveragearray.min()
            ymax = coveragearray.max()

            # Perform a kernel density estimator on the results
            X, Y = mgrid[xmin:xmax:100j, ymin:ymax:100j]
            positions = c_[X.ravel(), Y.ravel()]
            values = c_[gccontentarray, coveragearray]
            kernel = stats.kde.gaussian_kde(values.T)
            Z = reshape(kernel(positions.T).T, X.T.shape)

            fig = pyplot.figure(figsize=(6,6))
            ax = fig.add_subplot(111)
            # NOTE(review): the density is evaluated on [xmin,xmax] but drawn on [xmin,100] - confirm this is intended
            sc = ax.imshow(rot90(Z), cmap=cm.gist_earth_r, extent=[xmin, 100, ymin, ymax], aspect="auto")
            cbar = fig.colorbar(sc, ticks=[numpy.min(Z), numpy.max(Z)])
            cbar.ax.set_yticklabels(['Low','High'])
            cbar.set_label('Density')
            ax.set_xlabel('GC content (%)')
            ax.set_ylabel('Mean coverage')

            # graphtitle defaults to None, in which case the graph is left untitled
            if graphtitle is not None:
                if len(graphtitle) > 25:
                    ax.set_title(graphtitle[:25]+'...')
                else:
                    ax.set_title(graphtitle)

            fig.savefig(fileout)
            matplotlib.pyplot.close(fig)

            if status is not None:
                meanvalue = gccontentarray.mean()
                status.value = (meanvalue >= 45 and meanvalue <= 55)
        else:
            print('WARNING: only one region found in the bed file. Skipping GC bias calculation.')
    finally:
        if executiongranted is not None:
            executiongranted.release()
def gcbias_lite(coveragefile, bedfilename, reference, fileout, graphtitle=None, executiongranted=None, status=None, bedTools=False):
    """Draw coverage as a function of gc content.

    Task: draws coverage as a function of gc content. IMPROVED VERSION of gcbias that avoids the use of
          bedtools (pybedtools) unless bedTools is True.
    Inputs:
        coveragefile: string containing the full path of the bam.coverage file to analyze. This file has
                      been built according to 1-base format.
        bedfilename: target file -> assumes original-standard bed file.
        reference: fasta file with reference genome.
        fileout: string containing the full path of the file where the resulting figure will be saved.
        graphtitle: title of the graph, or None to leave the graph untitled. Only its first 25
                    characters are shown.
        executiongranted: semaphore-like object (acquire/release) used to limit concurrency, or None.
        status: object whose "value" attribute is set to whether the mean gc content lies within
                [45,55], or None.
        bedTools: whether pybedtools are used instead of the own method.
    Outputs:
        a file named "fileout" will be created where a graph that compares gc content and mean coverage
        is saved. If gc content can not be calculated with the own method, sys.exit(1) is called.
    """

    if executiongranted is not None:
        executiongranted.acquire()

    # try/finally guarantees that the semaphore is released even if the analysis fails or exits
    try:
        coverage = region_coverage(coveragefile)  # Calculate mean coverage per region

        if len(coverage) > 1:
            if not bedTools:  # Own method
                # Chromosomes to be examined (the ones contained in the target file)
                chromosomes = set([currentKey[0] for currentKey in coverage.keys()])

                # Load BED file -> since coverage information is in 1-base format, BED format must be transformed to 1-base
                bed = bed_file.bed_file(bedfilename)
                sortedBed = bed.my_sort_bed()  # Sort bed avoiding bedtools
                nonOverlappingBed = sortedBed.non_overlapping_exons(1)  # Base 1!!! This generates a BED file in base 1 (Non-standard BED)
                finalBed = nonOverlappingBed.my_sort_bed()  # BED file in base 1 (Non-standard BED)
                finalBed.load_custom(-1)  # Load chromosome and positions in base 1....(finalBed is in base 1 -> Non-standard BED)

                # Load FASTA file
                gccontent = {}
                storeSequence = False
                currentChromosome = ''
                sequenceChunks = []  # Lines of the chromosome being read. Joined once per chromosome.
                with open(reference, 'r') as fastaFile:
                    for line in fastaFile:
                        if line.startswith('>'):  # New chromosome starts
                            if storeSequence:  # A chromosome has been read: measure its gc content
                                gccontent.update(measureGCbias(''.join(sequenceChunks), currentChromosome, finalBed))
                                storeSequence = False

                            # Format: >1 dna:chromosome chromosome:GRCh37:1:1:249250621:1
                            currentChromosome = re.split(' +', line)[0]
                            currentChromosome = currentChromosome.split('>')[1].strip()

                            # Only chromosomes present in the target file are stored
                            if currentChromosome in chromosomes:
                                storeSequence = True
                                sequenceChunks = []
                        elif not re.search('>', line) and storeSequence:
                            sequenceChunks.append(line.rstrip())

                    if storeSequence:  # For the last chromosome
                        gccontent.update(measureGCbias(''.join(sequenceChunks), currentChromosome, finalBed))

                region_ids = coverage.keys()

                if len(gccontent) == 0:
                    print('ERROR: G+C content values can not be calculated. Probably the provided reference file ' + reference + ' does not match with ')
                    print(' the target file ' + bedfilename + '. That is, sequences of regions in the target file are probably not included within the')
                    print(' reference file.')
                    sys.exit(1)
            else:
                print('Calculating nt content by means of pybedtools...')
                bed = bed_file.bed_file(bedfilename)
                sortedBed = bed.my_sort_bed()  # Sort bed avoiding bedtools
                nonOverlappingBed = sortedBed.non_overlapping_exons(1)  # base one!!!
                finalBed = nonOverlappingBed.my_sort_bed()  # BED file in base 1
                bedfd = pybedtools.BedTool(finalBed.filename)
                bedfd = bedfd.remove_invalid()  # Remove negative coordinates or features with length=0, which do not work with bedtools
                pybedtools._bedtools_installed = True
                pybedtools.set_bedtools_path(BEDTOOLSPATH)
                ntcontent = bedfd.nucleotide_content(reference)

                # Each entry in ntcontent is parsed to extract the gc content of each exon.
                # gccontent keys: chromosome, exon init, exon end
                gccontent = {}
                for entry in ntcontent:
                    gccontent[(entry.fields[0], int(entry.fields[1]), int(entry.fields[2]))] = float(entry.fields[-8]) * 100
                print(' Done.')

                # Pybedtools does not work with regions with zero length -> remove them (there are a few of them)
                region_ids = [currentKey for currentKey in coverage.keys() if currentKey[1] != currentKey[2]]

            coveragearray = numpy.array([coverage[region_id] for region_id in region_ids])
            gccontentarray = numpy.array([gccontent[region_id] for region_id in region_ids])

            xmin = gccontentarray.min()
            xmax = gccontentarray.max()
            ymin = coveragearray.min()
            ymax = coveragearray.max()

            # Perform a kernel density estimator on the results
            X, Y = mgrid[xmin:xmax:100j, ymin:ymax:100j]
            positions = c_[X.ravel(), Y.ravel()]
            values = c_[gccontentarray, coveragearray]
            kernel = stats.kde.gaussian_kde(values.T)
            Z = reshape(kernel(positions.T).T, X.T.shape)

            fig = pyplot.figure(figsize=(6, 6))
            ax = fig.add_subplot(111)
            # NOTE(review): the density is evaluated on [xmin,xmax] but drawn on [xmin,100] - confirm this is intended
            sc = ax.imshow(rot90(Z), cmap=cm.gist_earth_r, extent=[xmin, 100, ymin, ymax], aspect="auto")
            cbar = fig.colorbar(sc, ticks=[numpy.min(Z), numpy.max(Z)])
            cbar.ax.set_yticklabels(['Low', 'High'])
            cbar.set_label('Density')
            ax.set_xlabel('GC content (%)')
            ax.set_ylabel('Mean coverage')

            # graphtitle defaults to None, in which case the graph is left untitled
            if graphtitle is not None:
                if len(graphtitle) > 25:
                    ax.set_title(graphtitle[:25] + '...')
                else:
                    ax.set_title(graphtitle)

            fig.savefig(fileout)
            matplotlib.pyplot.close(fig)

            if status is not None:
                meanvalue = gccontentarray.mean()
                status.value = (meanvalue >= 45 and meanvalue <= 55)
        else:
            print('WARNING: only one region found in the bed file. Skipping GC bias calculation.')
    finally:
        if executiongranted is not None:
            executiongranted.release()
def getOffTarget(self, offset, coverageThreshold, target, outfile, tmpdir=None):
    """************************************************************************************************************************************************************
    Task: selects off-target(+offset) regions with a coverage >= coverageThreshold
    Inputs:
        offset: integer indicating the number of bases to extend the target.
        coverageThreshold: integer indicating the coverage threshold to select the region
        target: ROIs bed file
        outfile: path of the file where the selected regions are written.
        tmpdir: directory where the extended target bed ('<pid>.extended.bed') is created. If None, the
            directory returned by tempfile.gettempdir() is used.
    Outputs: a new bedgraph file will be created containing selected regions.
    ************************************************************************************************************************************************************"""

    # The original default (tmpdir=None) crashed on None+'/'; fall back to the system temporary directory.
    if tmpdir is None:
        import tempfile
        tmpdir = tempfile.gettempdir()

    pid = str(os.getpid())
    tmpbed = tmpdir + '/' + pid + '.extended.bed'

    # Extend the target by offset, then sort / merge / sort to get the list of regions to subtract
    bed = bed_file.bed_file(target)
    extendedBed = bed.extendnoref(offset, tmpbed)
    sortedBed = extendedBed.my_sort_bed()
    nonOverlappingBed = sortedBed.non_overlapping_exons(-1)  # Base 0, it is a standard BED
    finalBed = nonOverlappingBed.my_sort_bed()  # BED file in base 0
    finalBed.load_custom(-1)  # Load chromosome and positions in base 0
    bed_region = finalBed.get_region()
    # NOTE(review): if get_region() returns an empty list, bed_region[bed_index] below raises IndexError.
    # Confirm that callers always provide a target with at least one region.

    bed_index = 0  # index to control bed_region position
    chr_found = False  # becomes True once a bed region not located before a bedgraph region has been seen

    fd = open(self.filename)
    try:
        fd.readline()  # The first line is read and discarded (the original code stored it as "header")

        fdw = open(outfile, 'w')
        try:
            while True:
                # get_batch returns the batch of lines together with the file object to keep reading from
                batch, fd = self.get_batch(fd, 10000000)
                if batch == []:
                    break

                for line in batch:
                    aline = line.replace('\n', '').split(' ')
                    # new region
                    r = region.region(aline[0], aline[1], aline[2], aline[3])

                    # Move through bed_region until the current bed region is no longer before r
                    while True:
                        current = bed_region[bed_index]

                        if r.overlap_type(current) == 0:  # bed region comes before bedgraph region
                            can_advance = bed_index + 1 < len(bed_region) and \
                                (not chr_found or r.chrom == current.chrom)
                            if can_advance:
                                bed_index = bed_index + 1
                                continue
                        else:
                            # -1 (bed region comes after bedgraph region) and any other value were handled
                            # by two identical branches in the original code; they are merged here.
                            chr_found = True

                        # Search finished for this bedgraph region: write what remains after removing the
                        # current bed region, provided that coverage reaches the threshold
                        if r.value >= coverageThreshold:
                            for region_selected in r - current:
                                fdw.write(str(region_selected))
                        break
        finally:
            # Make sure that selected regions are flushed to disk even if an error interrupts the scan
            fdw.close()
    finally:
        fd.close()