def bam_stats(bam_file):
    """
    Wrapper for the pysam SAMtools flagstat function

    Parameters
    ----------
    bam_file : str
        Location of the bam file

    Returns
    -------
    list : dict
        qc_passed : int
        qc_failed : int
        description : str
    """
    results = pysam.flagstat(bam_file)  # pylint: disable=no-member
    separate_results = results.strip().split("\n")
    return [
        {
            "qc_passed": int(element[0]),
            "qc_failed": int(element[2]),
            "description": " ".join(element[3:])
        }
        for element in [row.split(" ") for row in separate_results]
    ]
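# Usage sketch for bam_stats (not part of the original source). Assumes an indexed
# BAM named "example.bam" and a pysam version whose flagstat() returns the report
# as a single string, which is what the wrapper above expects.
import pysam

stats = bam_stats("example.bam")
# The entry whose description contains "in total" is the first flagstat line.
total = next(entry for entry in stats if "in total" in entry["description"])
print(total["qc_passed"], "QC-passed reads")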
def buildBAMStats(infile, outfile):
    '''calculate bamfile statistics
    '''
    # no bedToBigBed
    # to_cluster = True
    outs = open(outfile, "w")
    outs.write("reads\tcategory\n")
    for line in pysam.flagstat(infile):
        data = line[:-1].split(" ")
        outs.write("%s\t%s\n" % (data[0], " ".join(data[1:])))

    pysam_in = pysam.Samfile(infile, "rb")

    outs_dupl = open(outfile + ".duplicates", "w")
    outs_dupl.write("contig\tpos\tcounts\n")

    outs_hist = open(outfile + ".histogram", "w")
    outs_hist.write("duplicates\tcounts\tcumul\tfreq\tcumul_freq\n")

    last_contig, last_pos = None, None
    ninput, nduplicates = 0, 0

    duplicates = collections.defaultdict(int)
    counts = collections.defaultdict(int)
    count = 0

    for read in pysam_in.fetch():
        ninput += 1
        if read.rname == last_contig and read.pos == last_pos:
            count += 1
            nduplicates += 1
            continue
        if count > 1:
            outs_dupl.write("%s\t%i\t%i\n" % (last_contig, last_pos, count))
            counts[count] += 1
        count = 1
        last_contig, last_pos = read.rname, read.pos

    outs.write("%i\tduplicates (%5.2f%%)\n" %
               (nduplicates, 100.0 * nduplicates / ninput))
    outs.write("%i\twithout duplicates (%5.2f%%)\n" %
               (ninput - nduplicates, 100.0 * (ninput - nduplicates) / ninput))
    pysam_in.close()
    outs.close()
    outs_dupl.close()

    keys = counts.keys()
    # count per position (not the same as nduplicates, which is # of reads)
    c = 0
    total = sum(counts.values())
    for k in sorted(keys):
        c += counts[k]
        outs_hist.write("%i\t%i\t%i\t%f\t%f\n" %
                        (k, counts[k], c,
                         100.0 * counts[k] / total,
                         100.0 * c / total))
    outs_hist.close()
def is_paired_sequencing(bamfile):
    # TODO: this is scary. Should check for unpaired being 0, and number paired == total number
    r = pysam.flagstat(bamfile)
    paired = int(r[5].split()[0])
    return paired != 0
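# Note: the snippet above indexes the flagstat output like a list of lines, which
# matches older pysam releases. A hedged variant for pysam versions where flagstat()
# returns a single string looks up the "paired in sequencing" line by label instead
# of by fixed position (assumption: string-returning pysam).
def is_paired_sequencing_from_text(bamfile):
    for line in pysam.flagstat(bamfile).splitlines():
        if "paired in sequencing" in line:
            return int(line.split()[0]) > 0
    return False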
def proc_sam(arg):
    samfile = arg[0]
    rmdup = arg[1]
    #se = arg[2]
    print samfile
    print rmdup
    sam_dir = "/".join(samfile.split("/")[:-1]) + "/"
    sam_prefix = os.path.basename(samfile).split(".sam")[0]
    mapped_sam = sam_dir + sam_prefix + "_mapped.sam"
    rmdup_sam = sam_dir + sam_prefix + "_rmdup.sam"
    sort_sam = sam_dir + sam_prefix + "_sort"

    if not os.path.exists(mapped_sam):
        print "Removing unmapped..."
        sam = pysam.Samfile(samfile, 'r')
        mb = pysam.Samfile(mapped_sam, 'w', template=sam)
        for read in sam:
            if not read.is_unmapped:
                mb.write(read)
        mb.close()
        print "Finished removing unmapped."

    if not os.path.exists(rmdup_sam) and rmdup == "True":
        print "Removing duplicates..."
        pysam.rmdup("-S", mapped_sam, rmdup_sam)
        os.remove(mapped_sam)
        print "Sorting..."
        pysam.sort(rmdup_sam, sort_sam)
        os.remove(rmdup_sam)
    else:
        print "Sorting..."
        pysam.sort(mapped_sam, sort_sam)
        os.remove(mapped_sam)

    print "Indexing..."
    sort_sam = sort_sam + ".sam"
    pysam.index(sort_sam)

    samfile_fs = open(samfile + "_stat", 'w')
    for line in pysam.flagstat(samfile):
        samfile_fs.write(line)
    samfile_fs.close()

    sort_sam_fs = open(sort_sam + "_stat", 'w')
    for line in pysam.flagstat(sort_sam):
        sort_sam_fs.write(line)
    sort_sam_fs.close()
def __init__(self, bamname, outpath, window_size):
    self.bamname = bamname
    self.bamfile = pysam.Samfile(bamname, 'rb')
    self.outpath = outpath
    self.window_size = atoi(window_size)
    self.nreads = atoi(pysam.flagstat(bamname)[0].split()[0])
    self.chr_lengths = self.bamfile.lengths
    self.chrs_queue = []
    for index in range(self.bamfile.nreferences):
        self.chrs_queue.append((self.bamfile.references[index],
                                self.bamfile.lengths[index]))
def calculateChromdata(samfile, ontarget):
    sys.stdout.write('\rFinalizing analysis ... 0.0%')
    sys.stdout.flush()

    chromdata = dict()
    chroms = samfile.references
    chromsres = []
    alltotal = 0
    allon = 0
    alloff = 0
    i = 0
    for chrom in chroms:
        total = sum(1 for _ in samfile.fetch(chrom))
        if 'chr' + chrom in ontarget.keys():
            on = int(ontarget['chr' + chrom])
            off = total - on
        else:
            on = 0
            off = 0
        chromsres.append({'CHROM': chrom, 'RC': total, 'RCIN': on, 'RCOUT': off})
        alltotal += total
        allon += on
        alloff += off
        i += 1
        x = round(100 * i / len(chroms), 1)
        x = min(x, 100.0)
        sys.stdout.write('\rFinalizing analysis ... ' + str(x) + '%')
        sys.stdout.flush()

    chromdata['Chroms'] = chromsres
    chromdata['Mapped'] = {'RC': alltotal, 'RCIN': allon, 'RCOUT': alloff}

    allreads = pysam.flagstat(options.input)[0]
    allreads = allreads[:allreads.find('+')]
    allreads = int(allreads.strip())

    chromdata['Total'] = allreads
    chromdata['Unmapped'] = allreads - alltotal

    sys.stdout.write('\rFinalizing analysis ... 100.0% - Done')
    sys.stdout.flush()
    print ''
    return chromdata
def run_flagstat(bamfile):
    stats = {}
    for l in pysam.flagstat(bamfile):
        if 'QC-passed' in l:
            stats['QC-passed reads'] = l.rstrip().split(' ', 1)[0]
        elif 'mapped' in l:
            stats['% Mapped'] = perc_from_flagstat_line(l)
        elif 'properly paired' in l:
            stats['% Properly paired'] = perc_from_flagstat_line(l)
            break
    sys.stderr.write("{}\n".format(stats))
    return stats
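# perc_from_flagstat_line() is referenced above but not shown. A plausible, purely
# hypothetical implementation pulls the percentage out of the parenthesised part of a
# flagstat line such as "4985 + 0 mapped (99.70% : N/A)".
import re

def perc_from_flagstat_line(line):
    match = re.search(r'\(([\d.]+%)', line)
    return match.group(1) if match else 'N/A'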
def get_samtools_flagstat(bam_file):
    '''
    Runs samtools flagstat to get read metrics
    '''
    logging.info('samtools flagstat...')
    results = pysam.flagstat(bam_file, split_lines=True)
    flagstat = ''
    mapped_reads = 0  # default in case no "mapped" line is found
    for line in results:
        logging.info(line.strip())
        flagstat += line
        if "mapped" in line and "mate" not in line:
            mapped_reads = int(line.split('+')[0].strip())
    return flagstat, mapped_reads
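# Usage sketch (not from the original source). Assumes "sample.bam" exists and that
# the installed pysam accepts the split_lines=True keyword used above.
import logging

logging.basicConfig(level=logging.INFO)
flagstat_text, mapped_reads = get_samtools_flagstat("sample.bam")
logging.info("%d mapped reads", mapped_reads)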
def getBamReadMean(regions, bam_file, non_nan):
    # bam file should be samtools indexed
    samfile = pysam.AlignmentFile(bam_file, "rb")
    if os.path.isfile(bam_file + '.reads'):
        # total number of reads in a genome
        with open(bam_file + '.reads') as f:
            reads = int(f.readline().rstrip())
    else:
        # if genomic reads not provided, normalize with the given bam file
        print 'Please wait, getting total number of (properly paired) mapped reads...'
        print 'assuming bam file is for whole genome region'
        flags = pysam.flagstat(bam_file)
        for flag in flags:
            if (flag.split(' ')[3] == 'properly' and flag.split(' ')[4] == 'paired'):
                reads = (int(flag.split(' ')[0]) + int(flag.split(' ')[2]))
                break
    reads = reads / 1000000.0
    print reads, ' millions reads'
    Profile = []
    if non_nan == 1:
        # average over non_nan region
        for region in regions:
            sumRegion = 0
            sumNonNan = 0
            iter = samfile.pileup(str(region.chrom), region.start, region.stop)
            for x in iter:
                if (x.reference_pos < region.stop) and (x.reference_pos >= region.start):
                    #sumRegion+=x.nsegments
                    #if not x.nsegments ==0:
                    #    sumNonNan+=1
                    npp = 0
                    for y in x.pileups:
                        if y.alignment.is_proper_pair:
                            # counting only properly paired
                            sumRegion += 1
                            npp += 1
                    if npp > 0:
                        sumNonNan += 1
            Profile.append((sumRegion / reads) / sumNonNan)
    else:
        # average over whole region, default
        for region in regions:
            sumRegion = 0
            iter = samfile.pileup(str(region.chrom), region.start, region.stop)
            for x in iter:
                if (x.reference_pos < region.stop) and (x.reference_pos >= region.start):
                    #sumRegion+=x.nsegments
                    for y in x.pileups:
                        if y.alignment.is_proper_pair:
                            sumRegion += 1
            Profile.append((sumRegion / reads) / (region.stop - region.start))
    return Profile
def calculateChromdata_minimal(samfile):
    print ''
    sys.stdout.write('\rRunning analysis ... 0.0%')
    sys.stdout.flush()

    chromdata = dict()
    chroms = samfile.references
    chromsres = []
    alltotal = 0
    allon = 0
    alloff = 0
    i = 0
    for chrom in chroms:
        total = sum(1 for _ in samfile.fetch(chrom))
        chromsres.append({'CHROM': chrom, 'RC': total})
        alltotal += total
        i += 1
        x = round(100 * i / len(chroms), 1)
        x = min(x, 100.0)
        sys.stdout.write('\rRunning analysis ... ' + str(x) + '%')
        sys.stdout.flush()

    chromdata['Chroms'] = chromsres
    chromdata['Mapped'] = {'RC': alltotal}

    allreads = pysam.flagstat(options.input)[0]
    allreads = allreads[:allreads.find('+')]
    allreads = int(allreads.strip())

    chromdata['Total'] = allreads
    chromdata['Unmapped'] = allreads - alltotal

    sys.stdout.write('\rRunning analysis ... 100.0% - Done')
    sys.stdout.flush()
    print ''
    return chromdata
def proc(arg):
    bamfile = arg[0]
    rmdup = arg[1]
    errorlog = arg[2]
    if errorlog == "stderr":
        errorlog = sys.stderr
    if rmdup == "False":
        rmdup = False
    bam_dir = "/".join(bamfile.split("/")[:-1]) + "/"
    bam_prefix = os.path.basename(bamfile).split(".bam")[0]
    mapped_bam = bam_dir + bam_prefix + "_mapped.bam"
    rmdup_bam = bam_dir + bam_prefix + "_rmdup.bam"
    sort_bam = bam_dir + bam_prefix + "_sort"
    stat_dir = bam_dir + "stat/"
    if not os.path.exists(stat_dir):
        os.makedirs(stat_dir)

    if not os.path.exists(mapped_bam):
        print >> errorlog, "Removing unmapped..."
        mapped = 0
        unmapped = 0
        bam = pysam.Samfile(bamfile, 'rb')
        mb = pysam.Samfile(mapped_bam, 'wb', template=bam)
        try:
            for read in bam:
                if not read.is_unmapped:
                    mapped = mapped + 1
                    mb.write(read)
                else:
                    unmapped = unmapped + 1
        except:
            errorlog.write("Failed to remove unmapped reads: read number {0}\n".format(mapped + unmapped))
            raise
        else:
            errorlog.write("Unmapped read removal successful: Mapped {0}/Unmapped {1}\n".format(mapped, unmapped))
        bam.close()
        mb.close()

    if not os.path.exists(sort_bam + ".bam"):
        print >> errorlog, "Sorting..."
        try:
            cmd_args = ['java', '-Xmx2g', '-jar', '/seq/picard/SortSam.jar',
                        "=".join(["INPUT", mapped_bam]),
                        "=".join(["OUTPUT", sort_bam + ".bam"]),
                        "=".join(["SORT_ORDER", "coordinate"])]
            p = Popen(cmd_args, stdout=errorlog, stderr=errorlog)
            p.wait()
        except:
            errorlog.write("Sorting failed.\n")
            raise
        else:
            os.remove(mapped_bam)

    if not os.path.exists(rmdup_bam) and rmdup:
        print "Removing duplicates..."
        rmdup_metrics = stat_dir + bam_prefix + "_rmdup_metrics"
        cmd_args = ['java', '-Xmx2g', '-jar', '/seq/picard/MarkDuplicates.jar',
                    "=".join(["INPUT", sort_bam + ".bam"]),
                    "=".join(["OUTPUT", rmdup_bam]),
                    "=".join(["METRICS_FILE", rmdup_metrics]),
                    "=".join(["REMOVE_DUPLICATES", "true"]),
                    "=".join(["ASSUME_SORTED", "true"])]
        try:
            p = Popen(cmd_args, stdout=errorlog, stderr=errorlog)
            p.wait()
        except:
            errorlog.write("Failed to remove duplicates.\n")
            raise
        try:
            print >> errorlog, "Indexing..."
            pysam.index(rmdup_bam)
        except SamtoolsError as detail:
            print >> errorlog, "Indexing failed: ", detail
    else:
        try:
            print >> errorlog, "Indexing..."
            sort_bam = sort_bam + ".bam"
            pysam.index(sort_bam)
        except SamtoolsError as detail:
            print >> errorlog, "Indexing failed: ", detail

    bamfile_fs = open(bam_dir + "stat/" + bam_prefix + "_stat", 'w')
    for line in pysam.flagstat(bamfile):
        bamfile_fs.write(line)
    bamfile_fs.close()
    return 0
def get_coverage(bamfile, match_string):
    """Get total reads using samtools flagstat
    """
    o = pysam.flagstat(bamfile)
    total_reads = int([s for s in o.split('\n') if match_string in s][0].split()[0])
    return total_reads
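# Usage sketch (hypothetical file name). match_string selects which flagstat line to
# read: "in total" gives all reads, "properly paired" would give that count instead.
total_reads = get_coverage("sample.bam", "in total")
print(total_reads)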
def testWithoutRedirectedStdout(self):
    r = pysam.flagstat(os.path.join(DATADIR, "ex1.bam"),
                       catch_stdout=False)
    self.assertTrue(len(r) == 0)
def testWithRedirectedStdout(self):
    r = pysam.flagstat(os.path.join(DATADIR, "ex1.bam"))
    self.assertTrue(len(r) > 0)
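# The two tests above contrast catch_stdout=False (the report goes to the terminal and
# the return value is empty) with the default, where the report is captured and
# returned. A small sketch of parsing the captured report, hedged to cope with pysam
# versions that return either a single string or a list of lines:
r = pysam.flagstat(os.path.join(DATADIR, "ex1.bam"))
lines = r.splitlines() if isinstance(r, str) else list(r)
print(lines[0])  # e.g. "... in total (QC-passed reads + QC-failed reads)"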
def main():
    """The main function"""
    current_dir = os.getcwd()
    #mypath = current_dir + "/bams/"
    mypath = current_dir + "/alignments/"
    #bed_file = "/vlsci/UOM0040/shared/djp/rover_file/crc/CRC_10g_23May16.final.rover.bed"
    fastq_dir = current_dir + "/fastqs/"
    config_file = "pipeline.config"
    with open(config_file, 'r') as stream:
        try:
            bed_file = yaml.load(stream)['target_bed']
        except yaml.YAMLError as exc:
            print("Error with config file: " + str(exc))

    onlyfiles = []
    for root, dirs, files in os.walk(mypath):
        for file in files:
            if file.endswith(".primary.primerclipped.bam"):
                current_file = mypath + str(file)
                onlyfiles.append(os.path.join(root, file))
    #onlyfiles = [files for files in listdir(mypath) if (isfile(join(mypath, files)) and (files.endswith('.primary.primerclipped.bam')))]
    #file_paths = [join(mypath,files) for files in listdir(mypath) if (isfile(join(mypath, files)) and (files.endswith('.bam')))]
    #onlyfiles = [files for files in listdir(mypath) if (files.endswith(''))]
    #print onlyfiles
    #print len(onlyfiles)

    # stats list
    header = '\t'.join([
        'Sample_ID', 'Total_fastq_reads', 'Primary_reads', 'Reads_mapping_to_genome',
        'Reads_mapping_to_target', 'Percent_reads_mapping_to_genome',
        'Percent_reads_mapping_to_target', 'Average_depth',
        'Percent_target_not_covered', 'Percent_target_covered_at_<10X',
        'Percent_target_covered_at_10X', 'Percent_target_covered_at_20X',
        'Percent_target_covered_at_50X', 'Median_depth',
        'Percent_target_covered_at_median',
        'Percent_target_covered_at_median_10_fold', 'Percent_target_covered_at_median_20_fold',
        'Percent_target_covered_at_median_30_fold', 'Percent_target_covered_at_median_40_fold',
        'Percent_target_covered_at_median_50_fold'])
    #, 'Percent_target_covered_at_q50',
    #'Percent_target_covered_at_q60', 'Percent_target_covered_at_q70', 'Percent_target_covered_at_q80'])
    #header = "Sample\tTotal_reads\tMapped_reads"
    print header

    for bam_file in onlyfiles:
        current_bam_file = join(mypath, bam_file)
        temp_bam_file = os.path.basename(current_bam_file)
        sample = temp_bam_file.replace(".primary.primerclipped.bam", "")
        fastq1 = fastq_dir + sample + "_L01_R1_001.fastq"
        fastq2 = fastq_dir + sample + "_L01_R2_001.fastq"
        fastq1_lc = int(subprocess.check_output(["wc", "-l", fastq1]).lstrip(' ').split(' ')[0])
        fastq2_lc = int(subprocess.check_output(["wc", "-l", fastq2]).lstrip(' ').split(' ')[0])
        total_fastq_lines = fastq1_lc + fastq2_lc
        total_fastq_reads = total_fastq_lines / 4

        flagstats = pysam.flagstat(current_bam_file)
        all_reads = int(flagstats.split('\n')[0].split('+')[0])
        reads_mapping_to_genome = int(flagstats.split('\n')[5].split('+')[0])

        x = pybedtools.example_bedtool(current_bam_file)
        b = pybedtools.example_bedtool(bed_file)
        y = x.intersect(b).moveto(join(mypath, 'temp.bam'))
        c = b.coverage(x)

        average_depth = calculate_average_depth(c)
        median_depth = calculate_median_depth(c)
        percent_target_not_covered = calculate_zero_depth_intervals(c, 0)
        percent_target_covered_at_L10X = calculate_zero_depth_intervals(c, 10)
        percent_target_covered_at_10X = calculate_x_depth_intervals(c, 10)
        percent_target_covered_at_20X = calculate_x_depth_intervals(c, 20)
        percent_target_covered_at_50X = calculate_x_depth_intervals(c, 50)
        percent_target_covered_at_median = calculate_x_depth_intervals_folds(c, median_depth, median_depth)

        # Using percentage from median
        #percent_target_covered_at_median_X10 = calculate_x_depth_intervals_folds(c, (median_depth - median_depth * (10.0/100)), (median_depth + median_depth * (10.0/100)))
        '''
        percent_target_covered_at_median_10_fold = calculate_x_depth_intervals_folds(c, (median_depth - (median_depth * 0.10)), (median_depth + (median_depth * 0.10)))
        percent_target_covered_at_median_20_fold = calculate_x_depth_intervals_folds(c, (median_depth - (median_depth * 0.20)), (median_depth + (median_depth * 0.20)))
        percent_target_covered_at_median_50_fold = calculate_x_depth_intervals_folds(c, (median_depth - (median_depth * 0.50)), (median_depth + (median_depth * 0.50)))
        percent_target_covered_at_median_60_fold = calculate_x_depth_intervals_folds(c, (median_depth - (median_depth * 0.60)), (median_depth + (median_depth * 0.60)))
        percent_target_covered_at_median_70_fold = calculate_x_depth_intervals_folds(c, (median_depth - (median_depth * 0.70)), (median_depth + (median_depth * 0.70)))
        percent_target_covered_at_median_80_fold = calculate_x_depth_intervals_folds(c, (median_depth - (median_depth * 0.80)), (median_depth + (median_depth * 0.80)))
        '''
        percent_target_covered_at_median_10_fold = calculate_x_depth_intervals(c, (median_depth / 10))
        percent_target_covered_at_median_20_fold = calculate_x_depth_intervals(c, (median_depth / 20))
        percent_target_covered_at_median_30_fold = calculate_x_depth_intervals(c, (median_depth / 30))
        percent_target_covered_at_median_40_fold = calculate_x_depth_intervals(c, (median_depth / 40))
        percent_target_covered_at_median_50_fold = calculate_x_depth_intervals(c, (median_depth / 50))

        stats_temp = pysam.flagstat(join(mypath, 'temp.bam'))
        on_target_reads = int(stats_temp.split('\n')[0].split('+')[0])
        reads_mapping_to_target = int(stats_temp.split('\n')[5].split('+')[0])

        #percent_reads_mapping_to_genome = ((reads_mapping_to_genome * 1.0)/all_reads)*100.0
        percent_reads_mapping_to_genome = ((reads_mapping_to_genome * 1.0) / total_fastq_reads) * 100.0
        #percent_reads_mapping_to_target = ((reads_mapping_to_target * 1.0)/on_target_reads)*100.0
        percent_reads_mapping_to_target = ((reads_mapping_to_target * 1.0) / total_fastq_reads) * 100.0

        os.remove(join(mypath, 'temp.bam'))

        print("%s\t%d\t%d\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f" %
              (sample, total_fastq_reads, all_reads, reads_mapping_to_genome, reads_mapping_to_target,
               percent_reads_mapping_to_genome, percent_reads_mapping_to_target, average_depth,
               percent_target_not_covered, percent_target_covered_at_L10X, percent_target_covered_at_10X,
               percent_target_covered_at_20X, percent_target_covered_at_50X,
               median_depth, percent_target_covered_at_median,
               percent_target_covered_at_median_10_fold, percent_target_covered_at_median_20_fold,
               percent_target_covered_at_median_30_fold, percent_target_covered_at_median_40_fold,
               percent_target_covered_at_median_50_fold))
def main():
    # Argument parsing
    parser = argparse.ArgumentParser(description="Calculate useful stats for "
                                     "capture-based experiments.")
    parser.add_argument("bam_file", help="Input BAM file (sorted, rmduped and indexed). "
                        "Assumes that the BAM file contains reads of one length.")
    parser.add_argument("bed_file", help="BED file listing target intervals.")
    parser.add_argument("--output", help="Output TSV stats file.")
    parser.add_argument("--exclude", help="BED file listing regions to be excluded. "
                        "Useful for pooled libraries sharing the same multiplex index.")
    args = parser.parse_args()

    # Initialize variables
    inbam = pysam.AlignmentFile(args.bam_file, "rb")
    inbed = cancer_api.files.BedFile.open(args.bed_file)
    excl_bed = cancer_api.files.BedFile.open(args.exclude) if args.exclude else None
    output = args.output if args.output else inbam.filename + ".capture_stats.tsv"
    stats = OrderedDict()

    # Define helper functions
    def count_interval_reads(bam_file, interval, viewed_reads={}):
        """Counts the number of reads in a given interval in a BAM file.
        Adds viewed reads to dictionary to avoid double-counting of reads.
        """
        reads_iterator = inbam.fetch(interval.chrom, interval.start_pos, interval.end_pos)
        read_qnames = [r.query_name for r in reads_iterator
                       if r.query_name not in viewed_reads and not r.is_duplicate]
        num_reads = len(read_qnames)
        for qname in read_qnames:
            viewed_reads[qname] = 1
        return num_reads

    # Calculate overall coverage
    read_length = inbam.head(1).next().query_length
    # Using flagstat in order to not count duplicate reads
    total_index = 0
    dups_index = 3
    flagstat_regex = r"(\d+).*"
    flagstat = pysam.flagstat(inbam.filename)
    num_total = int(re.match(flagstat_regex, flagstat[total_index]).group(1))
    num_dups = int(re.match(flagstat_regex, flagstat[dups_index]).group(1))
    genome_num_mapped = num_total - num_dups
    # Calculate genome length
    genome_length = sum(inbam.lengths)
    if excl_bed:
        # Correct for excluded regions
        excl_length = 0
        excl_num_mapped = 0
        for interval in excl_bed:
            excl_length += interval.length
            excl_num_mapped += count_interval_reads(inbam, interval)
        genome_num_mapped -= excl_num_mapped
        genome_length -= excl_length
    genome_cov = read_length * genome_num_mapped / float(genome_length)
    stats["Genome_Coverage"] = round(genome_cov, 3)

    # Calculate on-target coverage
    target_length = 0
    target_num_mapped = 0
    viewed_reads = {}
    for interval in inbed:
        target_length += interval.length
        target_num_mapped += count_interval_reads(inbam, interval, viewed_reads)
    target_cov = read_length * target_num_mapped / float(target_length)
    stats["Target_Coverage"] = round(target_cov, 3)

    # Calculate percent on-target
    percent_on_target = target_num_mapped / float(genome_num_mapped) * 100
    stats["Percent_Reads_On_Target"] = round(percent_on_target, 3)

    # Calculate percent fold enrichment
    percent_fold_enrichment = target_cov / genome_cov * 100
    stats["Percent_Fold_Enrichment"] = round(percent_fold_enrichment, 3)

    # Write out stats to file
    with open(output, "w") as outfile:
        for k, v in stats.items():
            stats[k] = str(v)
        outfile.write("\t".join(stats.keys()) + "\n")
        outfile.write("\t".join(stats.values()) + "\n")

    # Cleanup
    inbam.close()
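# Caution on the fixed indices used above (not from the original source): total_index = 0
# is the "in total" line in every samtools release, but the position of the "duplicates"
# line differs across versions, since newer samtools inserts "secondary"/"supplementary"
# (and later "primary") lines before it. A label-based lookup is a safer sketch, assuming
# a pysam whose flagstat() returns a single string.
import re
import pysam

def flagstat_count(bam_path, label):
    for line in pysam.flagstat(bam_path).splitlines():
        if label in line:
            return int(re.match(r"(\d+)", line).group(1))
    return 0

# num_total = flagstat_count(args.bam_file, "in total")
# num_dups  = flagstat_count(args.bam_file, "duplicates")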
                    default=872674, help='Seed for the sampler.')
args = parser.parse_args()

if args.number_reads and args.same_depth:
    logging.warning("--number-reads was passed; ignoring --same-depth")
    args.same_depth = False
if args.number_reads is None and not args.same_depth:
    logging.error("--number-reads or --same-depth must be passed")
    sys.exit(1)

# Determine the number of QC-passed reads in each of the bam files
logging.info("Determining the number of QC-passed reads in each file")
flagstats = [pysam.flagstat(x).split('\n')[0] for x in args.bam]
number_reads = [
    int(re.match(
        '\A(\d+) \+ (\d+) in total \(QC-passed reads \+ QC-failed reads\)\Z',
        i).group(1))
    for i in flagstats
]
for index, bam in enumerate(args.bam):
    logging.info("Found {} reads in {}".format(number_reads[index], bam))

# Now perform the subsampling
for index, bam in enumerate(args.bam):
    bam_prefix = re.match('\A(.*).bam\Z', os.path.basename(bam)).group(1)
    subsample_fraction = None
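# Worked example (made-up numbers, not from the original source) of the regex used
# above on a typical first flagstat line: the first capture group is the QC-passed count.
import re

line = "337749 + 123 in total (QC-passed reads + QC-failed reads)"
m = re.match(r'\A(\d+) \+ (\d+) in total \(QC-passed reads \+ QC-failed reads\)\Z', line)
print(int(m.group(1)))  # 337749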
def main(options): """ Runs all analysies one thing to do is make graphs fail gracefully """ #from subprocess import Popen, PIPE #host = Popen(["hostname"], stdout=PIPE).communicate()[0].strip() #gets clusters in a bed tools + names species clusters = options.clusters species = options.species clusters_bed = pybedtools.BedTool(clusters) #makes output file names clusters = str.replace(clusters, ".BED", "") options.k = map(int, options.k) outdir = options.outdir #sets up output dirs make_dir(outdir) assigned_dir = os.path.join(outdir, "assigned") misc_dir = os.path.join(outdir, "misc") fastadir = os.path.join(outdir, "fasta") kmerout = os.path.join(outdir, "kmer") homerout_base = os.path.join(outdir, "homer") make_dir(homerout_base) homerout = os.path.join(homerout_base, clusters) make_dir(assigned_dir) make_dir(misc_dir) make_dir(fastadir) make_dir(homerout) make_dir(kmerout) all_regions = (["all", "exon", "UTR3", "UTR5", "proxintron", "distintron"]) #Not quite sure whats going on here, but its one logical block #either reassigns clusters to genic regions or reads from already #made assigned lists if options.assign is False: try: cluster_regions, sizes, Gsizes = build_assigned_from_existing( assigned_dir, clusters, all_regions, options.nrand) print "I used a pre-assigned set of BED files... score!" except: print "I had problems retreiving region-assigned BED files from %s, i'll rebuild" % ( assigned_dir) options.assign = True if options.assign is True: print "Assigning Clusters to Genic Regions" cluster_regions, sizes, Gsizes = assign_to_regions( clusters_bed, options.genome_location, options.regions_location, species=species, getseq=True, nrand=options.nrand) print "Done Assigning" print "Saving BED and Fasta Files...", #outputs little files (maybe move inside of assign to regions) sizes_out = open( os.path.join(assigned_dir, "%s.sizes.pickle" % (clusters)), 'w') pickle.dump(sizes, file=sizes_out) sizes_out.close() Gsizes_out = open(os.path.join(assigned_dir, "Gsizes.pickle"), 'w') pickle.dump(Gsizes, file=Gsizes_out) Gsizes_out.close() #this is where all saving happens for assign to regions for region in all_regions: of = clusters + "." + region + ".real.BED" try: cluster_regions[region]['real'].saveas( os.path.join(assigned_dir, of)) except: continue for n in range(options.nrand): of = clusters + "." + region + ".rand." 
+ str(n) + ".BED" try: cluster_regions[region]['rand'][n].saveas( os.path.join(assigned_dir, of)) except: continue print "done" #creates pretty file names for all regions for region in all_regions: try: real_fa = fa_file(clusters, region=region, fd=fastadir, type="real") rand_fa = fa_file(clusters, region=region, fd=fastadir, type="random") cluster_regions[region]['real'].save_seqs(real_fa) l = list() #list of randoms for n in cluster_regions[region]['rand'].keys(): l.append(cluster_regions[region]['rand'][n]) write_seqs(rand_fa, l) except: continue print "Counting reads in clusters...", #generates data for figure 1 and 2 #gets reads in clusters (figure 1) #gets reads per cluster (figure 2) reads_in_clusters = 0 reads_per_cluster = list() for cluster in cluster_regions['all']['real']: chr, start, stop, name, score, strand, tstart, tstop = str( cluster).strip().split("\t") try: gene, n, reads = name.split("_") except: try: gene, n, reads = name.split(";")[0].split("_") except: pass if int(reads) > 1: reads_per_cluster.append(int(reads)) reads_in_clusters += int(reads) print "done" #need to get rid of this pickleing busniess, its a waste of space and doesn't work with other methods #gets total number of reads (figure 1) #gets total number of reads from clipper analysis (Need to make clipper automatically output #pickle file print "Getting total number of reads...", total_reads = 0 try: pickle_file = clusters + ".pickle" if os.path.exists(pickle_file): pf = pickle.load(open(pickle_file, 'rb')) else: print "Couldn't find %s" % (pickle_file) print "Found %s" % (pickle_file) for gene in pf: total_reads += gene['nreads'] #if clipper didn't output gets it from flagstat except: print "Couldn't find a pickled file, resorting to flagstat for total reads. (this includes intergenic reads)" flagstats = pysam.flagstat(options.bam) total_reads = int(flagstats[2].split(" ")[0]) print "done, there were %d" % (total_reads) print "Gathering bed lengths...", #one stat is just generated here #generates cluster lengths (figure 3) cluster_lengths = bedlengths(cluster_regions['all']['real']) print "done" ##This should be abstracted to some sort of list or something... 
#figures 5 and 6, builds pre-mrna, mrna exon and intron distributions mRNA_positions = list() premRNA_positions = list() intron_positions = list() exon_positions = list() #also builds figure 10 (exon distances) GENES, Gtypes = build_AS_STRUCTURE_dict(species, options.as_structure) types = {} for type in ["CE:", "SE:", "MXE:", "A5E:", "A3E:"]: types[type] = 0 print "locating clusters within genes", try: #counts nearest exon to peak and gets RNA #gets rna positon for every line as well for line in (cluster_regions['all']['real']): mRNA_frac, premRNA_frac, exon_frac, intron_frac, nearest_type = RNA_position( line, GENES) if mRNA_frac is not None: mRNA_positions.append(mRNA_frac) if exon_frac is not None: exon_positions.append(exon_frac) if premRNA_frac is not None: premRNA_positions.append(premRNA_frac) if intron_frac is not None: intron_positions.append(intron_frac) if nearest_type is not None: try: types[nearest_type] += 1 except: types[nearest_type] = 1 except: print "there were errors, skipping" print "done" #gtypes is total genomic content #types is what clusters are #generates figure 10 (exon distances) type_count = [ types["CE:"], types["SE:"], types["MXE:"], types["A5E:"], types["A3E:"] ] Gtype_count = [ Gtypes["CE:"], Gtypes["SE:"], Gtypes["MXE:"], Gtypes["A5E:"], Gtypes["A3E:"] ] ### write fasta files and run homer and/or kmer analysis if at least one analysis is requested #runs kmer and homer analysis kmer_results = {} if options.reMotif is True: for region in all_regions: #reads nicely named files real_fa = fa_file(clusters, region=region, fd=fastadir, type="real") rand_fa = fa_file(clusters, region=region, fd=fastadir, type="random") if options.k is not None: if options.homer is True: region_homer_out = os.path.join(homerout, region) run_homer(real_fa, rand_fa, options.k, outloc=region_homer_out) for k in options.k: kmer_results[k] = {} kmer_results[k][region] = kmerdiff(real_fa, rand_fa, k) kmerfile = clusters + ".k" + str( k) + "." + region + ".kmerdiff" kmerfile = os.path.join(kmerout, kmerfile) kmer_sorted_output = run_kmerdiff(real_fa, rand_fa, outfile=kmerfile, k=k) #all the different motifs that the user specifices motifs = list(options.motif) kmer_box_params = [kmerout, clusters, options.k, motifs] ###conservation --should use multiprocessing to speed this part up! #start of conservation logic, very slow... phast_values = list() #loads phastcons values of generates them again if options.rePhast is False: try: phast_values = pickle.load( open(os.path.join(misc_dir, "%s.phast.pickle" % (clusters)))) except: options.rePhast = True #generates again if options.rePhast is True: print "Fetching Phastcons Scores...", #phastcons values for all regions except "all" for region in all_regions[1:]: #skip "all" combine them later print("%s..." 
% (region)), try: samplesize = 1000 #because it takes so long to fetch only select 1000 of them, not actually #implemented if len(cluster_regions[region]['real']) > samplesize: R1 = cluster_regions[region]['real'] # R1 = random.sample(cluster_regions[region]['real'], samplesize) else: R1 = cluster_regions[region]['real'] #realPhast = get_phastcons(cluster_regions[region]['real'], species=options.species) print "getting real...", #gets phastcons values real regions realPhast = get_phastcons(R1, options.phastcons_location, species=options.species) randPhast = list() #logic for random stuff (could be precomputed) for i in range(options.nrand): if len(cluster_regions[region]['rand'][i]) > samplesize: R2 = cluster_regions[region]['rand'][i] #R2 = random.sample(cluster_regions[region]['rand'][i], samplesize) else: R2 = cluster_regions[region]['rand'][i] print("getting rand %d" % (i)), randPhast.extend( get_phastcons(R2, options.phastcons_location, species=options.species).tolist()) #list of lists for real and random for every genic region phast_values.append(realPhast) phast_values.append(randPhast) except: continue #hacky selection of real values from phast_values all_real = np.concatenate(phast_values[::2]) #hacky selection of random values from phast_values all_rand = np.concatenate(phast_values[1::2]) #adds back in all and rand to phast_values list phast_values.insert(0, all_rand) phast_values.insert(0, all_real) pickout = open(os.path.join(misc_dir, "%s.phast.pickle" % (clusters)), 'w') pickle.dump(phast_values, file=pickout) Zscores = None #old. remove #build qc figure QCfig_params = [ reads_in_clusters, (total_reads - reads_in_clusters), cluster_lengths, reads_per_cluster, premRNA_positions, mRNA_positions, exon_positions, intron_positions, Gsizes, sizes, Gtype_count, type_count, Zscores, homerout, kmer_box_params, phast_values ] #save results pickout = open( os.path.join(outdir, "misc", "%s.qcfig_params.pickle" % (clusters)), 'w') pickle.dump(QCfig_params, file=pickout) QCfig = CLIP_Analysis_Display.CLIP_QC_figure(*QCfig_params) fn = clusters + ".QCfig.pdf" outFig = os.path.join(outdir, fn) #TODO Fix output of file (Don't know why its crashing right now print >> sys.stderr, outFig QCfig.savefig(outFig) ### does something with motifs doesn't appear to work right now #reads in existing precompiled motif file motifs = list(options.motif) if motifs is not None and False: #TODO hack to get stuff compiling fix soon motifBASE = options.motif_location fig = pylab.figure(figsize=(8.5, 11)) colors = [ "red", "orange", "green", "blue", "purple", "brown", "black", "pink", "gray", "cyan", "magenta" ] for i, motif in enumerate(motifs): mf = "motif_" + motif + ".BED" mfgz = "motif_" + motif + ".BED.gz" print os.path.join(motifBASE, species, mf) motifFILE = None if os.path.exists(os.path.join(motifBASE, species, mf)): motifFILE = os.path.join(motifBASE, species, mf) elif os.path.exists(os.path.join(motifBASE, species, mfgz)): motifFILE = os.path.join(motifBASE, species, mfgz) else: print "MOTIF BED FILE for motif: %s is not available, please build it" % ( mf) continue #plots motif distance from the precompiled file to the clusters plot_motif_dist(cluster_regions, motifFILE, fig, color=colors[i], species=species, slopsize=200) pylab.savefig(clusters + ".motif_distribution.pdf")
def has_reads_mapped(bam):
    return int(pysam.flagstat(bam).split('\n')[4][0]) > 0
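# The one-liner above reads only the first character of the fifth line of the report,
# which works as a zero/non-zero check only if that line really is the mapped-reads
# line; the line order differs between samtools versions. A label-based sketch
# (assuming a pysam whose flagstat() returns a single string):
def has_reads_mapped_by_label(bam):
    for line in pysam.flagstat(bam).split('\n'):
        if 'mapped (' in line:
            return int(line.split()[0]) > 0
    return False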
def main():
    """Inputs:  -- Expression output table file
                -- Transcript/gene symbol containing column number
                -- Reads per Tx/gene containing column number
                -- Bam file
                -- List of transcript/gene symbols
                -- Name of condition/sample
    Outputs:    -- table of transcript/gene symbols, condition/sample name, original read count,
                   and read counts normalized to each million mapped reads."""

    usage = """python %prog [options]"""
    parser = optparse.OptionParser(usage=usage)

    parser.add_option('-t', type="string", default=False,
                      help="""Name of a table file containing Tx/Gene symbols and respective
                      read counts for at least one condition/sample. (default=%default)""")
    parser.add_option('-c', type='string', default=False,
                      help="""Name of the condition/sample. (default=%default)""")
    parser.add_option('--tx-col', dest="tx_col", type='string', default=False,
                      help="""The column number containing Tx/Gene symbols. (default=%default)""")
    parser.add_option('--rd-col', dest="rd_col", type='string', default=False,
                      help="""The column number containing read counts. (default=%default)""")
    parser.add_option('-b', type='string', default=False,
                      help="""Path to the bam file representing the aligned reads for the
                      desired condition/sample. (default=%default)""")
    parser.add_option('-l', type='string', default="All",
                      help="""Quoted, comma-delim'd list of transcript/gene symbols. (default=%default)""")

    (opts, args) = parser.parse_args()

    if len(sys.argv) == 1:
        parser.print_help()
        exit(1)

    if not opts.l == "All":
        opts.l = opts.l.split(',')
    else:
        opts.l = ''

    opts.tx_col = int(opts.tx_col)
    opts.rd_col = int(opts.rd_col)

    # get million mapped reads
    milMapped = float(pysam.flagstat(opts.b)[3].split()[0]) / 1000000
    #milMapped = 7.5  # for quick debugging to bybass the bam filtering

    # parse list of tx/gene symbols into dict for tracking successes
    ##symbols = {}
    ##if not opts.l == "All":
    ##    for sym in opts.l:
    ##        symbols[sym] = []

    # open expression table into rows
    rows = csv.reader(open(opts.t), delimiter='\t')

    # for each tx/gene in expFile: output million mapped reads (MMR)
    # if row[opts.tx_col].startswith(<any of the requested symbols>)
    # and update symbols with True if a hit is found.
    print "Tx_symbol\tCondition\tOriginal_reads\tReads_per_million_mapped"
    for row in rows:
        if row[opts.tx_col].startswith(tuple(opts.l)):
            try:
                print "%s\t%s\t%s\t%s" % (row[opts.tx_col], opts.c, row[opts.rd_col],
                                          float(row[opts.rd_col]) / milMapped)
            except:
                print "failed: %s" % (';'.join(row))
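# Worked example (made-up numbers, not from the original source) of the
# reads-per-million-mapped normalization performed in the loop above:
mapped_reads = 15000000               # e.g. parsed from the flagstat "mapped" line
milMapped = mapped_reads / 1000000.0  # 15.0 million mapped reads
raw_count = 450                       # reads assigned to one transcript
print(raw_count / milMapped)          # -> 30.0 reads per million mapped reads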
def main():
    args = args_setup()
    path_exists(args.input)
    path_dir(args.output)
    check_ext(args.input)

    if args.mapread_stats:
        mapping_stats(args.input)
    if (args.bam_filter):
        bam_filter(args.input, args.ref_match, args.map_qual, args.read_qual,
                   args.pident, args.read_len)
    # Check the case where --bam-filter was not selected but default arguments were changed:
    if (args.map_qual or args.read_qual or args.pident or args.read_len != 0
            or args.ref_match != '.*'):
        # if this is True, the default arguments were changed; next, check whether --bam-filter was selected:
        if (not args.bam_filter):
            # --bam-filter was not selected although filter arguments were changed
            sys.exit("Error: -b/--bam_filter argument is missing")
        # otherwise --bam-filter was selected and runs with the changed arguments
    if (args.bamtofastq):
        bam_to_fastq(args.input)
    if (args.mapped_reads):
        extract_mapped_reads(args.input)
    if (args.fastq_concat):
        fastq_concat(args.input)
    if (args.fastq_fasta):
        fastq_to_fasta(args.input)
    if (args.fastq_filter):
        fastq_filter(args.input, args.fastq_filter)
    if (args.vcf_filter):
        vcf_filter(args.input, args.vcf_filter)

    # several samtools commands (samtools functionality via pysam)
    file_name = os.path.split(args.input[0])[1]  # obtain only the file name without the path
    if args.sort:
        pos_args_len(args.input, 1)
        ps.sort("-o", file_name + ".sorted.bam", args.input[0])
        print("Sorted bam file <" + file_name + ".sorted.bam" + "> was created")
    if args.index:
        pos_args_len(args.input, 1)
        ps.index(args.input[0])
        print("Index file <" + file_name + ".bai" + "> was created")
    if args.flagstat:
        pos_args_len(args.input, 1)
        print("Reads mapping summary for %s" % (file_name))
        print(ps.flagstat(args.input[0]))
def main(options): # from subprocess import Popen, PIPE host = Popen(["hostname"], stdout=PIPE).communicate()[0].strip() #print host #print mpl.get_backend() #print mpl.get_backend() clusters = options.clusters species = options.species CLUSTERS = pybedtools.BedTool(clusters) clusters = str.replace(clusters, ".BED", "") options.k= map(int, options.k) outdir = options.outdir def make_dir(dir_name): if not os.path.exists(dir_name): os.mkdir(dir_name) make_dir(outdir) assigned_dir = os.path.join(outdir, "assigned") misc_dir = os.path.join(outdir, "misc") fastadir = os.path.join(outdir, "fasta") kmerout = os.path.join(outdir, "kmer") homerout_base = os.path.join(outdir, "homer") make_dir(homerout_base) homerout = os.path.join(homerout_base, clusters) make_dir(assigned_dir) make_dir(misc_dir) make_dir(fastadir) make_dir(homerout) make_dir(kmerout) all_regions = (["all", "exon", "UTR3", "UTR5", "proxintron", "distintron"]) def fa_file(filename, region = None, fd=fastadir, type= "real"): if not os.path.exists(fd): raise Exception if region is not None: x =filename+"."+ region+ "."+ type+ ".fa" return os.path.join(fd, x) else: x = filename+ "."+ type + ".fa" return os.path.join(fd, x) if options.assign is False: try: CLUS_regions, sizes, Gsizes = build_assigned_from_existing(assigned_dir, clusters, all_regions, options.nrand) print "I used a pre-assigned set of BED files... score!" except: print "I had problems retreiving region-assigned BED files from %s, i'll rebuild" %(assigned_dir) options.assign=True if options.assign is True: print "Assigning Clusters to Genic Regions" CLUS_regions, sizes, Gsizes = assign_to_regions(CLUSTERS, species=species, getseq=True, nrand=options.nrand) print "Done Assigning" print "Saving BED and Fasta Files...", sizes_out = open(os.path.join(assigned_dir, "%s.sizes.pickle" %(clusters)), 'w') pickle.dump(sizes, file=sizes_out) sizes_out.close() Gsizes_out = open(os.path.join(assigned_dir, "Gsizes.pickle"), 'w') pickle.dump(Gsizes, file=Gsizes_out) Gsizes_out.close() for region in all_regions: of = clusters + "." + region+ ".real.BED" try: CLUS_regions[region]['real'].saveas(os.path.join(assigned_dir, of)) except: continue for n in range(options.nrand): of = clusters + "." + region+ ".rand." 
+ str(n) + ".BED" try: CLUS_regions[region]['rand'][n].saveas(os.path.join(assigned_dir, of)) except: continue print "done" for region in all_regions: try: real_fa = fa_file(clusters, region=region, type="real") rand_fa = fa_file(clusters, region=region, type="random") CLUS_regions[region]['real'].save_seqs(real_fa) l = list()#list of randoms for n in CLUS_regions[region]['rand'].keys(): l.append(CLUS_regions[region]['rand'][n]) write_seqs(rand_fa, l) except: continue print "Counting reads in clusters...", reads_in_clusters = 0 reads_per_cluster = list() for cluster in CLUS_regions['all']['real']: chr, start, stop, name, score, strand, tstart, tstop = str(cluster).strip().split("\t") try: gene, n, reads = name.split("_") except: try: gene, n, reads = name.split(";")[0].split("_") except: pass if int(reads)> 1: reads_per_cluster.append(int(reads)) reads_in_clusters += int(reads) print "done" #bamfile = pysam.Samfile(options.bam, 'rb') print "Getting total number of reads...", total_reads = 0; try: pickle_file = clusters + ".pickle" if os.path.exists(pickle_file): pf = pickle.load(open(pickle_file, 'rb')) else: print "Couldn't find %s" %(pickle_file) print "Found %s" %(pickle_file) for gene in pf: total_reads += gene['nreads'] except: print "Couldn't find a pickled file, resorting to flagstat for total reads. (this includes intergenic reads)" flagstats = pysam.flagstat(options.bam) total_reads =int(flagstats[2].split(" ")[0]) print "done, there were %d" %(total_reads) print "Gathering bed lengths...", cluster_lengths = bedlengths(CLUS_regions['all']['real']) print "done" ## mRNA_positions = list() premRNA_positions = list() intron_positions = list() exon_positions = list() GENES, Gtypes = build_AS_STRUCTURE_dict(species) types = {} for type in ["CE:", "SE:", "MXE:", "A5E:", "A3E:"]: types[type]=0 print "locating clusters within genes", try: for line in (CLUS_regions['all']['real']): mRNA_frac, premRNA_frac, exon_frac, intron_frac, nearest_type = RNA_position(line, GENES) if mRNA_frac is not None: mRNA_positions.append(mRNA_frac) if exon_frac is not None: exon_positions.append(exon_frac) if premRNA_frac is not None: premRNA_positions.append(premRNA_frac) if intron_frac is not None: intron_positions.append(intron_frac) if nearest_type is not None: try: types[nearest_type] += 1 except: types[nearest_type] =1 except: print "there were errors, skipping" print "done" type_count = [types["CE:"], types["SE:"], types["MXE:"], types["A5E:"], types["A3E:"]] Gtype_count = [Gtypes["CE:"], Gtypes["SE:"], Gtypes["MXE:"], Gtypes["A5E:"], Gtypes["A3E:"]] ### write fasta files and run homer and/or kmer analysis if at least one analysis is requested if options.reMotif is True: for region in all_regions: try: real_fa = fa_file(clusters, region=region, type="real") rand_fa = fa_file(clusters, region=region, type="random") if options.k is not None: if options.homer is True: region_homer_out = os.path.join(homerout, region) run_homer(real_fa, rand_fa, options.k, outloc=region_homer_out) for k in options.k: kmerfile = clusters + ".k" + str(k) + "." + region + ".kmerdiff" kmerfile = os.path.join(kmerout, kmerfile) kmer_sorted_output = run_kmerdiff(real_fa, rand_fa, outfile=kmerfile, k=k) except: continue motifs = list(options.motif) kmer_box_params = [kmerout, clusters, options.k, motifs] ###conservation --should use multiprocessing to speed this part up! 
phast_values = list() if options.rePhast is False: try: phast_values = pickle.load(open(os.path.join(misc_dir, "%s.phast.pickle" %(clusters)))) except: options.rePhast =True if options.rePhast is True: print "Fetching Phastcons Scores...", for region in all_regions[1:]:#skip "all" combine them later print ("%s..." %(region)), try: samplesize=1000 if len(CLUS_regions[region]['real']) > samplesize: R1 = CLUS_regions[region]['real'] # R1 = random.sample(CLUS_regions[region]['real'], samplesize) else: R1 = CLUS_regions[region]['real'] #realPhast = get_phastcons(CLUS_regions[region]['real'], species=options.species) print "getting real...", realPhast = get_phastcons(R1, species=options.species) randPhast=list() for i in range(options.nrand): if len(CLUS_regions[region]['rand'][i]) > samplesize: R2 = CLUS_regions[region]['rand'][i] #R2 = random.sample(CLUS_regions[region]['rand'][i], samplesize) else: R2 = CLUS_regions[region]['rand'][i] print ("getting rand %d" %(i)), randPhast.extend(get_phastcons(R2, species=options.species).tolist()) phast_values.append(realPhast) phast_values.append(randPhast) except: continue all_real = np.concatenate(phast_values[::2]) all_rand = np.concatenate(phast_values[1::2]) phast_values.insert(0,all_rand) phast_values.insert(0,all_real) pickout = open(os.path.join(misc_dir, "%s.phast.pickle" %(clusters)), 'w') pickle.dump(phast_values, file = pickout) Zscores = None #old. remove QCfig_params = [reads_in_clusters, (total_reads - reads_in_clusters), cluster_lengths, reads_per_cluster, premRNA_positions, mRNA_positions, exon_positions, intron_positions, Gsizes, sizes, Gtype_count, type_count, Zscores, homerout, kmer_box_params, phast_values] pickout = open(os.path.join(outdir, "misc", "%s.qcfig_params.pickle" %(clusters)), 'w') pickle.dump(QCfig_params, file = pickout) QCfig = CLIP_QC_figure(*QCfig_params) fn = clusters + ".QCfig.pdf" outFig = os.path.join(outdir, fn) QCfig.savefig(outFig) ### motifs = list(options.motif) motifBASE = basedir + "/lovci/projects/ucscBED" if motifs is not None: fig = pylab.figure(figsize=(8.5, 11)) colors = ["red", "orange", "green", "blue", "purple", "brown", "black", "pink", "gray", "cyan", "magenta"] for i, motif in enumerate(motifs): mf = "motif_" + motif + ".BED" mfgz = "motif_" + motif + ".BED.gz" print os.path.join(motifBASE,species,mf) motifFILE = None # import code # code.interact(local=locals()) if os.path.exists(os.path.join(motifBASE,species, mf)): motifFILE = os.path.join(motifBASE,species, mf) elif os.path.exists(os.path.join(motifBASE,species, mfgz)): motifFILE= os.path.join(motifBASE,species, mfgz) else: print "MOTIF BED FILE for motif: %s is not available, please build it" %(mf) continue plot_motif_dist(CLUS_regions, motifFILE, fig, color = colors[i], species=species, slopsize=200) pylab.savefig(clusters + ".motif_distribution.pdf")