def main():
    """Count reads informative for STR genotyping at each locus in a bed file.

    Parses command line arguments (bam, bed, output, dist), fetches reads
    mapped to the STR decoy chromosomes (names starting with 'STR-'),
    locates their mates in the reference genome, merges the mate intervals
    and assigns them to the closest STR locus (within args.dist bp) with a
    matching motif. Writes a tab-separated table of per-locus read counts
    to args.output, or stdout if no output file is given.
    """
    # Parse command line arguments
    args = parse_args()
    bamfile = args.bam
    bedfile = args.bed
    outfile = args.output
    if outfile:
        outstream = open(outfile, 'w')
    else:
        outstream = sys.stdout
    max_distance = args.dist

    out_header = '\t'.join(
        ['STR_chr', 'STR_start', 'STR_stop', 'motif', 'reflen', 'count'])
    outstream.write(out_header + '\n')

    #STR_bed = parse_bed(args.bed, position_base=0)
    STR_bed = bt.BedTool(bedfile)
    readlen = detect_readlen(bamfile)

    # Read bam
    bam = pysam.Samfile(bamfile, 'rb')

    # Get relevant chromosomes (the STR decoys added to the reference)
    required_chroms = []
    unpaired = 0
    total = 0
    for chrom in bam.references:
        if chrom.startswith('STR-'):
            required_chroms.append(chrom)

    for chrom in required_chroms:
        motif = chrom.split('-')[1]
        all_positions = []
        all_segments = bam.fetch(reference=chrom)
        for read in all_segments:
            total += 1
            try:
                mate_chr = read.next_reference_name
            except ValueError:
                # Unpaired read: its mate position in the reference genome
                # is unavailable, so it carries no locus information.
                unpaired += 1
                continue
            mate_start = read.next_reference_start
            mate_stop = mate_start + readlen  # assumes a fixed read length
            all_positions.append([mate_chr, mate_start, mate_stop])

        # Strategy:
        # - Merge all overlapping mate intervals
        # - Keep the count of reads corresponding to each merged interval
        #   (i.e. 1 for each read contained in it)
        # - Assign each interval to the closest STR (within max_distance)
        #   with the correct motif, adding together the counts of reads
        # - Check motif equality via normalise_str()
        # - There should be 1-2 intervals per STR, likely one per flank
        # - Report the read count for each STR
        if len(all_positions) > 0:
            motif_bed = bt.BedTool(all_positions).sort()
            # Merge all the intervals, then count how many of the original
            # intervals overlap the merged ones (4th column)
            motif_coverage = motif_bed.merge(stream=True).coverage(
                b=motif_bed, counts=True)
            # Temporary file for bedtools to write to and pandas to read,
            # since streams don't seem to work
            tmp_bed = 'tmp-' + randomletters(8) + '.bed'
            closest_STR = motif_coverage.closest(
                STR_bed, d=True, stream=True).saveas(tmp_bed)
            colnames = [
                'chr', 'start', 'stop', 'count', 'STR_chr', 'STR_start',
                'STR_stop', 'motif', 'reflen', 'distance'
            ]
            try:
                df = pd.read_csv(tmp_bed, sep='\t', header=None,
                                 names=colnames)
            finally:
                # BUGFIX: delete the temporary file even if parsing fails
                os.remove(tmp_bed)
            # Filter out loci that are too far away
            df = df.loc[df['distance'] <= max_distance, :]
            # Normalise the STR motif to enable comparisons
            df['motif'] = df['motif'].map(normalise_str)
            # Remove STRs that don't match the motif
            df = df.loc[df['motif'] == normalise_str(motif), :]
            df = df.loc[:, [
                'STR_chr', 'STR_start', 'STR_stop', 'motif', 'count',
                'reflen'
            ]]
            summed = df.groupby(
                ['STR_chr', 'STR_start', 'STR_stop', 'motif', 'reflen'],
                as_index=False).aggregate(np.sum)
            summed.to_csv(outstream, sep='\t', header=False, index=False)

    if outfile:
        # BUGFIX: only close real files; closing sys.stdout would break any
        # later writes to it by the interpreter or caller
        outstream.close()

    if total == 0:
        # BUGFIX: previously total == 0 fell into the unpaired == total
        # branch (0 == 0) and produced a misleading "all reads unpaired"
        # error; report the real problem instead (matches locus_counts).
        sys.exit(
            'ERROR: there were no reads overlapping the target STR regions. This may indicate a problem with the input file.\n'
        )
    elif unpaired == total:
        sys.exit(
            'ERROR: all reads tested appear to be unpaired. You may wish to check your bam file is paired end and correctly formed.'
        )
    elif unpaired > 0:
        # BUGFIX: the {} placeholders were never filled in (missing .format)
        sys.stderr.write(
            'WARNING: it appears that {} of {} reads checked were unpaired and so no useful data could be obtained from them.\n'
            .format(unpaired, total))
def locus_counts(bamfiles, bedfile, outfile, max_distance):
    """Count reads informative for STR genotyping at each locus in bedfile.

    For each bam file, fetch reads mapped to the STR decoy chromosomes
    (names starting with 'STR-'), locate their mates in the reference
    genome, merge the mate intervals and assign them to the closest STR
    locus (within max_distance bp) with a matching motif. Counts from all
    bam files are summed per locus and written as a tab-separated table.

    Args:
        bamfiles (list): paths to the input bam files; must be unique.
        bedfile (str): bed file of STR loci (chr, start, stop, motif, reflen).
        outfile (str): output path, or a falsy value to write to stdout.
        max_distance (int): maximum distance (bp) between a merged mate
            interval and an STR locus for its reads to count to that locus.

    Raises:
        TypeError: if bamfiles is not a list.
        SystemExit: on duplicate bam filenames, missing STR decoy
            chromosomes, no reads overlapping the STR regions, or all such
            reads being unpaired.
    """
    if not isinstance(bamfiles, list):
        raise TypeError('Expecting a list, got {}'.format(type(bamfiles)))
    # Check bamfiles have unique names
    #print(bamfiles, type(bamfiles))
    if len(set(bamfiles)) < len(bamfiles):
        sys.exit(
            'ERROR: There were multiple bamfiles with the same filename. Please check your input'
        )
    #STR_bed = parse_bed(args.bed, position_base=0)
    STR_bed = bt.BedTool(bedfile).sort()

    all_results = []
    for bamfile in bamfiles:
        readlen, count_noCIGAR = detect_readlen(bamfile)

        # Print a warning message in case of reads without a CIGAR string.
        # BUGFIX: this sat after the loop and so only ever reported the
        # last bam file; warn per file instead.
        if count_noCIGAR > 0:
            sys.stderr.write('WARNING: ' + str(count_noCIGAR) +
                             ' read(s) in ' + bamfile +
                             ' file had no CIGAR string.\n')

        # Read bam
        bam = pysam.Samfile(bamfile, 'rb')

        # Get relevant chromosomes (the STR decoys added to the reference)
        required_chroms = []
        unpaired = 0
        total = 0
        for chrom in bam.references:
            if chrom.startswith('STR-'):
                required_chroms.append(chrom)
        # Check if any STR- chromosomes
        if len(required_chroms) == 0:
            sys.exit(
                'ERROR: There were no reads mapping to chromosomes with names starting with "STR-" in {0}. Are you sure this data is mapped to a reference genome with STR decoy chromosomes?'
                .format(bamfile))

        for chrom in required_chroms:
            motif = chrom.split('-')[1]
            all_positions = []
            all_segments = bam.fetch(reference=chrom)
            for read in all_segments:
                #if read.is_secondary:
                #    continue
                total += 1
                try:
                    mate_chr = read.next_reference_name
                except ValueError:
                    # Unpaired read: its mate position in the reference
                    # genome is unavailable.
                    unpaired += 1
                    continue
                mate_start = read.next_reference_start
                mate_stop = mate_start + readlen  # assumes fixed read length
                all_positions.append([mate_chr, mate_start, mate_stop])

            # Strategy:
            # - Merge all overlapping mate intervals
            # - Keep the count of reads corresponding to each merged
            #   interval (i.e. 1 for each read contained in it)
            # - Assign each interval to the closest STR (within
            #   max_distance) with the correct motif, summing read counts
            # - Check motif equality via normalise_str()
            # - There should be 1-2 intervals per STR, one per flank
            # - Report the read count for each STR
            if len(all_positions) > 0:
                motif_bed = bt.BedTool(all_positions).sort()
                # Merge all the intervals, then count how many of the
                # original intervals overlap the merged ones (4th column)
                motif_coverage = motif_bed.merge(stream=True).coverage(
                    b=motif_bed, counts=True, nonamecheck=True)
                # Temporary file for bedtools to write to and pandas to
                # read, since streams don't seem to work
                tmp_bed = 'tmp-' + randomletters(8) + '.bed'
                closest_STR = motif_coverage.closest(
                    STR_bed, d=True, stream=True,
                    nonamecheck=True).saveas(tmp_bed)
                colnames = [
                    'chr', 'start', 'stop', 'count', 'STR_chr', 'STR_start',
                    'STR_stop', 'motif', 'reflen', 'distance'
                ]
                try:
                    df = pd.read_csv(tmp_bed, sep='\t', header=None,
                                     names=colnames)
                finally:
                    # BUGFIX: delete the temporary file even if parsing
                    # fails
                    os.remove(tmp_bed)
                # Filter out loci that are too far away
                df = df.loc[df['distance'] <= max_distance, :]
                # Normalise the STR motif to enable comparisons
                df['motif'] = df['motif'].map(normalise_str)
                # Remove STRs that don't match the motif
                df = df.loc[df['motif'] == normalise_str(motif), :]
                df = df.loc[:, [
                    'STR_chr', 'STR_start', 'STR_stop', 'motif', 'count',
                    'reflen'
                ]]
                all_results.append(df)

        # Per-bam sanity checks on read pairing.
        if total == 0:
            sys.exit(
                'ERROR: there were no reads overlapping the target STR regions. This may indicate a problem with the input file.\n'
            )
        elif unpaired == total:
            sys.exit(
                'ERROR: all {0} reads overlapping the target STR regions appear to be unpaired. You may wish to check your bam file is paired-end and correctly formed.\n'
                .format(total))
        elif unpaired > 0:
            sys.stderr.write(
                'WARNING: it appears that {0} of the {1} reads overlapping the target STR regions were unpaired and so no useful data could be obtained from them.\n'
                .format(unpaired, total))

    # BUGFIX: removed a duplicated total/unpaired check that appeared here;
    # it re-tested only the counters from the last bam file (already
    # checked inside the loop) and double-printed the unpaired warning.

    # Sum counts from multiple bam files and multiple rows.
    # Note: all_results is guaranteed non-empty here — every bam exits
    # above unless at least one paired read contributed a position.
    if len(all_results) == 1:
        df_total = all_results[0]
    else:
        df_total = pd.concat(all_results, ignore_index=True)
    summed = df_total.groupby(
        ['STR_chr', 'STR_start', 'STR_stop', 'motif', 'reflen'],
        as_index=False).aggregate(np.sum)

    # Write results
    if outfile:
        outstream = open(outfile, 'w')
    else:
        outstream = sys.stdout
    out_header = '\t'.join(
        ['STR_chr', 'STR_start', 'STR_stop', 'motif', 'reflen', 'count'])
    outstream.write(out_header + '\n')
    outstring = summed.to_csv(sep='\t', header=False, index=False)
    outstream.write(outstring)
    if outfile:
        # BUGFIX: only close real files; closing sys.stdout would break any
        # later writes to it by the caller
        outstream.close()