def extract_boundary_sequences(self):
    read_pairs = self.get_read_pairs()
    trimmed_read_pairs = self.trim_barcodes(read_pairs)

    total_reads = 0
    well_formed = 0
    long_enough = 0

    counters = {'positions': {orientation: Counter() for orientation in orientations},
                'control_ids': Counter(),
                'polyA_lengths': Counter(),
                'left_ids': Counter(),
                'right_ids': Counter(),
                'joint_lengths': Counter(),
               }

    with open(self.file_names['five_prime_boundaries'], 'w') as fives_fh, \
         open(self.file_names['three_prime_boundaries'], 'w') as threes_fh:

        for R1, R2 in trimmed_read_pairs:
            total_reads += 1

            five_payload_read, three_payload_read = TIF_seq_structure.find_boundary_sequences(R1, R2, counters)

            if five_payload_read and three_payload_read:
                well_formed += 1
                if len(five_payload_read.seq) >= self.min_payload_length and \
                   len(three_payload_read.seq) >= self.min_payload_length:
                    long_enough += 1
                    fives_fh.write(fastq.make_record(*five_payload_read))
                    threes_fh.write(fastq.make_record(*three_payload_read))

    # Pop off of counters so that what is left at the end can be written
    # directly to the id_counts file.
    position_counts = counters.pop('positions')
    for orientation in orientations:
        key = '{0}_{1}'.format(orientation, 'positions')
        array = counts_to_array(position_counts[orientation])
        self.write_file(key, array)

    polyA_lengths = counts_to_array(counters.pop('polyA_lengths'))
    self.write_file('polyA_lengths', polyA_lengths)

    joint_lengths = counts_to_array(counters.pop('joint_lengths'), dim=2)
    self.write_file('joint_lengths', joint_lengths)

    self.write_file('id_counts', counters)

    self.summary.extend(
        [('Total read pairs', total_reads),
         ('Well-formed', well_formed),
         ('Long enough', long_enough),
        ],
    )
def zero_padded_array(self, counts):
    array = utilities.counts_to_array(counts)
    if len(array) < self.max_read_length + 1:
        padded_array = np.zeros(self.max_read_length + 1, int)
        padded_array[:len(array)] += array
    else:
        padded_array = array
    return padded_array
def filter_mappings(self):
    num_unmapped = 0
    num_entirely_genomic = 0
    num_nonunique = 0
    num_unique = 0

    nongenomic_lengths = Counter()

    sam_file = pysam.Samfile(self.file_names['accepted_hits'])

    region_fetcher = genomes.build_region_fetcher(self.file_names['genome'],
                                                  load_references=True,
                                                  sam_file=sam_file,
                                                 )

    extended_sorter = sam.AlignmentSorter(sam_file.references,
                                          sam_file.lengths,
                                          self.file_names['extended'],
                                         )
    filtered_sorter = sam.AlignmentSorter(sam_file.references,
                                          sam_file.lengths,
                                          self.file_names['extended_filtered'],
                                         )

    extended_mappings = (trim.extend_polyA_end(mapping, region_fetcher)
                         for mapping in sam_file)
    mapping_groups = utilities.group_by(extended_mappings, lambda m: m.qname)

    with extended_sorter, filtered_sorter:
        for qname, group in mapping_groups:
            for m in group:
                extended_sorter.write(m)

            min_nongenomic_length = min(trim.get_nongenomic_length(m) for m in group)
            nongenomic_lengths[min_nongenomic_length] += 1
            if min_nongenomic_length == 0:
                num_entirely_genomic += 1
                continue

            nonunique = len(group) > 1 or any(m.mapq < 40 for m in group)
            if nonunique:
                num_nonunique += 1
                continue

            num_unique += 1

            for m in group:
                filtered_sorter.write(m)

    self.summary.extend(
        [('Mapped with no non-genomic A\'s', num_entirely_genomic),
         ('Nonunique', num_nonunique),
         ('Unique', num_unique),
        ],
    )

    nongenomic_lengths = utilities.counts_to_array(nongenomic_lengths)
    self.write_file('nongenomic_lengths', nongenomic_lengths)
def extract_fragment_lengths(bam_file_name):
    def concordantly_mapped(mapping):
        return not mapping.mate_is_unmapped and \
               not mapping.is_unmapped and \
               mapping.rnext == mapping.tid and \
               abs(mapping.tlen) < 10000

    bam_fh = pysam.Samfile(bam_file_name)
    TLENs = (abs(mapping.tlen) for mapping in bam_fh
             if mapping.is_read1 and concordantly_mapped(mapping))
    fragment_lengths = Counter(TLENs)
    # Note that counts_to_array implicitly discards negative key values.
    fragment_lengths = utilities.counts_to_array(fragment_lengths)
    return fragment_lengths
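# Illustrative sketch, not the actual utilities module: counts_to_array is used
# throughout (bare and as utilities.counts_to_array) as if it converts a Counter
# keyed by nonnegative integers (or, with dim=2, by pairs of them) into a dense
# numpy array indexed by key. Negative keys have no slot in such an array, which
# is consistent with the note above that they are implicitly discarded. The name
# _counts_to_array_sketch is hypothetical.
import numpy as np

def _counts_to_array_sketch(counts, dim=1):
    if dim == 1:
        valid = {k: v for k, v in counts.items() if k >= 0}
        array = np.zeros(max(valid) + 1 if valid else 0, int)
    else:
        valid = {k: v for k, v in counts.items() if min(k) >= 0}
        shape = tuple(max(k[d] for k in valid) + 1 for d in range(dim)) if valid else (0,) * dim
        array = np.zeros(shape, int)
    for key, count in valid.items():
        array[key] = count
    return array

# Example: _counts_to_array_sketch(Counter({2: 5, -3: 1})) -> array([0, 0, 5]);
# the negative key is dropped.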
def collapse_fragments(self):
    get_position = annotation.make_convertor(self.MappingAnnotation,
                                             self.PositionAnnotation,
                                            )
    get_fragment = annotation.make_convertor(self.MappingAnnotation,
                                             self.FragmentAnnotation,
                                            )

    amplification_counts = Counter()

    sq_lines = sam.get_sq_lines(self.merged_file_names['sorted_clean_sam'])
    sam_lines = self.get_sorted_sam_lines()

    with open(self.file_names['collapsed_sam'], 'w') as collapsed_fh, \
         open(self.file_names['collisions'], 'w') as collisions_fh:

        for sq_line in sq_lines:
            collapsed_fh.write(sq_line)

        position_groups = utilities.group_by(sam_lines, get_position)
        for position_annotation, position_lines in position_groups:
            fragment_counts = Counter()
            position_count = len(position_lines)

            fragment_groups = utilities.group_by(position_lines, get_fragment)
            for fragment_annotation, fragment_lines in fragment_groups:
                fragment_count = len(fragment_lines)
                fragment_counts[fragment_count] += 1
                amplification_counts['{},{}'.format(position_count, fragment_count)] += 1

                collapsed_annotation = self.CollapsedAnnotation(count=fragment_count, **fragment_annotation)
                new_line = sam.splice_in_name(fragment_lines[0], collapsed_annotation.identifier)
                collapsed_fh.write(new_line)

            fragment_counts = utilities.counts_to_array(fragment_counts)
            if position_count > 100:
                collisions_fh.write(position_annotation.identifier + '\n')
                collisions_fh.write(','.join(map(str, fragment_counts)) + '\n')

    sam.make_sorted_bam(self.file_names['collapsed_sam'],
                        self.file_names['collapsed_bam'],
                       )

    self.write_file('amplification_counts', amplification_counts)
def trim_reads(self, read_pairs):
    total_reads = 0
    long_enough_reads = 0
    trimmed_lengths = Counter()
    barcodes = Counter()

    for R1, R2 in read_pairs:
        total_reads += 1
        barcodes[R2.seq[:len(self.barcode)]] += 1

        # R2 isn't expected to have adapter sequence because it will
        # have to get through the A tail first.
        position = adapters.find_adapter(self.adapter_in_R1, 3, R1.seq)
        trimmed_lengths[position] += 1
        if position < 12:
            continue
        long_enough_reads += 1

        R1_slice = slice(None, position)
        # position points to where the barcode starts in R1. The length
        # of the trimmed R2 read should be equal to position.
        R2_slice = slice(len(self.barcode), len(self.barcode) + position)
        processed_R1 = fastq.Read(R1.name, R1.seq[R1_slice], R1.qual[R1_slice])
        processed_R2 = fastq.Read(R2.name, R2.seq[R2_slice], R2.qual[R2_slice])

        yield processed_R1, processed_R2

    trimmed_lengths = utilities.counts_to_array(trimmed_lengths)
    self.write_file('trimmed_lengths', trimmed_lengths)
    self.write_file('barcodes', barcodes)
    self.summary.extend(
        [('Total read pairs', total_reads),
         ('Long enough', long_enough_reads),
        ],
    )
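# Illustrative worked example of the slicing arithmetic in trim_reads above
# (hypothetical numbers, not taken from any dataset): with a 6-nt barcode and
# the adapter found at position = 30 in R1,
#   R1 payload = R1.seq[:30]   (everything before the adapter)
#   R2 payload = R2.seq[6:36]  (skip the barcode, keep the next 30 bases)
# so both trimmed reads are exactly `position` bases long, consistent with the
# comment that the trimmed R2 length should equal position.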
def trim_reads(self, read_pairs):
    total_reads = 0
    long_enough_reads = 0
    trimmed_lengths = Counter()
    barcodes = Counter()

    truncated_in_R1 = self.adapter_in_R1[1:]
    truncated_in_R2 = self.adapter_in_R2[1:]

    for R1, R2 in read_pairs:
        total_reads += 1
        barcodes[R2.seq[:len(self.barcode)]] += 1

        # Check for the case where the expected overhang base doesn't
        # exist in primer dimers.
        R1_dimer_distance = adapters.adapter_hamming_distance(R1.seq,
                                                              truncated_in_R1,
                                                              len(R1.seq),
                                                              len(truncated_in_R1),
                                                              len(self.barcode),
                                                             )
        R2_dimer_distance = adapters.adapter_hamming_distance(R2.seq,
                                                              truncated_in_R2,
                                                              len(R2.seq),
                                                              len(truncated_in_R2),
                                                              len(self.barcode),
                                                             )
        if R1_dimer_distance <= 3 and R2_dimer_distance <= 3:
            position = len(self.barcode)
        else:
            position = adapters.consistent_paired_position(R1.seq,
                                                           R2.seq,
                                                           self.adapter_in_R1,
                                                           self.adapter_in_R2,
                                                           19,
                                                           3,
                                                          )
        if position is not None:
            trimmed_lengths[position] += 1
            if position - len(self.barcode) < 12:
                continue
        else:
            position = len(R1.seq)

        long_enough_reads += 1

        payload_slice = slice(len(self.barcode), position)
        processed_R1 = fastq.Read(R1.name, R1.seq[payload_slice], R1.qual[payload_slice])
        processed_R2 = fastq.Read(R2.name, R2.seq[payload_slice], R2.qual[payload_slice])

        yield processed_R1, processed_R2

    trimmed_lengths = utilities.counts_to_array(trimmed_lengths)
    self.write_file('trimmed_lengths', trimmed_lengths)
    self.write_file('barcodes', barcodes)
    self.summary.extend(
        [('Total read pairs', total_reads),
         ('Long enough', long_enough_reads),
        ],
    )
def combine_mappings(self):
    num_unmapped = 0
    num_R1_unmapped = 0
    num_R2_unmapped = 0
    num_nonunique = 0
    num_discordant = 0
    num_disoriented = 0
    num_concordant = 0
    tlens = Counter()

    R1_mappings = pysam.Samfile(self.file_names['R1_accepted_hits'])
    R1_unmapped = pysam.Samfile(self.file_names['R1_unmapped'])
    all_R1 = sam.merge_by_name(R1_mappings, R1_unmapped)
    R1_grouped = utilities.group_by(all_R1, lambda m: m.qname)

    R2_mappings = pysam.Samfile(self.file_names['R2_accepted_hits'])
    R2_unmapped = pysam.Samfile(self.file_names['R2_unmapped'])
    all_R2 = sam.merge_by_name(R2_mappings, R2_unmapped)
    R2_grouped = utilities.group_by(all_R2, lambda m: m.qname)

    group_pairs = izip(R1_grouped, R2_grouped)

    alignment_sorter = sam.AlignmentSorter(R1_mappings.references,
                                           R1_mappings.lengths,
                                           self.file_names['combined'],
                                          )

    with alignment_sorter:
        for (R1_qname, R1_group), (R2_qname, R2_group) in group_pairs:
            if fastq.get_pair_name(R1_qname) != fastq.get_pair_name(R2_qname):
                # Ensure that the iteration through pairs is in sync.
                raise ValueError('read pairs are out of sync: {0}, {1}'.format(R1_qname, R2_qname))

            R1_unmapped = any(m.is_unmapped for m in R1_group)
            R2_unmapped = any(m.is_unmapped for m in R2_group)
            if R1_unmapped:
                num_R1_unmapped += 1
            if R2_unmapped:
                num_R2_unmapped += 1
            if R1_unmapped or R2_unmapped:
                num_unmapped += 1
                continue

            R1_nonunique = len(R1_group) > 1 or any(m.mapq < 40 for m in R1_group)
            R2_nonunique = len(R2_group) > 1 or any(m.mapq < 40 for m in R2_group)
            if R1_nonunique or R2_nonunique:
                num_nonunique += 1
                continue

            R1_m = R1_group.pop()
            R2_m = R2_group.pop()

            R1_strand = sam.get_strand(R1_m)
            R2_strand = sam.get_strand(R2_m)

            tlen = max(R1_m.aend, R2_m.aend) - min(R1_m.pos, R2_m.pos)
            discordant = (R1_m.tid != R2_m.tid) or (R1_strand == R2_strand) or (tlen > 10000)
            if discordant:
                num_discordant += 1
                continue

            # Reminder: the protocol produces anti-sense reads.
            if R1_strand == '-':
                if R1_m.pos < R2_m.pos:
                    num_disoriented += 1
                    continue
            elif R1_strand == '+':
                if R2_m.pos < R1_m.pos:
                    num_disoriented += 1
                    continue

            combined_read = paired_end.combine_paired_mappings(R1_m, R2_m)

            tlens[tlen] += 1

            if combined_read:
                # Flip combined_read back to the sense strand.
                combined_read.is_reverse = not combined_read.is_reverse

                trim.set_nongenomic_length(combined_read, 0)
                alignment_sorter.write(combined_read)

                num_concordant += 1

    self.summary.extend(
        [('Unmapped', num_unmapped),
         ('R1 unmapped', num_R1_unmapped),
         ('R2 unmapped', num_R2_unmapped),
         ('Nonunique', num_nonunique),
         ('Discordant', num_discordant),
         ('Unexpected orientation', num_disoriented),
         ('Concordant', num_concordant),
        ],
    )

    tlens = utilities.counts_to_array(tlens)
    self.write_file('tlens', tlens)
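# Illustrative sketch, not the actual utilities module: group_by is used above to
# walk name-sorted alignments one read name at a time, and the callers index and
# pop the groups, so each group must be materialized as a list. A minimal version
# under those assumptions wraps itertools.groupby; the name _group_by_sketch is
# hypothetical.
from itertools import groupby

def _group_by_sketch(iterable, key_func):
    # Requires the iterable to already be sorted/clustered by key_func, as the
    # name-sorted BAM/SAM inputs above are.
    for key, group in groupby(iterable, key_func):
        yield key, list(group)

# Example: _group_by_sketch(all_R1, lambda m: m.qname) yields
# (qname, [all mappings sharing that qname]) pairs.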