def process_full_length_mappings(self):
    """Trim every mapping in the clean BAM file and yield the results.

    This is a generator. Each mapping read from
    ``self.file_names['clean_bam']`` is passed through
    ``trim.trim_mismatches_from_start`` and then
    ``trim.trim_nongenomic_polyA_from_end``; the doubly-trimmed mapping
    is yielded.

    Once the input has been fully consumed, two outputs are written via
    ``self.write_file``:

    * ``'mismatches'`` -- the counts array handed to
      ``trim_mismatches_from_start`` for non-secondary mappings.
    * ``'lengths'`` -- a dict whose ``'clean_trimmed'`` entry is the
      zero-padded histogram of ``qlen`` over trimmed mappings that are
      neither unmapped nor secondary.

    If the consumer stops iterating early, neither output is written,
    but the BAM file handle is still closed.
    """
    clean_bam = pysam.Samfile(self.file_names['clean_bam'])
    try:
        type_shape = (
            self.max_read_length + 1,
            self.max_read_length,
            fastq.MAX_EXPECTED_QUAL + 1,
            6,
            6,
        )
        type_counts = np.zeros(type_shape, int)

        # To avoid counting mismatches in non-unique mappings multiple
        # times, a dummy secondary_type_counts array is passed to
        # trim_mismatches_from_start for secondary mappings.
        secondary_type_counts = np.zeros(type_shape, int)

        clean_trimmed_length_counts = Counter()

        region_fetcher = genomes.build_region_fetcher(
            self.file_names['genome'],
            load_references=True,
            sam_file=clean_bam,
        )

        for mapping in clean_bam:
            counts_array = (
                secondary_type_counts if mapping.is_secondary else type_counts
            )

            trimmed_from_start = trim.trim_mismatches_from_start(
                mapping,
                region_fetcher,
                counts_array,
            )
            trimmed_from_end = trim.trim_nongenomic_polyA_from_end(
                trimmed_from_start,
                region_fetcher,
            )

            # Only primary, still-mapped reads contribute to the length
            # histogram.
            if (not trimmed_from_end.is_unmapped
                    and not trimmed_from_end.is_secondary):
                clean_trimmed_length_counts[trimmed_from_end.qlen] += 1

            # NOTE(review): the yield is taken to be unconditional (every
            # trimmed mapping is passed downstream, not just the counted
            # ones) -- confirm against the intended layout.
            yield trimmed_from_end
    finally:
        # Release the BAM handle even if the consumer abandons the
        # generator early or a trimming step raises.
        clean_bam.close()

    self.write_file('mismatches', type_counts)

    clean_trimmed_lengths = self.zero_padded_array(clean_trimmed_length_counts)
    self.write_file('lengths', {'clean_trimmed': clean_trimmed_lengths})
def process_remapped(self):
    """Trim and poly-A-extend every remapped mapping, yielding the results.

    This is a generator. Each mapping read from
    ``self.file_names['remapped_accepted_hits']`` is passed through
    ``trim.trim_mismatches_from_start`` and then
    ``trim.extend_polyA_end`` (with ``trimmed_twice=True``); the extended
    mapping is yielded.

    Once the input has been fully consumed, ``self.write_file`` is called
    with ``'lengths'`` and a dict whose ``'remapped'`` entry is the
    zero-padded histogram of ``qlen`` over extended mappings that are
    neither unmapped nor secondary.

    The counts array handed to ``trim_mismatches_from_start`` is not
    written anywhere by this method. If the consumer stops iterating
    early, the lengths are not written, but the BAM file handle is still
    closed.
    """
    clean_bam = pysam.Samfile(self.file_names['remapped_accepted_hits'])
    try:
        type_shape = (
            self.max_read_length + 1,
            self.max_read_length,
            fastq.MAX_EXPECTED_QUAL + 1,
            6,
            6,
        )
        # Required argument of trim_mismatches_from_start; its contents
        # are discarded when this method finishes.
        type_counts = np.zeros(type_shape, int)

        remapped_length_counts = Counter()

        region_fetcher = genomes.build_region_fetcher(
            self.file_names['genome'],
            load_references=True,
            sam_file=clean_bam,
        )

        for mapping in clean_bam:
            trimmed_from_start = trim.trim_mismatches_from_start(
                mapping,
                region_fetcher,
                type_counts,
            )

            # Add back any genomic A's that were trimmed as part of
            # mappings and any remaining A's from the first non-genomic
            # onward as soft clipped bases for visualization in IGV.
            extended = trim.extend_polyA_end(
                trimmed_from_start,
                region_fetcher,
                trimmed_twice=True,
            )

            # Only primary, still-mapped reads contribute to the length
            # histogram.
            if not extended.is_unmapped and not extended.is_secondary:
                remapped_length_counts[extended.qlen] += 1

            # NOTE(review): the yield is taken to be unconditional (every
            # extended mapping is passed downstream, not just the counted
            # ones) -- confirm against the intended layout.
            yield extended
    finally:
        # Release the BAM handle even if the consumer abandons the
        # generator early or a trimming step raises.
        clean_bam.close()

    remapped_lengths = self.zero_padded_array(remapped_length_counts)
    self.write_file('lengths', {'remapped': remapped_lengths})