def calculate_distance_distribution(interleave_fhand, index_fpath,
                                    max_clipping, max_distance=None,
                                    tempdir=None, threads=None):
    '''It counts the distances found between the mapped mates.

    The interleaved reads are mapped with map_with_bwamem, the result is
    sorted by query name into a temporary bam and, for every read pair, each
    combination of the alignments yielded by _get_totally_mapped_alignments
    for both mates that lies on the same reference is counted.

    interleave_fhand -- file object, its .name is handed to the mapper.
    index_fpath -- index path given to map_with_bwamem.
    max_clipping -- passed to _get_totally_mapped_alignments.
    max_distance -- when it is not None only the distances strictly lower
                    than it are counted.
    tempdir -- passed to map_process_to_sortedbam.
    threads -- passed to map_with_bwamem.

    It returns a dict with the keys 'outies', 'innies' and 'others', each of
    them holding an IntCounter indexed by the distance.
    '''
    bam_fhand = NamedTemporaryFile(suffix='.bam')
    # bwa mem flags; presumably -a keeps all the alignments and -M marks the
    # shorter split hits as secondary -- confirm with the BWA manual
    extra_params = ['-a', '-M']
    # threads is forwarded so the caller's setting reaches the mapper
    bwa = map_with_bwamem(index_fpath, interleave_fpath=interleave_fhand.name,
                          extra_params=extra_params, threads=threads)
    map_process_to_sortedbam(bwa, bam_fhand.name, key='queryname',
                             tempdir=tempdir)
    bamfile = Samfile(bam_fhand.name)
    stats = {'outies': IntCounter(), 'innies': IntCounter(),
             'others': IntCounter()}
    for grouped_mates in _group_alignments_reads_by_qname(bamfile):
        mates = _split_mates(grouped_mates)
        for aligned_read1 in _get_totally_mapped_alignments(mates[0],
                                                            max_clipping):
            for aligned_read2 in _get_totally_mapped_alignments(mates[1],
                                                                max_clipping):
                # only the alignments on the same reference are comparable
                if aligned_read1.rname != aligned_read2.rname:
                    continue
                aligned_reads = [aligned_read1, aligned_read2]
                distance = _find_distance(aligned_reads)
                if _mates_are_outies(aligned_reads):
                    kind = 'outies'
                elif _mates_are_innies(aligned_reads):
                    kind = 'innies'
                else:
                    kind = 'others'
                if max_distance is None or max_distance > distance:
                    stats[kind][distance] += 1
    return stats
def test_classify_paired_reads(self):
    '''The pairs of seq1, seq3 and seq2 should be classified as non
    chimeric, unknown and chimeric respectively'''
    index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')
    reads = [
        # Non chimeric
        '>seq1 1:N:0:GATCAG\n',
        'GGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT\n',
        '>seq1 2:N:0:GATCAG\n',
        'AGGAGGGATCGGGCACCCACGGCGCGGTAGACTGAGGCCTTCTCGAACT\n',
        # unknown
        '>seq3 1:N:0:GATCAG\n',
        'AGTGACCCTAATAACAGTATGTGCCGTAGGGCCGTCGCCGCATCCACGT\n',
        '>seq3 2:N:0:GATCAG\n',
        'GTCGTGCGCAGCCATTGAGACCTTCCTAGGGTTTTCCCCATGGAATCGG\n',
        # Chimeric
        '>seq2 1:N:0:GATCAG\n',
        'AAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGTCTGCGATCCC\n',
        '>seq2 2:N:0:GATCAG\n',
        'ACGTGGATGCGGCGACGGCCCTACGGCACATACTGTTATTAGGGTCACT\n',
    ]

    in_fhand = NamedTemporaryFile()
    in_fhand.write(''.join(reads))
    in_fhand.flush()

    bam_fhand = NamedTemporaryFile(suffix='.bam')
    bwa = map_with_bwamem(index_fpath, interleave_fpath=in_fhand.name,
                          extra_params=['-a', '-M'])
    map_process_to_sortedbam(bwa, bam_fhand.name, key='queryname')

    # the kinds are checked in this order, any other kind is a failure
    expected_names = ((NON_CHIMERIC, 'seq1'),
                      (UNKNOWN, 'seq3'),
                      (CHIMERA, 'seq2'))
    classified = classify_mapped_reads(bam_fhand, mate_distance=2000)
    for pair, kind in classified:
        for known_kind, seq_name in expected_names:
            if kind == known_kind:
                assert seq_name in get_name(pair[0])
                break
        else:
            self.fail()
def classify_chimeras(in_fhand, index_fpath, mate_distance, out_fhand,
                      chimeras_fhand=None, unknown_fhand=None, tempdir=None,
                      threads=None, settings=get_setting('CHIMERAS_SETTINGS')):
    '''It maps sequences from input files, sorts them and writes to output
    files according to its classification

    in_fhand -- file object with the reads, its .name is given to the mapper
                as an interleaved file.
    index_fpath -- index path given to map_with_bwamem.
    mate_distance -- passed to classify_mapped_reads.
    out_fhand -- the NON_CHIMERIC pairs are written here.
    chimeras_fhand -- the CHIMERA pairs are written here, if it is None they
                      are dropped.
    unknown_fhand -- the UNKNOWN pairs are written here, if it is None they
                     are dropped.
    tempdir -- passed to map_process_to_sortedbam.
    threads -- passed to map_with_bwamem.
    settings -- passed to classify_mapped_reads.
    '''
    bam_fhand = NamedTemporaryFile(suffix='.bam')
    # bwa mem flags; presumably -a keeps all the alignments and -M marks the
    # shorter split hits as secondary -- confirm with the BWA manual
    extra_params = ['-a', '-M']
    # threads is forwarded so the caller's setting reaches the mapper
    bwa = map_with_bwamem(index_fpath, interleave_fpath=in_fhand.name,
                          extra_params=extra_params, threads=threads)
    map_process_to_sortedbam(bwa, bam_fhand.name, key='queryname',
                             tempdir=tempdir)
    for pair, kind in classify_mapped_reads(bam_fhand, settings=settings,
                                            mate_distance=mate_distance):
        # the kinds are compared by value, an identity test would depend on
        # the constants being the very same objects
        if kind == NON_CHIMERIC:
            write_seqs(pair, out_fhand)
        elif kind == CHIMERA and chimeras_fhand is not None:
            write_seqs(pair, chimeras_fhand)
        elif kind == UNKNOWN and unknown_fhand is not None:
            write_seqs(pair, unknown_fhand)
def test_classify_paired_reads(self):
    '''It checks the kind assigned to the seq1, seq2 and seq3 read pairs'''
    index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')

    # Non chimeric
    non_chimeric_reads = ('>seq1 1:N:0:GATCAG\n'
                          'GGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT\n'
                          '>seq1 2:N:0:GATCAG\n'
                          'AGGAGGGATCGGGCACCCACGGCGCGGTAGACTGAGGCCTTCTCGAACT\n')
    # Chimeric
    chimeric_reads = ('>seq2 1:N:0:GATCAG\n'
                      'AAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGTCTGCGATCCC\n'
                      '>seq2 2:N:0:GATCAG\n'
                      'ACGTGGATGCGGCGACGGCCCTACGGCACATACTGTTATTAGGGTCACT\n')
    # unknown
    unknown_reads = ('>seq3 1:N:0:GATCAG\n'
                     'AGTGACCCTAATAACAGTATGTGCCGTAGGGCCGTCGCCGCATCCACGT\n'
                     '>seq3 2:N:0:GATCAG\n'
                     'GTCGTGCGCAGCCATTGAGACCTTCCTAGGGTTTTCCCCATGGAATCGG\n')

    in_fhand = NamedTemporaryFile()
    # the unknown pair goes between the other two in the interleaved file
    in_fhand.write(non_chimeric_reads + unknown_reads + chimeric_reads)
    in_fhand.flush()

    bam_fhand = NamedTemporaryFile(suffix='.bam')
    mapping = map_with_bwamem(index_fpath, extra_params=['-a', '-M'],
                              interleave_fpath=in_fhand.name)
    map_process_to_sortedbam(mapping, bam_fhand.name, key='queryname')

    # checked in order, a kind not listed here makes the test fail
    name_for_kind = [(NON_CHIMERIC, 'seq1'), (UNKNOWN, 'seq3'),
                     (CHIMERA, 'seq2')]
    for pair, kind in classify_mapped_reads(bam_fhand, mate_distance=2000):
        for candidate_kind, expected_name in name_for_kind:
            if kind == candidate_kind:
                assert expected_name in get_name(pair[0])
                break
        else:
            self.fail()
def _pre_trim(self, trim_packet):
    '''It maps the passed seqs and keeps the resulting sorted bam.

    The seqs found in trim_packet[SEQS_PASSED] are written to a temporary
    file, mapped with map_with_bwamem using self._index_fpath and the
    output, sorted by query name, ends up in a temporary file that is
    stored in self._bam_fhand.
    '''
    # SEQS_PASSED holds groups of seqs, the mapper gets them flattened
    passed_seqs = []
    for seq_group in trim_packet[SEQS_PASSED]:
        passed_seqs.extend(seq_group)

    reads_fhand = NamedTemporaryFile(suffix='.trimming', dir=self._tempdir)
    write_seqs(passed_seqs, reads_fhand)
    reads_fhand.flush()

    mapping = map_with_bwamem(self._index_fpath,
                              interleave_fpath=reads_fhand.name)
    sorted_bam_fhand = NamedTemporaryFile(dir=self._tempdir)
    map_process_to_sortedbam(mapping, sorted_bam_fhand.name,
                             tempdir=self._tempdir, key='queryname')
    self._bam_fhand = sorted_bam_fhand
    reads_fhand.close()
def _pre_trim(self, trim_packet):
    '''It prepares the bam with the mapping of the seqs to trim.

    Every seq in trim_packet[SEQS_PASSED] is written to a temporary reads
    file that is mapped with map_with_bwamem against self._index_fpath. The
    mapping is sorted by query name into a temporary file that is left in
    self._bam_fhand.
    '''
    flat_seqs = []
    for group in trim_packet[SEQS_PASSED]:
        for seq in group:
            flat_seqs.append(seq)

    tempdir = self._tempdir
    reads_fhand = NamedTemporaryFile(dir=tempdir, suffix='.trimming')
    write_seqs(flat_seqs, reads_fhand)
    # the mapper reads the file by its path, so the content has to be flushed
    reads_fhand.flush()

    bwa_process = map_with_bwamem(self._index_fpath,
                                  interleave_fpath=reads_fhand.name)
    bam_fhand = NamedTemporaryFile(dir=tempdir)
    map_process_to_sortedbam(bwa_process, bam_fhand.name, key='queryname',
                             tempdir=tempdir)
    self._bam_fhand = bam_fhand
    reads_fhand.close()
def calculate_distance_distribution(interleave_fhand, index_fpath,
                                    max_clipping, max_distance=None,
                                    tempdir=None, threads=None):
    '''It maps the interleaved reads and returns the distance distribution.

    The reads are mapped with map_with_bwamem, the mapping is sorted by query
    name into a temporary bam and that bam is handed, along with max_clipping
    and max_distance, to calculate_distance_distribution_in_bam, whose result
    is returned.
    '''
    sorted_bam_fhand = NamedTemporaryFile(suffix='.bam')
    mapping = map_with_bwamem(index_fpath, threads=threads,
                              extra_params=['-a', '-M'],
                              interleave_fpath=interleave_fhand.name)
    map_process_to_sortedbam(mapping, sorted_bam_fhand.name, tempdir=tempdir,
                             key='queryname')
    distribution = calculate_distance_distribution_in_bam(
        sorted_bam_fhand, max_distance=max_distance, max_clipping=max_clipping)
    return distribution
def classify_chimeras(in_fhand, index_fpath, mate_distance, out_fhand,
                      chimeras_fhand=None, unknown_fhand=None, tempdir=None,
                      threads=None, settings=get_setting('CHIMERAS_SETTINGS')):
    '''It maps sequences from input files, sorts them and writes to output
    files according to its classification

    in_fhand -- file object with the reads, its .name is given to the mapper
                as an interleaved file.
    index_fpath -- index path given to map_with_bwamem.
    mate_distance -- passed to classify_mapped_reads.
    out_fhand -- the NON_CHIMERIC pairs are written here.
    chimeras_fhand -- the CHIMERA pairs are written here, if it is None they
                      are dropped.
    unknown_fhand -- the UNKNOWN pairs are written here, if it is None they
                     are dropped.
    tempdir -- passed to map_process_to_sortedbam.
    threads -- passed to map_with_bwamem.
    settings -- passed to classify_mapped_reads.
    '''
    bam_fhand = NamedTemporaryFile(suffix='.bam')
    # bwa mem flags; presumably -a keeps all the alignments and -M marks the
    # shorter split hits as secondary -- confirm with the BWA manual
    extra_params = ['-a', '-M']
    # threads is forwarded so the caller's setting reaches the mapper
    bwa = map_with_bwamem(index_fpath, interleave_fpath=in_fhand.name,
                          extra_params=extra_params, threads=threads)
    map_process_to_sortedbam(bwa, bam_fhand.name, key='queryname',
                             tempdir=tempdir)
    for pair, kind in classify_mapped_reads(bam_fhand, settings=settings,
                                            mate_distance=mate_distance):
        # the kinds are compared by value, an identity test would depend on
        # the constants being the very same objects
        if kind == NON_CHIMERIC:
            write_seqs(pair, out_fhand)
        elif kind == CHIMERA and chimeras_fhand is not None:
            write_seqs(pair, chimeras_fhand)
        elif kind == UNKNOWN and unknown_fhand is not None:
            write_seqs(pair, unknown_fhand)
def calculate_distance_distribution(interleave_fhand, index_fpath,
                                    max_clipping, max_distance=None,
                                    tempdir=None, threads=None):
    '''It returns the mate distance counts split by pair orientation.

    The interleaved reads are mapped with map_with_bwamem and sorted by query
    name into a temporary bam. For each read pair every combination of the
    alignments that _get_totally_mapped_alignments yields for both mates is
    looked at, and the ones located in the same reference are counted under
    'outies', 'innies' or 'others'.

    When max_distance is not None only the distances strictly lower than it
    are counted. The result is a dict of IntCounters indexed by distance.
    '''
    sorted_bam_fhand = NamedTemporaryFile(suffix='.bam')
    mapping = map_with_bwamem(index_fpath, threads=threads,
                              extra_params=['-a', '-M'],
                              interleave_fpath=interleave_fhand.name)
    map_process_to_sortedbam(mapping, sorted_bam_fhand.name, tempdir=tempdir,
                             key='queryname')
    alignments = AlignmentFile(sorted_bam_fhand.name)

    counters = dict((label, IntCounter())
                    for label in ('outies', 'innies', 'others'))

    for same_name_reads in _group_alignments_reads_by_qname(alignments):
        mates = _split_mates(same_name_reads)
        for first in _get_totally_mapped_alignments(mates[0], max_clipping):
            for second in _get_totally_mapped_alignments(mates[1],
                                                         max_clipping):
                # mates in different references have no distance to count
                if first.rname != second.rname:
                    continue
                candidate = [first, second]
                distance = _find_distance(candidate)
                if _mates_are_outies(candidate):
                    orientation = 'outies'
                else:
                    orientation = ('innies' if _mates_are_innies(candidate)
                                   else 'others')
                if max_distance is None or max_distance > distance:
                    counters[orientation][distance] += 1
    return counters