def test_bam_file_to_fasta_pair_files_region(self):
    '''bam_file_to_fasta_pair_files restricted to a region writes the expected FASTA pair'''
    bam_in = os.path.join(data_dir, 'mapping_test.smalt.out.sorted.bam')
    made = ('tmp.to_fasta_1.fa', 'tmp.to_fasta_2.fa')
    wanted = ('mapping_test.bam_to_region_1.fa', 'mapping_test.bam_to_region_2.fa')

    mapping.bam_file_to_fasta_pair_files(bam_in, made[0], made[1], chromosome='ref', start=25, end=150)

    # Compare each output against its reference file in data_dir.
    for expected_name, outfile in zip(wanted, made):
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, expected_name), outfile))

    # Only reached when both comparisons passed.
    for outfile in made:
        os.unlink(outfile)
def test_bam_file_to_fasta_pair_files(self):
    '''bam_file_to_fasta_pair_files on a whole BAM writes the expected FASTA pair'''
    bam_in = os.path.join(data_dir, 'mapping_test.smalt.out.bam')
    made = ('tmp.to_fasta_1.fa', 'tmp.to_fasta_2.fa')
    wanted = ('mapping_test.reads_1.fasta', 'mapping_test.reads_2.fasta')

    mapping.bam_file_to_fasta_pair_files(bam_in, made[0], made[1])

    # Compare each output against its reference file in data_dir.
    for expected_name, outfile in zip(wanted, made):
        self.assertTrue(filecmp.cmp(os.path.join(data_dir, expected_name), outfile))

    # Only reached when both comparisons passed.
    for outfile in made:
        os.unlink(outfile)
def _trim_strand_biased_ends(self, reads_prefix, out_prefix=None, tag_as_trimmed=False, break_contigs=False):
    '''Map the reads to the contigs, then trim or break up contigs using strand bias.

    reads_prefix: the reads are taken from reads_prefix + '_1.fa' and
        reads_prefix + '_2.fa'.
    out_prefix: when not None, bam_file_to_fasta_pair_files is run on the
        unsorted BAM with remove_proper_pairs=True, writing
        out_prefix + '_1.fa' and out_prefix + '_2.fa'.
    tag_as_trimmed: when True, every contig that was trimmed here and still
        has sequence left is added to self.contigs_trimmed_for_strand_bias,
        so later calls skip it.
    break_contigs: when True, contigs are not trimmed; instead each contig for
        which _subcontigs_from_strand_bias returns anything is removed and
        replaced by those subcontigs.

    All mapping output goes into a temporary directory made in the current
    working directory, which is deleted before returning.
    '''
    tmpdir = tempfile.mkdtemp(prefix='tmp.trim_strand_biased_ends.', dir=os.getcwd())
    tmp_prefix = os.path.join(tmpdir, 'out')
    sorted_bam = tmp_prefix + '.bam'
    # presumably written by _map_reads when sort_reads=True -- confirm there
    unsorted_bam = tmp_prefix + '.unsorted.bam'

    # map_minid is set to 0.9 for this mapping only. The finally clause puts
    # the previous value back even if the mapping raises, so a failure here
    # cannot leave the object with the temporary setting.
    original_map_minid = self.map_minid
    self.map_minid = 0.9
    try:
        self._map_reads(reads_prefix + '_1.fa', reads_prefix + '_2.fa', tmp_prefix, sort_reads=True)
    finally:
        self.map_minid = original_map_minid
    assert os.path.exists(sorted_bam)

    new_contigs = []
    contigs_to_remove = set()

    # Additions and removals are only collected in this loop and applied
    # afterwards, so self.contigs is not resized while it is iterated.
    for ctg in self.contigs:
        if break_contigs:
            subcontigs = self._subcontigs_from_strand_bias(sorted_bam, ctg)
            if len(subcontigs):
                new_contigs.extend(subcontigs)
                contigs_to_remove.add(ctg)
        elif ctg not in self.contigs_trimmed_for_strand_bias:
            self._trim_contig_for_strand_bias(sorted_bam, ctg)
            # contig could get completely trimmed so nothing left, in which
            # case, we need to remove it
            if len(self.contigs[ctg]) == 0:
                contigs_to_remove.add(ctg)
            elif tag_as_trimmed:
                self.contigs_trimmed_for_strand_bias.add(ctg)

    for ctg in contigs_to_remove:
        self._remove_contig(ctg)

    for ctg in new_contigs:
        # Fixed: this used to read self.self.seed_stop_length, which raised
        # AttributeError as soon as any contig was broken into subcontigs.
        self._add_contig(ctg, min_length=0.75 * self.seed_stop_length)

    if out_prefix is not None:
        mapping.bam_file_to_fasta_pair_files(unsorted_bam, out_prefix + '_1.fa', out_prefix + '_2.fa', remove_proper_pairs=True)

    shutil.rmtree(tmpdir)
def _trim_strand_biased_ends(self, reads_prefix, out_prefix=None, tag_as_trimmed=False, break_contigs=False):
    '''Map the reads to the contigs, then trim or break up contigs using strand bias.

    reads_prefix: the reads are taken from reads_prefix + '_1.fa' and
        reads_prefix + '_2.fa'.
    out_prefix: when not None, bam_file_to_fasta_pair_files is run on the
        unsorted BAM with remove_proper_pairs=True, writing
        out_prefix + '_1.fa' and out_prefix + '_2.fa'.
    tag_as_trimmed: when True, every contig that was trimmed here and still
        has sequence left is added to self.contigs_trimmed_for_strand_bias,
        so later calls skip it.
    break_contigs: when True, contigs are not trimmed; instead each contig for
        which _subcontigs_from_strand_bias returns anything is removed and
        replaced by those subcontigs.

    All mapping output goes into a temporary directory made in the current
    working directory, which is deleted before returning.
    '''
    tmpdir = tempfile.mkdtemp(prefix='tmp.trim_strand_biased_ends.', dir=os.getcwd())
    tmp_prefix = os.path.join(tmpdir, 'out')
    sorted_bam = tmp_prefix + '.bam'
    # presumably written by _map_reads when sort_reads=True -- confirm there
    unsorted_bam = tmp_prefix + '.unsorted.bam'

    # map_minid is set to 0.9 for this mapping only. The finally clause puts
    # the previous value back even if the mapping raises, so a failure here
    # cannot leave the object with the temporary setting.
    original_map_minid = self.map_minid
    self.map_minid = 0.9
    try:
        self._map_reads(reads_prefix + '_1.fa', reads_prefix + '_2.fa', tmp_prefix, sort_reads=True)
    finally:
        self.map_minid = original_map_minid
    assert os.path.exists(sorted_bam)

    new_contigs = []
    contigs_to_remove = set()

    # Additions and removals are only collected in this loop and applied
    # afterwards, so self.contigs is not resized while it is iterated.
    for ctg in self.contigs:
        if break_contigs:
            subcontigs = self._subcontigs_from_strand_bias(sorted_bam, ctg)
            if len(subcontigs):
                new_contigs.extend(subcontigs)
                contigs_to_remove.add(ctg)
        elif ctg not in self.contigs_trimmed_for_strand_bias:
            self._trim_contig_for_strand_bias(sorted_bam, ctg)
            # contig could get completely trimmed so nothing left, in which
            # case, we need to remove it
            if len(self.contigs[ctg]) == 0:
                contigs_to_remove.add(ctg)
            elif tag_as_trimmed:
                self.contigs_trimmed_for_strand_bias.add(ctg)

    for ctg in contigs_to_remove:
        self._remove_contig(ctg)

    for ctg in new_contigs:
        # Fixed: this used to read self.self.seed_stop_length, which raised
        # AttributeError as soon as any contig was broken into subcontigs.
        self._add_contig(ctg, min_length=0.75 * self.seed_stop_length)

    if out_prefix is not None:
        mapping.bam_file_to_fasta_pair_files(unsorted_bam, out_prefix + '_1.fa', out_prefix + '_2.fa', remove_proper_pairs=True)

    shutil.rmtree(tmpdir)
def _get_unmapped_pairs(self, reads1, reads2, out_prefix):
    '''Map reads1/reads2 and write the resulting read pairs out as two FASTA files.

    The reads are mapped with required_flag=12 (presumably SAM flag bits
    4 + 8, i.e. read and mate both unmapped -- confirm against _map_reads).
    The BAM at out_prefix + '.bam' is converted to out_prefix + '_1.fa' and
    out_prefix + '_2.fa', then the BAM is deleted.
    '''
    self._map_reads(reads1, reads2, out_prefix, required_flag=12)

    bam = out_prefix + '.bam'
    mate_files = [out_prefix + suffix for suffix in ('_1.fa', '_2.fa')]
    mapping.bam_file_to_fasta_pair_files(bam, *mate_files)

    # The BAM is only an intermediate; the FASTA pair is the result.
    os.unlink(bam)
def _read_pair_extension_iterations(self, reads_prefix, out_prefix, no_map_contigs=None):
    '''Run extension subiterations with the read pairs until extension stalls.

    reads_prefix: the reads are taken from reads_prefix + '_1.fa' and
        reads_prefix + '_2.fa'. These files are never deleted here.
    out_prefix: subiteration i passes out_prefix + '.' + str(i) to
        _extend_with_reads.
    no_map_contigs: passed through to _extend_with_reads; defaults to an
        empty set. self.contigs must hold more entries than it does.

    Every fifth subiteration the current reads are remapped (with map_minid
    temporarily 0.9) and bam_file_to_fasta_pair_files, called with
    remove_proper_pairs=True, writes a filtered read set named
    reads_prefix + '.subiter.<i>.reads' that later subiterations use.

    When a subiteration adds no bases, the contigs are trimmed once and
    extension is retried; if the retry also adds nothing the loop stops.

    Returns False if the strand bias trimming leaves no more contigs than
    len(no_map_contigs), otherwise True.
    '''
    if no_map_contigs is None:
        no_map_contigs = set()
    assert len(self.contigs) > len(no_map_contigs)

    def discard_filtered_reads(prefix):
        # Filtered read files made by this method are temporary. The guard
        # makes sure the caller's own input files are never deleted.
        if prefix != reads_prefix:
            os.unlink(prefix + '_1.fa')
            os.unlink(prefix + '_2.fa')

    if self.verbose:
        print('{:-^79}'.format(' ' + out_prefix + ' start extension subiteration 0001 '), flush=True)
    bases_added = self._extend_with_reads(reads_prefix, out_prefix + '.1', no_map_contigs)
    current_reads_prefix = reads_prefix
    if bases_added == 0:
        return True

    try_contig_trim = False
    i = 1
    while self._worth_extending() or try_contig_trim:
        i += 1
        if self.verbose:
            print('{:-^79}'.format(' ' + out_prefix + ' start extension subiteration ' + str(i).zfill(4) + ' '), flush=True)

        if i % 5 == 0:
            tmpdir = tempfile.mkdtemp(prefix='tmp.filter_reads.', dir=os.getcwd())
            tmp_prefix = os.path.join(tmpdir, 'out')
            bam = tmp_prefix + '.bam'
            # Restore map_minid in a finally clause so a failed mapping
            # cannot leave the temporary 0.9 setting behind.
            original_map_minid = self.map_minid
            self.map_minid = 0.9
            try:
                self._map_reads(current_reads_prefix + '_1.fa', current_reads_prefix + '_2.fa', tmp_prefix)
            finally:
                self.map_minid = original_map_minid
            filter_prefix = reads_prefix + '.subiter.' + str(i) + '.reads'
            mapping.bam_file_to_fasta_pair_files(bam, filter_prefix + '_1.fa', filter_prefix + '_2.fa', remove_proper_pairs=True)
            # The previous filtered set is superseded by the new one.
            discard_filtered_reads(current_reads_prefix)
            current_reads_prefix = filter_prefix
            shutil.rmtree(tmpdir)

        iter_prefix = out_prefix + '.' + str(i)
        bases_added = self._extend_with_reads(current_reads_prefix, iter_prefix, no_map_contigs)

        if bases_added == 0:
            if not try_contig_trim:
                if self.verbose:
                    print(' No bases added. Try trimming contigs')
                # Note: trimming uses the original reads, not the filtered set.
                self._trim_strand_biased_ends(reads_prefix, tag_as_trimmed=False)
                if len(self.contigs) <= len(no_map_contigs):
                    if self.verbose:
                        print(' lost contigs during trimming. No more iterations')
                    # Fixed: this early exit used to skip the cleanup below
                    # and leave the filtered read files on disk.
                    discard_filtered_reads(current_reads_prefix)
                    return False
                self.trim_contigs(self.contig_iter_trim)
                # Forces one more pass even if _worth_extending() says no.
                try_contig_trim = True
            else:
                if self.verbose:
                    print(' No bases added after trimming. No more iterations')
                break
        else:
            try_contig_trim = False

    discard_filtered_reads(current_reads_prefix)
    return True
def _get_unmapped_pairs(self, reads1, reads2, out_prefix):
    '''Map reads1/reads2 and write the resulting read pairs out as two FASTA files.

    The reads are mapped with required_flag=12 (presumably SAM flag bits
    4 + 8, i.e. read and mate both unmapped -- confirm against _map_reads).
    The BAM at out_prefix + '.bam' is converted to out_prefix + '_1.fa' and
    out_prefix + '_2.fa', then the BAM is deleted.
    '''
    self._map_reads(reads1, reads2, out_prefix, required_flag=12)

    bam = out_prefix + '.bam'
    mate_files = [out_prefix + suffix for suffix in ('_1.fa', '_2.fa')]
    mapping.bam_file_to_fasta_pair_files(bam, *mate_files)

    # The BAM is only an intermediate; the FASTA pair is the result.
    os.unlink(bam)
def _read_pair_extension_iterations(self, reads_prefix, out_prefix, no_map_contigs=None):
    '''Run extension subiterations with the read pairs until extension stalls.

    reads_prefix: the reads are taken from reads_prefix + '_1.fa' and
        reads_prefix + '_2.fa'. These files are never deleted here.
    out_prefix: subiteration i passes out_prefix + '.' + str(i) to
        _extend_with_reads.
    no_map_contigs: passed through to _extend_with_reads; defaults to an
        empty set. self.contigs must hold more entries than it does.

    Every fifth subiteration the current reads are remapped (with map_minid
    temporarily 0.9) and bam_file_to_fasta_pair_files, called with
    remove_proper_pairs=True, writes a filtered read set named
    reads_prefix + '.subiter.<i>.reads' that later subiterations use.

    When a subiteration adds no bases, the contigs are trimmed once and
    extension is retried; if the retry also adds nothing the loop stops.

    Returns False if the strand bias trimming leaves no more contigs than
    len(no_map_contigs), otherwise True.
    '''
    if no_map_contigs is None:
        no_map_contigs = set()
    assert len(self.contigs) > len(no_map_contigs)

    def discard_filtered_reads(prefix):
        # Filtered read files made by this method are temporary. The guard
        # makes sure the caller's own input files are never deleted.
        if prefix != reads_prefix:
            os.unlink(prefix + '_1.fa')
            os.unlink(prefix + '_2.fa')

    if self.verbose:
        print('{:-^79}'.format(' ' + out_prefix + ' start extension subiteration 0001 '), flush=True)
    bases_added = self._extend_with_reads(reads_prefix, out_prefix + '.1', no_map_contigs)
    current_reads_prefix = reads_prefix
    if bases_added == 0:
        return True

    try_contig_trim = False
    i = 1
    while self._worth_extending() or try_contig_trim:
        i += 1
        if self.verbose:
            print('{:-^79}'.format(' ' + out_prefix + ' start extension subiteration ' + str(i).zfill(4) + ' '), flush=True)

        if i % 5 == 0:
            tmpdir = tempfile.mkdtemp(prefix='tmp.filter_reads.', dir=os.getcwd())
            tmp_prefix = os.path.join(tmpdir, 'out')
            bam = tmp_prefix + '.bam'
            # Restore map_minid in a finally clause so a failed mapping
            # cannot leave the temporary 0.9 setting behind.
            original_map_minid = self.map_minid
            self.map_minid = 0.9
            try:
                self._map_reads(current_reads_prefix + '_1.fa', current_reads_prefix + '_2.fa', tmp_prefix)
            finally:
                self.map_minid = original_map_minid
            filter_prefix = reads_prefix + '.subiter.' + str(i) + '.reads'
            mapping.bam_file_to_fasta_pair_files(bam, filter_prefix + '_1.fa', filter_prefix + '_2.fa', remove_proper_pairs=True)
            # The previous filtered set is superseded by the new one.
            discard_filtered_reads(current_reads_prefix)
            current_reads_prefix = filter_prefix
            shutil.rmtree(tmpdir)

        iter_prefix = out_prefix + '.' + str(i)
        bases_added = self._extend_with_reads(current_reads_prefix, iter_prefix, no_map_contigs)

        if bases_added == 0:
            if not try_contig_trim:
                if self.verbose:
                    print(' No bases added. Try trimming contigs')
                # Note: trimming uses the original reads, not the filtered set.
                self._trim_strand_biased_ends(reads_prefix, tag_as_trimmed=False)
                if len(self.contigs) <= len(no_map_contigs):
                    if self.verbose:
                        print(' lost contigs during trimming. No more iterations')
                    # Fixed: this early exit used to skip the cleanup below
                    # and leave the filtered read files on disk.
                    discard_filtered_reads(current_reads_prefix)
                    return False
                self.trim_contigs(self.contig_iter_trim)
                # Forces one more pass even if _worth_extending() says no.
                try_contig_trim = True
            else:
                if self.verbose:
                    print(' No bases added after trimming. No more iterations')
                break
        else:
            try_contig_trim = False

    discard_filtered_reads(current_reads_prefix)
    return True