def test_write_fasta(self):
    '''Test write_fasta'''
    s = seed.Seed(seq='GAAGGCGGCAGC')
    tmpfile = 'tmp.seed.fa'
    expected = os.path.join(data_dir, 'seed_test.write_fasta.fa')
    try:
        s.write_fasta(tmpfile, 'spam')
        self.assertTrue(filecmp.cmp(tmpfile, expected, shallow=False))
    finally:
        # Clean up even when the comparison fails, so a failing run does not
        # leave the scratch file behind in the working directory. The
        # existence check avoids masking the real error if nothing was written.
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)
def test_extend_with_reads_as_single_end(self):
    '''Test _extend_with_reads_as_single_end'''
    # Seed AGGCT should end up as TGAGGCTAT after being extended with both
    # reads files.
    s = seed.Seed(seq='AGGCT', ext_min_cov=1, verbose=2)
    reads_files = [
        os.path.join(data_dir, filename)
        for filename in ('kcount_test.reads_1.fasta', 'kcount_test.reads_2.fasta')
    ]
    s._extend_with_reads_as_single_end(*reads_files)
    self.assertEqual('TGAGGCTAT', s.seq)
def test_extensions_from_reads_file(self):
    '''Test _extensions_from_reads_file'''
    s = seed.Seed(seq='AGGCT')
    # (reads file, expected left extensions, expected right extensions)
    cases = (
        ('kcount_test.reads_1.fasta', [], ['A', 'AT', 'AT']),
        ('kcount_test.reads_2.fasta', ['G', 'TG', 'TG'], []),
    )
    for filename, want_left, want_right in cases:
        reads_path = os.path.join(data_dir, filename)
        got_left, got_right = s._extensions_from_reads_file(reads_path)
        self.assertListEqual(got_left, want_left)
        self.assertListEqual(got_right, want_right)
def add_new_seed_contig(self, reads1, reads2, contig_name=None, max_attempts=10):
    '''Try to build a new seed from the reads and add it as a contig.

    When contigs already exist, the candidate reads for seeding are first
    written to a temporary directory (created in the current working
    directory): either the reads selected by mapping against contig_name,
    or the output of _get_unmapped_pairs when contig_name is None. With no
    existing contigs, reads1/reads2 are used directly.

    Up to max_attempts seeds are tried. An attempt is accepted once the
    extended seed is at least 75% of self.seed_stop_length long; the loop
    stops early if Seed produces no sequence. Every extended seed sequence
    is recorded in self.used_seeds.

    Returns the name of the new contig ('seeded.' followed by a zero-padded
    counter not already in self.contigs), or None if no seed was made.
    The temporary directory is removed even if an error is raised.
    '''
    tmpdir = None
    made_seed = False
    s = None

    try:
        if len(self.contigs):
            tmpdir = tempfile.mkdtemp(prefix='tmp.make_seed.', dir=os.getcwd())
            tmp_prefix = os.path.join(tmpdir, 'out')
            seed_reads1 = tmp_prefix + '_1.fa'
            seed_reads2 = tmp_prefix + '_2.fa'
            if contig_name is not None:
                # NOTE(review): flags 5/8 presumably select unmapped reads
                # whose mate mapped to contig_name - confirm against _map_reads.
                self._map_reads(reads1, reads2, tmp_prefix, required_flag=5, exclude_flag=8, mate_ref=contig_name)
                mapping.bam_to_fasta(tmp_prefix + '.bam', seed_reads1)
                # Only a single reads file is produced in this mode.
                seed_reads2 = None
            else:
                self._get_unmapped_pairs(reads1, reads2, tmp_prefix)
        else:
            seed_reads1 = reads1
            seed_reads2 = reads2

        for i in range(max_attempts):
            s = seed.Seed(
                reads1=seed_reads1,
                reads2=seed_reads2,
                extend_length=self.seed_ext_max_bases,
                seed_length=self.seed_start_length,
                seed_min_count=self.seed_min_kmer_count,
                seed_max_count=self.seed_max_kmer_count,
                ext_min_cov=self.seed_min_cov,
                ext_min_ratio=self.seed_min_ratio,
                verbose=self.verbose,
                kmc_threads=self.kmc_threads,
                map_threads=self.threads,
                sequences_to_ignore=self.used_seeds,
                contigs_to_check=self.contigs,
            )

            if s.seq is None or len(s.seq) == 0:
                break

            if self.seed_overlap_length is None:
                s.overlap_length = len(s.seq)
            else:
                s.overlap_length = self.seed_overlap_length

            # Extension uses the full input reads, not the seeding subset.
            s.extend(reads1, reads2, self.seed_stop_length)
            self.used_seeds.add(s.seq)

            if len(s.seq) >= 0.75 * self.seed_stop_length:
                made_seed = True
                break
            elif self.verbose:
                print(" Couldn't extend seed enough. That was attempt", i+1, 'of', max_attempts, flush=True)
    finally:
        # Previously the directory was only removed on the success path, so
        # any exception above left it behind in the working directory.
        if tmpdir is not None:
            shutil.rmtree(tmpdir)

    if not made_seed or len(s.seq) == 0:
        return None

    if self.verbose:
        print(" Extended seed OK.", flush=True)

    # Pick the first 'seeded.NNNNN' name that is not already taken.
    counter = 1
    new_name = 'seeded.' + str(counter).zfill(5)
    while new_name in self.contigs:
        counter += 1
        new_name = 'seeded.' + str(counter).zfill(5)

    self._add_contig(pyfastaq.sequences.Fasta(new_name, s.seq))
    return new_name
def test_extension_from_read(self):
    '''Test _extension_from_read'''
    s = seed.Seed(seq='AGGCT')

    # (read sequence, expected extension) with the default direction.
    default_cases = (
        ('AAAAA', None),
        ('AGGC', None),
        ('AGGCTA', 'A'),
        ('AGGCTAT', 'AT'),
        ('GGGAGGCTAT', 'AT'),
        ('TTAGCCT', 'AA'),
    )
    for read_seq, expected in default_cases:
        read = pyfastaq.sequences.Fasta('x', read_seq)
        self.assertEqual(expected, s._extension_from_read(read))

    # (read sequence, expected extension) when called with left=True.
    left_cases = (
        ('AAAAA', None),
        ('AGGCTA', None),
        ('GTAGGCTA', 'GT'),
        ('GTAGGCTATTC', 'GT'),
        ('AGCCTAC', 'GT'),
    )
    for read_seq, expected in left_cases:
        read = pyfastaq.sequences.Fasta('x', read_seq)
        self.assertEqual(expected, s._extension_from_read(read, left=True))
def _make_new_seed(self, seed_name):
    '''Build a replacement seed for seed_name and write it to a FASTA file.

    Reads are extracted from self.bam_file for reference seed_name into
    <self.tmpdir>/out.<seed_name>.reads_1.fa. If the original seed is longer
    than self.seed_stop_length, start/end are set to a window of
    seed_stop_length bases centred on the middle of the original seed;
    otherwise both are None. (NOTE(review): how bam_file_to_region_fasta
    interprets start/end, e.g. inclusive or 0-based, is not visible here.)

    A seed.Seed is then built from those reads and extended using
    self.reads1/self.reads2 up to self.seed_stop_length. The result, with
    10 bases removed from each end, is written as 'seed.<seed_name>' to
    <self.tmpdir>/out.<seed_name>.fa.

    Returns None. If the new seed has length 0, a warning is printed and
    no output FASTA is written.
    '''
    if self.verbose:
        print('Making new seed for', seed_name, ' ... start')

    tmp_prefix = os.path.join(self.tmpdir, 'out')
    seed_reads = tmp_prefix + '.' + seed_name + '.reads_1.fa'

    original_length = len(self.original_seeds[seed_name])
    if original_length > self.seed_stop_length:
        start = int(0.5 * original_length - 0.5 * self.seed_stop_length)
        end = int(0.5 * original_length + 0.5 * self.seed_stop_length)
    else:
        start = None
        end = None

    if self.verbose:
        print('Making new seed for', seed_name, ' ... getting reads')
    mapping.bam_file_to_region_fasta(self.bam_file, seed_reads, seed_name, start, end)

    if self.verbose:
        print('Making new seed for', seed_name, ' ... finding most common kmer')
    new_seed = seed.Seed(
        extend_length=self.extend_length,
        overlap_length=self.overlap_length,
        reads1=seed_reads,
        ext_min_cov=self.ext_min_cov,
        ext_min_ratio=self.ext_min_ratio,
        verbose=self.verbose,
        seed_length=self.seed_length,
        seed_min_count=self.seed_min_count,
        seed_max_count=self.seed_max_count,
        kmc_threads=self.kmc_threads,
        map_threads=self.threads,
    )

    if len(new_seed) == 0:
        print('Warning: could not get most common kmer for', seed_name)
        return

    if self.verbose:
        print('Making new seed for', seed_name, ' ... extending most common kmer')
    new_seed.extend(self.reads1, self.reads2, self.seed_stop_length)

    f = pyfastaq.utils.open_file_write(tmp_prefix + '.' + seed_name + '.fa')
    try:
        print(pyfastaq.sequences.Fasta('seed.' + seed_name, new_seed.seq[10:-10]), file=f)
    finally:
        # Close the handle even if writing the record fails.
        pyfastaq.utils.close(f)

    if self.verbose:
        print('Making new seed for', seed_name, ' ... finished')
def test_len(self):
    '''Test len'''
    s = seed.Seed(seq='AGGCT')
    length_with_seq = len(s)
    self.assertEqual(5, length_with_seq)

    # With no sequence set, the length should be reported as zero.
    s.seq = None
    length_without_seq = len(s)
    self.assertEqual(0, length_without_seq)