def _map_and_cluster_reads(self):
    '''Map the reads to the reference sequences and set up per-cluster state.

    Calls self._minimap_reads_to_all_ref_seqs() with the output prefix
    'minimap', then loads the files it wrote with
    self._load_minimap_files(). The loaded values are stored on self as
    cluster_to_rep, cluster_read_counts, cluster_base_counts, insert_hist
    and proper_pairs. self.cluster_to_dir maps every key of
    cluster_to_rep to a path of the same name inside self.tmp_dir.

    If cluster_read_counts is non-empty, self.read_store is built from
    the 'minimap.reads' file and written under
    <self.outdir>/read_store. The 'minimap.reads' file is then deleted.
    When self.clean is true, the remaining minimap intermediate files are
    deleted as well, on a best-effort basis.

    Progress messages go to stdout when self.verbose is true.
    '''
    if self.verbose:
        print('{:_^79}'.format(' Mapping reads to clustered genes '), flush=True)

    minimap_prefix = 'minimap'
    self._minimap_reads_to_all_ref_seqs(
        self.clusters_tsv,
        self.all_ref_seqs_fasta,
        self.reads_1,
        self.reads_2,
        minimap_prefix,
        verbose=self.verbose,
    )

    if self.verbose:
        print('Finished mapping\n')
        print('{:_^79}'.format(' Generating clusters '), flush=True)

    (
        self.cluster_to_rep,
        self.cluster_read_counts,
        self.cluster_base_counts,
        self.insert_hist,
        self.proper_pairs,
    ) = self._load_minimap_files(minimap_prefix, self.insert_hist_bin)

    self.cluster_to_dir = {
        name: os.path.join(self.tmp_dir, name) for name in self.cluster_to_rep
    }

    reads_file_for_read_store = minimap_prefix + '.reads'

    # Only build a read store when there is something to put in it.
    if len(self.cluster_read_counts) > 0:
        log_fh = sys.stdout if self.verbose else None
        self.read_store = read_store.ReadStore(
            reads_file_for_read_store,
            os.path.join(self.outdir, 'read_store'),
            log_fh=log_fh,
        )

    os.unlink(reads_file_for_read_store)

    if self.clean:
        leftover_suffixes = (
            'cluster2representative',
            'clusterCounts',
            'insertHistogram',
            'properPairs',
        )
        for suffix in leftover_suffixes:
            # Best-effort removal: a file that is missing or cannot be
            # deleted is not an error. Only filesystem errors are ignored,
            # so KeyboardInterrupt and SystemExit still propagate.
            try:
                os.unlink(minimap_prefix + '.' + suffix)
            except OSError:
                pass

    if self.verbose:
        print('Found', self.proper_pairs, 'proper read pairs from minimap')
        print('Total clusters to perform local assemblies:', len(self.cluster_to_dir), flush=True)
def test_get_reads_fq_interleave(self):
    '''Test get_reads fastq interleaved

    Builds a ReadStore from the test input and asks for the reads of
    'cluster2' with a single output file. Expects 6 reads and 24 bases,
    and an output file whose contents match the expected FASTQ.
    '''
    infile = os.path.join(data_dir, 'read_store_test_get_reads.in')
    expected = os.path.join(data_dir, 'read_store_test_get_reads.expected.reads.fq')
    outprefix = 'tmp.read_store_test_get_reads'
    reads = outprefix + '.reads_1.fq'
    tmp_files = (outprefix + '.gz', outprefix + '.gz.tbi', reads)

    try:
        rstore = read_store.ReadStore(infile, outprefix)
        got_reads, got_bases = rstore.get_reads('cluster2', reads)
        self.assertEqual(6, got_reads)
        self.assertEqual(24, got_bases)
        # shallow=False forces a byte-by-byte comparison instead of
        # trusting matching os.stat() signatures.
        self.assertTrue(filecmp.cmp(expected, reads, shallow=False))
    finally:
        # Clean up even when an assertion fails: other tests reuse this
        # output prefix and must not see stale files.
        for path in tmp_files:
            if os.path.exists(path):
                os.unlink(path)
def test_clean(self):
    '''Test clean

    Checks which of the files <outprefix>, <outprefix>.gz and
    <outprefix>.gz.tbi exist at three points: before the ReadStore is
    made (none), after it is made (only the .gz and .gz.tbi), and after
    clean() is called (none again).
    '''
    infile = os.path.join(data_dir, 'read_store_test_clean.in')
    outprefix = 'tmp.read_store_test_clean'
    gz_file = outprefix + '.gz'
    tbi_file = outprefix + '.gz.tbi'
    every_file = (outprefix, gz_file, tbi_file)

    # Nothing may exist before the store is created.
    for path in every_file:
        self.assertFalse(os.path.exists(path))

    rstore = read_store.ReadStore(infile, outprefix)
    self.assertFalse(os.path.exists(outprefix))
    for path in (gz_file, tbi_file):
        self.assertTrue(os.path.exists(path))

    # clean() must remove everything the store created.
    rstore.clean()
    for path in every_file:
        self.assertFalse(os.path.exists(path))
def _test_run(self):
    '''test run

    Builds a ReadStore from the test input, runs a ReadFilter over it
    with the reference FASTA and checks the returned read and base counts
    (12 and 912) and both output FASTQ files against the expected ones.

    NOTE(review): the leading underscore keeps this method from matching
    the usual 'test*' discovery pattern, so it is presumably disabled on
    purpose - confirm before renaming.
    '''
    rstore_infile = os.path.join(data_dir, 'read_filter_test_run.in.read_store')
    ref_fasta = os.path.join(data_dir, 'read_filter_test_run.in.ref.fa')
    expected_reads1 = os.path.join(data_dir, 'read_filter_test_run.expected.reads_1.fq')
    expected_reads2 = os.path.join(data_dir, 'read_filter_test_run.expected.reads_2.fq')
    tmp_rstore_prefix = 'tmp.filter_test_run.read_store'
    tmp_reads1 = 'tmp.filter_test_run.reads_1.fq'
    tmp_reads2 = 'tmp.filter_test_run.reads_2.fq'

    rstore = read_store.ReadStore(rstore_infile, tmp_rstore_prefix)
    try:
        rfilter = read_filter.ReadFilter(rstore, ref_fasta, '1', sys.stdout)
        got_reads, got_bases = rfilter.run(tmp_reads1, tmp_reads2)
        self.assertEqual(12, got_reads)
        self.assertEqual(912, got_bases)
        self.assertTrue(filecmp.cmp(expected_reads1, tmp_reads1, shallow=False))
        self.assertTrue(filecmp.cmp(expected_reads2, tmp_reads2, shallow=False))
    finally:
        # Clean up even when an assertion fails so that no temporary
        # files are left behind for later runs.
        for path in (tmp_reads1, tmp_reads2):
            if os.path.exists(path):
                os.unlink(path)
        rstore.clean()
def test_get_reads_fq_pair(self):
    '''Test get_reads fastq pair

    Builds a ReadStore from the test input and asks for the reads of
    'cluster2' with two output files (out2 given). Expects 6 reads and
    24 bases, and both output files to match the expected FASTQ files.
    '''
    infile = os.path.join(data_dir, 'read_store_test_get_reads.in')
    expected1 = os.path.join(data_dir, 'read_store_test_get_reads.expected.reads_1.fq')
    expected2 = os.path.join(data_dir, 'read_store_test_get_reads.expected.reads_2.fq')
    outprefix = 'tmp.read_store_test_get_reads'
    reads1 = outprefix + '.reads_1.fq'
    reads2 = outprefix + '.reads_2.fq'
    tmp_files = (outprefix + '.gz', outprefix + '.gz.tbi', reads1, reads2)

    try:
        rstore = read_store.ReadStore(infile, outprefix)
        got_reads, got_bases = rstore.get_reads('cluster2', reads1, out2=reads2)
        self.assertEqual(6, got_reads)
        self.assertEqual(24, got_bases)
        # shallow=False forces a byte-by-byte comparison instead of
        # trusting matching os.stat() signatures.
        self.assertTrue(filecmp.cmp(expected1, reads1, shallow=False))
        self.assertTrue(filecmp.cmp(expected2, reads2, shallow=False))
    finally:
        # Clean up even when an assertion fails: other tests reuse this
        # output prefix and must not see stale files.
        for path in tmp_files:
            if os.path.exists(path):
                os.unlink(path)
def test_get_reads_subset(self):
    '''Test get_reads subset

    Builds a ReadStore from the test input and asks for the reads of
    'cluster2' restricted to wanted_ids={1, 11}, written to two output
    files. Expects 4 reads and 16 bases, and both output files to match
    the expected subset FASTQ files.
    '''
    infile = os.path.join(data_dir, 'read_store_test_get_reads.in')
    expected1 = os.path.join(data_dir, 'read_store_test_get_reads.expected.reads_subset.1.fq')
    expected2 = os.path.join(data_dir, 'read_store_test_get_reads.expected.reads_subset.2.fq')
    wanted_ids = {1, 11}
    outprefix = 'tmp.read_store_test_get_reads'
    reads1 = outprefix + '.reads_1.fq'
    reads2 = outprefix + '.reads_2.fq'
    tmp_files = (outprefix + '.gz', outprefix + '.gz.tbi', reads1, reads2)

    try:
        rstore = read_store.ReadStore(infile, outprefix)
        got_reads, got_bases = rstore.get_reads(
            'cluster2', reads1, out2=reads2, wanted_ids=wanted_ids
        )
        self.assertEqual(4, got_reads)
        self.assertEqual(16, got_bases)
        # shallow=False forces a byte-by-byte comparison instead of
        # trusting matching os.stat() signatures.
        self.assertTrue(filecmp.cmp(expected1, reads1, shallow=False))
        self.assertTrue(filecmp.cmp(expected2, reads2, shallow=False))
    finally:
        # Clean up even when an assertion fails: other tests reuse this
        # output prefix and must not see stale files.
        for path in tmp_files:
            if os.path.exists(path):
                os.unlink(path)