def test_get_ids(self):
    '''_get_ids should return the expected set of ids for the test input file'''
    fasta_path = os.path.join(data_dir, 'cdhit_test_get_ids.fa')
    runner = cdhit.Runner(fasta_path, 'out')
    self.assertEqual({'id1', 'id2', 'id3'}, runner._get_ids(fasta_path))
def test_get_run_cmd_with_unlimited_memory(self):
    '''get_run_cmd with memory_limit=0 should build a command that ends in -M 0'''
    fa_infile = os.path.join(data_dir, 'cdhit_test_run_get_clusters_from_dict_rename.in.fa')
    r = cdhit.Runner(fa_infile, memory_limit=0)
    run_cmd = r.get_run_cmd('foo/bar/file.out')
    # Raw string with escaped dots: an unescaped '.' matches any character,
    # so the pattern would also have accepted e.g. "-c 0x9" or "fileXout".
    # The leading '.+' is intentionally left as a wildcard for the program name/input.
    expected_pattern = r'^.+ -o foo/bar/file\.out -c 0\.9 -T 1 -s 0\.0 -d 0 -bak 1 -M 0$'
    match = re.search(expected_pattern, run_cmd)
    self.assertIsNotNone(match, msg="Command output was " + run_cmd)
def test_fake_run_fail(self):
    '''fake_run should raise cdhit.Error when the input has non-unique names'''
    infile = os.path.join(data_dir, 'cdhit_test_fake_run.non-unique.in.fa')
    tmpfile = 'tmp.cdhit_test_fake_run.out.non-unique.fa'
    r = cdhit.Runner(infile, tmpfile)
    try:
        with self.assertRaises(cdhit.Error):
            # The return value is irrelevant here; only the exception matters.
            r.fake_run()
    finally:
        # Remove the output file even if the assertion fails, so a failed
        # run does not leave a stray file behind for later test runs.
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)
def test_run_min_cluster_number_42(self):
    '''run() with min_cluster_number=42 should name the clusters starting at 42'''
    fasta_path = os.path.join(data_dir, 'cdhit_test_run.in.fa')
    runner = cdhit.Runner(fasta_path, min_cluster_number=42)
    got = runner.run()
    want = {'42': {'seq1', 'seq2', 'seq3'}, '43': {'seq4'}}
    self.assertEqual(got, want)
def test_run(self):
    '''run() should return the expected clusters, named from 0, for the test input'''
    fasta_path = os.path.join(data_dir, 'cdhit_test_run.in.fa')
    runner = cdhit.Runner(fasta_path)
    got = runner.run()
    want = {'0': {'seq1', 'seq2', 'seq3'}, '1': {'seq4'}}
    self.assertEqual(got, want)
def test_rename_fasta(self):
    '''_rename_fasta should write a file identical to the expected renamed FASTA'''
    infile = os.path.join(data_dir, 'cdhit_test_rename_fasta.in.fa')
    tmpfile = 'tmp.rename_fasta.out.fa'
    expected = os.path.join(data_dir, 'cdhit_test_rename_fasta.out.fa')
    names_dict = {'a': 'seq1', 'b': 'seq2', 'c': 'seq3'}
    r = cdhit.Runner(infile, 'out')
    try:
        r._rename_fasta(infile, tmpfile, names_dict)
        # shallow=False forces a byte-by-byte comparison of the contents.
        self.assertTrue(filecmp.cmp(expected, tmpfile, shallow=False))
    finally:
        # Clean up even when the call or the comparison fails, so no
        # temporary file is left in the working directory.
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)
def test_run_get_clusters_from_file(self):
    '''run_get_clusters_from_file should return the clusters listed in the clusters file'''
    fasta_path = os.path.join(data_dir, 'cdhit_test_run_get_clusters_from_dict.in.fa')
    clusters_path = os.path.join(data_dir, 'cdhit_test_run_get_clusters_from_dict.in.clusters')
    runner = cdhit.Runner(fasta_path)
    all_names = {'seq1', 'seq2', 'seq3'}
    got = runner.run_get_clusters_from_file(clusters_path, all_names)
    want = {'0': {'seq1', 'seq2'}, '1': {'seq3'}}
    self.assertEqual(got, want)
def test_run_get_clusters_from_file_with_renaming(self):
    '''run_get_clusters_from_file with a rename_dict should report the renamed sequence name'''
    fasta_path = os.path.join(data_dir, 'cdhit_test_run_get_clusters_from_dict_rename.in.fa')
    clusters_path = os.path.join(data_dir, 'cdhit_test_run_get_clusters_from_dict.in.clusters')
    runner = cdhit.Runner(fasta_path)
    all_names = {'seq1', 'seq2_renamed', 'seq3'}
    got = runner.run_get_clusters_from_file(
        clusters_path,
        all_names,
        rename_dict={'seq2': 'seq2_renamed'},
    )
    want = {'0': {'seq1', 'seq2_renamed'}, '1': {'seq3'}}
    self.assertEqual(got, want)
def test_enumerate_fasta(self):
    '''_enumerate_fasta should write the expected file and return the expected names dict'''
    infile = os.path.join(data_dir, 'cdhit_test_enumerate_fasta.in.fa')
    expected_outfile = os.path.join(data_dir, 'cdhit_test_enumerate_fasta.out.fa')
    tmpfile = 'tmp.test_enumerate_fasta.out.fa'
    expected_dict = {'1': 'a', '2': 'b', '3': 'c'}
    r = cdhit.Runner(infile, 'out')
    try:
        got_dict = r._enumerate_fasta(infile, tmpfile)
        # shallow=False forces a byte-by-byte comparison of the contents.
        self.assertTrue(filecmp.cmp(expected_outfile, tmpfile, shallow=False))
        self.assertEqual(expected_dict, got_dict)
    finally:
        # Clean up even when the call or an assertion fails, so no
        # temporary file is left in the working directory.
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)
def test_run(self):
    '''run() should return the expected clusters and write the expected output file'''
    infile = os.path.join(data_dir, 'cdhit_test_run.in.fa')
    expected_outfile = os.path.join(data_dir, 'cdhit_test_run.out.fa')
    tmpfile = 'tmp.cdhit_test_run.out.fa'
    r = cdhit.Runner(infile, tmpfile)
    expected_clusters = {
        '0': {'seq1', 'seq2', 'seq3'},
        '1': {'seq4'},
    }
    try:
        clusters = r.run()
        self.assertEqual(clusters, expected_clusters)
        # shallow=False forces a byte-by-byte comparison of the contents.
        self.assertTrue(filecmp.cmp(tmpfile, expected_outfile, shallow=False))
    finally:
        # Clean up even when run() or an assertion fails, so no
        # temporary file is left in the working directory.
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)
def cluster_with_cdhit(self, outprefix, seq_identity_threshold=0.9, threads=1, length_diff_cutoff=0.0, memory_limit=None, nocluster=False, verbose=False, clusters_file=None):
    '''Cluster each reference type separately and merge the results.

    Writes the sequences to per-type files under outprefix, runs a
    cdhit.Runner on every non-empty one, and writes the merged cluster
    allocation to outprefix + '.clusters.tsv'.

    Clusters come from clusters_file when it is given, from fake_run()
    when nocluster is true, and from run() otherwise. Each runner gets a
    min_cluster_number one above the largest cluster key collected so far.

    Returns the merged clusters dict.
    '''
    ReferenceData._write_sequences_to_files(self.sequences, self.metadata, outprefix)
    merged = {}

    for ref_type in ('noncoding', 'noncoding.varonly', 'gene', 'gene.varonly'):
        ref_file = outprefix + '.' + ref_type + '.fa'
        if os.path.getsize(ref_file) == 0:
            # Nothing was written for this reference type.
            continue

        # Continue numbering from the highest cluster key collected so far.
        next_number = (1 + max(int(key) for key in merged)) if merged else 0

        runner = cdhit.Runner(
            ref_file,
            seq_identity_threshold=seq_identity_threshold,
            threads=threads,
            length_diff_cutoff=length_diff_cutoff,
            memory_limit=memory_limit,
            verbose=verbose,
            min_cluster_number=next_number,
        )

        if clusters_file is not None:
            found = runner.run_get_clusters_from_file(
                clusters_file,
                self.sequences,
                rename_dict=self.rename_dict,
            )
        elif nocluster:
            found = runner.fake_run()
        else:
            found = runner.run()

        merged.update(found)

    self.write_cluster_allocation_file(merged, outprefix + '.clusters.tsv')
    return merged
def test_parse_cluster_info_file(self):
    '''_parse_cluster_info_file should return the expected clusters and representatives'''
    fasta_path = os.path.join(data_dir, 'cdhit_test_parse_cluster_info_file.in.fa')
    clstr_path = os.path.join(data_dir, 'cdhit_test_parse_cluster_info_file.out.fa.bak.clstr')
    runner = cdhit.Runner(fasta_path, 'out')

    # Maps '1'..'4' to 'seq1'..'seq4'.
    number_to_name = {}
    for number in range(1, 5):
        number_to_name[str(number)] = 'seq' + str(number)

    got_clusters, got_reps = runner._parse_cluster_info_file(
        clstr_path,
        number_to_name,
        {'1', '4'},
    )

    self.assertEqual({'0': {'seq1', 'seq2', 'seq3'}, '1': {'seq4'}}, got_clusters)
    self.assertEqual({'1': '0', '4': '1'}, got_reps)
def _run_cdhit(self):
    '''Set self.cluster_ids from a cdhit.Runner built on self.db_fasta.

    Uses run() when self.run_cd_hit is true; otherwise uses fake_run(),
    printing a notice first when self.verbose is set.
    '''
    runner = cdhit.Runner(
        self.db_fasta,
        self.db_fasta_clustered,
        seq_identity_threshold=self.cdhit_seq_identity_threshold,
        threads=self.threads,
        length_diff_cutoff=self.cdhit_length_diff_cutoff,
        verbose=self.verbose,
    )

    if not self.run_cd_hit:
        if self.verbose:
            print('Skipping cd-hit because --no_cdhit option used')
        self.cluster_ids = runner.fake_run()
        return

    self.cluster_ids = runner.run()
def test_fake_run_fail(self):
    '''fake_run should raise cdhit.Error when the input has non-unique names'''
    fasta_path = os.path.join(data_dir, 'cdhit_test_fake_run.non-unique.in.fa')
    runner = cdhit.Runner(fasta_path)
    self.assertRaises(cdhit.Error, runner.fake_run)
def test_init_fail_invalid_memory(self):
    '''Constructing a Runner with memory_limit=-10 should raise cdhit.Error'''
    fasta_path = os.path.join(data_dir, 'cdhit_test_run.in.fa')
    self.assertRaises(cdhit.Error, cdhit.Runner, fasta_path, memory_limit=-10)
def test_init_fail_infile_missing(self):
    '''Constructing a Runner on the input name 'oopsnotafile' should raise cdhit.Error'''
    self.assertRaises(cdhit.Error, cdhit.Runner, 'oopsnotafile', 'out')