def setUp(self):
    """Create a cluster with a 30-50 core location and three CDS fixtures."""
    cluster = create_cluster()
    cluster.core_location = FeatureLocation(30, 50)
    self.cluster = cluster

    # CDS fixtures at fixed coordinates: 40-45, 20-25 and 120-125
    self.inside_cds = DummyCDS(40, 45)
    self.neighbour_cds = DummyCDS(20, 25)
    self.outside_cds = DummyCDS(120, 125)

    # the fresh cluster must not have any CDS features attached yet
    for attached in (self.cluster.cds_children, self.cluster.definition_cdses):
        assert not attached
def test_add_cds(self):
    """A CDS at 20-40 is accepted by a 20-40 collection, one at 120-140 is rejected."""
    area = FeatureLocation(20, 40)
    collection = CDSCollection(area, feature_type="test", child_collections=[])

    # a CDS spanning exactly the collection's location should be accepted
    accepted = DummyCDS(20, 40)
    collection.add_cds(accepted)
    assert accepted in collection.cds_children

    # a CDS well outside the collection's location should raise
    rejected = DummyCDS(120, 140)
    with self.assertRaisesRegex(ValueError, "not contained by"):
        collection.add_cds(rejected)
def setUp(self):
    """Set up the record, known sequence lengths and the mocked lookups."""
    self.geneclustergenes = {"CAG25752": ""}
    self.seq_record = Record("dummy")
    known_lengths = {"CAG25751.1": 253}
    self.seqlengths = known_lengths
    # used by parse_subject, but only if locus tag not in seqlengths
    mock('core.get_cds_lengths', returns=known_lengths)
    mock('Record.get_cds_by_name', returns=DummyCDS(1, 301))
def test_add_cds_propagation(self):
    """Adding a CDS to self.region must also show up in self.cluster, self.super and self.sub."""
    cds = DummyCDS(0, 10)
    assert cds.is_contained_by(self.region)

    collections = (self.cluster, self.super, self.sub, self.region)
    # ensure all empty to start with
    for collection in collections:
        assert not collection.cds_children
    assert not cds.region

    self.region.add_cds(cds)

    # every collection now holds exactly that one CDS
    only_cds = (cds, )
    for collection in collections:
        assert collection.cds_children == only_cds
    assert cds.region is self.region
def create_cds(start, end, products):
    """Build a DummyCDS carrying one CORE gene function per given product.

    The locus tag is "<start>-<end>-<products joined by '-'>".
    """
    joined_products = "-".join(products)
    locus_tag = "{}-{}-{}".format(start, end, joined_products)
    cds = DummyCDS(start, end, locus_tag=locus_tag)
    for product in products:
        cds.gene_functions.add(GeneFunction.CORE, "test", "dummy", product)
    return cds
def parse_subject_wrapper(self, subject_line):
    """Run core.parse_subject on a subject line with the record lookups patched.

    Arguments:
        subject_line: the subject line to hand to core.parse_subject

    Returns:
        whatever core.parse_subject returns for the line
    """
    # used by core.parse_subject, but only if locus tag not in self.seqlengths
    # NOTE: the keyword must be 'return_value'; any other keyword (such as
    # 'returns') merely sets an attribute of that name on the created mock
    # and leaves the patched function returning a fresh MagicMock
    with patch.object(core, 'get_cds_lengths', return_value=self.seqlengths):
        with patch.object(Record, 'get_cds_by_name', return_value=DummyCDS(1, 301)):
            return core.parse_subject(subject_line, self.seqlengths, self.seq_record)
def parse_subject_wrapper(self, subject_line):
    """Parse one subject line via core.parse_subject using a dummy record and no known lengths."""
    seq_record = Record("dummy")
    seqlengths = {}
    # both patched functions are used by core.parse_subject,
    # but only if the locus tag is not in seqlengths
    lengths_patch = patch.object(core, 'get_cds_lengths', return_value={})
    with lengths_patch:
        cds_patch = patch.object(Record, 'get_cds_by_name', return_value=DummyCDS(1, 101))
        with cds_patch:
            return core.parse_subject(subject_line, seqlengths, seq_record)
def build(self, early, late, strand=1, tail_strand_multiplier=1):
    """Combine two modules held by two CDSes and check the bookkeeping of the result.

    Arguments:
        early: the module that is the head on the forward strand
        late: the module that is the tail on the forward strand
        strand: the strand of the CDS at 50-110
        tail_strand_multiplier: multiplied with strand to get the strand
                                of the CDS at 500-560

    Returns:
        the result of combine_modules, which may be falsy
    """
    reverse = strand == -1
    # on the reverse strand the roles of the two modules swap
    head, tail = (late, early) if reverse else (early, late)

    # the CDS at 50-110 is always built first and always passed first
    upstream = CDSModuleInfo(DummyCDS(start=50, end=110, strand=strand),
                             [tail if reverse else head])
    downstream = CDSModuleInfo(
        DummyCDS(start=500, end=560, strand=strand * tail_strand_multiplier),
        [head if reverse else tail])

    # 'first' is whichever info holds the head, 'second' holds the tail
    first, second = (downstream, upstream) if reverse else (upstream, downstream)
    first_modules = list(first.modules)
    second_modules = list(second.modules)

    module = combine_modules(upstream, downstream)

    if not module:
        # nothing should be changed
        assert first_modules == first.modules
        assert second_modules == second.modules
        return module

    # head is replaced
    assert head not in first.modules
    assert len(first_modules) == len(first.modules), (first, second)
    assert module in first.modules
    # tail removed
    assert tail not in second.modules
    # and not replaced
    assert len(second_modules) - 1 == len(second.modules)
    return module
def test_single_file(self):
    """Without partitioning, all genes must end up in the single named file."""
    self.add_cdses_to_region([DummyCDS(1, i) for i in range(3, 6)])
    with TemporaryDirectory(change=True):
        files = core.write_fastas_with_all_genes(self.regions, "test.fasta")
        assert files == ["test.fasta"]
        assert os.path.exists("test.fasta")
        # three CDSes were requested per region, numbered consecutively
        expected = "".join(">L{0}\nS{0}\n".format(i)
                           for i in range(len(self.regions) * 3))
        # read via a context manager so the handle is closed before the
        # temporary directory is cleaned up
        with open("test.fasta") as handle:
            contents = handle.read()
        assert contents == expected
def add_module_references_to_record(module, record):
    """Add each domain of the module to the record, creating any missing parent CDS.

    A CDS is only created when the record raises KeyError for the domain's
    locus tag; it extends 10 beyond each end of the module's location.
    """
    for domain in module.domains:
        record.add_antismash_domain(domain)
        name = domain.locus_tag
        try:
            record.get_cds_by_name(name)
            continue  # the CDS already exists, nothing more to add
        except KeyError:
            pass
        location = module.location
        placeholder = DummyCDS(start=location.start - 10,
                               end=location.end + 10,
                               locus_tag=name)
        record.add_cds_feature(placeholder)
def test_bad_child(self):
    """Constructing a collection with an unsuitable child must raise AssertionError."""
    # a child collection at 10-50 given to a parent at 20-40
    with self.assertRaises(AssertionError):
        oversized = CDSCollection(FeatureLocation(10, 50), feature_type="test",
                                  child_collections=[])
        CDSCollection(FeatureLocation(20, 40), feature_type="test",
                      child_collections=[oversized])

    # a CDS given in place of a child collection
    with self.assertRaises(AssertionError):
        not_a_collection = DummyCDS(25, 35)
        CDSCollection(FeatureLocation(20, 40), feature_type="test",
                      child_collections=[not_a_collection])
def test_multi_cds_tracking(self):
    """A multigene module is linked to the CDSes of its domains and to no other CDS."""
    module = create_module(domains=[DummyAntismashDomain(locus_tag=tag) for tag in "AB"])
    assert module.is_multigene_module()

    record = DummyRecord()
    add_module_references_to_record(module, record)
    # an extra CDS that holds none of the module's domains
    record.add_cds_feature(DummyCDS(locus_tag="C"))

    # nothing knows about the module before it is added
    for cds in record.get_cds_features():
        assert not cds.modules
    assert not record.get_modules()

    record.add_module(module)

    # make sure it's not added to every CDS
    unrelated = record.get_cds_by_name("C")
    assert not unrelated.modules
    # but that it is added to all CDSes with a domain included
    for tag in "AB":
        assert record.get_cds_by_name(tag).modules == (module, )
def test_limited_add_cds_propagation(self):
    """A CDS at 0-10 added to the region must not reach a subregion located at 20-30."""
    cds = DummyCDS(0, 10)
    # rebuild the region around a subregion placed at 20-30
    self.sub = SubRegion(FeatureLocation(20, 30), "testtool")
    self.region = Region(superclusters=[self.super], subregions=[self.sub])

    # ensure all empty to start with
    for collection in (self.cluster, self.super, self.sub, self.region):
        assert not collection.cds_children
    assert not cds.region

    self.region.add_cds(cds)

    only_cds = (cds, )
    assert self.cluster.cds_children == only_cds
    assert self.super.cds_children == only_cds
    # the subregion is the only collection left without the CDS
    assert not self.sub.cds_children
    assert self.region.cds_children == only_cds
    assert cds.region is self.region
def test_multiple_files(self):
    """With partitions, genes must be split in order across numbered files."""
    self.add_cdses_to_region([DummyCDS(1, i) for i in range(3, 6)])
    for partitions in [2, 3]:
        with TemporaryDirectory(change=True):
            self.index = 0
            # three CDSes were requested per region
            chunk_size = (len(self.regions) * 3) // partitions
            files = core.write_fastas_with_all_genes(self.regions, "test.fasta",
                                                     partitions=partitions)
            assert files == ["test%d.fasta" % i for i in range(partitions)]
            for index in range(partitions):
                filename = "test%d.fasta" % index
                assert os.path.exists(filename)
                print(index, chunk_size)
                # read via a context manager so the handle is closed before
                # the temporary directory is cleaned up
                with open(filename) as handle:
                    contents = handle.read()
                assert contents.count(">") == chunk_size
                # numbering continues from where the previous file stopped
                expected = "".join(
                    ">L{0}\nS{0}\n".format(i + index * chunk_size)
                    for i in range(chunk_size))
                assert contents == expected
def test_adding_invalid_cds(self):
    """Adding a CDS that the region does not contain must raise a ValueError."""
    outsider = DummyCDS(50, 60)
    # sanity check of the fixture: the CDS really is outside the region
    assert not outsider.is_contained_by(self.region)
    with self.assertRaisesRegex(ValueError, "not contained by"):
        self.region.add_cds(outsider)
def test_parents(self):
    """The first parent CDS name of a module must match the name of its domain's CDS."""
    parent = DummyCDS(0, 6, locus_tag="testCDS")
    domain = DummyAntismashDomain(2, 5)
    # tie the domain to the CDS by giving it the same locus tag
    domain.locus_tag = "testCDS"
    module = create_module([domain])
    first_parent = module.parent_cds_names[0]
    assert first_parent == parent.get_name()
class TestBlastParsing(unittest.TestCase):
    """Tests of core.remove_duplicate_hits, core.blastparse and
    core.parse_all_clusters using sample diamond output files.
    """

    def setUp(self):
        self.sample_data = self.read_sample_data()
        self.sample_data_as_lists = self.file_data_to_lists(self.sample_data)

    def parse_subject_wrapper(self, subject_line):
        """Return core.parse_subject's result for the line, with lookups patched."""
        seq_record = Record("dummy")
        seqlengths = {}
        # used by core.parse_subject, but only if locus tag not in seqlengths
        with patch.object(core, 'get_cds_lengths', return_value={}):
            with patch.object(Record, 'get_cds_by_name', return_value=DummyCDS(1, 101)):
                return core.parse_subject(subject_line, seqlengths, seq_record)

    def read_sample_data(self, filename="data/diamond_output_sample.txt"):
        """Return the text of a data file located relative to this test file."""
        # os.path.dirname also copes with a __file__ lacking any separator
        data_path = os.path.join(os.path.dirname(__file__), filename)
        # the context manager ensures the file handle is not leaked
        with open(data_path, "r") as handle:
            return handle.read()

    def file_data_to_lists(self, data):
        """Split tab-separated text into one list of fields per line."""
        return [line.split("\t") for line in data.rstrip().split("\n")]

    def test_unique_pairings_filter(self):
        data = self.file_data_to_lists(self.sample_data)
        sample = core.remove_duplicate_hits(data)
        self.assertEqual(len(sample), len(data))
        # repeating the whole input must not add any hits
        self.assertEqual(sample, core.remove_duplicate_hits(data * 2))

        # test empty
        data = [[], ["a"], ["abc"]]
        results = core.remove_duplicate_hits(data)
        self.assertEqual(results, [])

    def verify_subjects_and_clusters_represented(self, subjects, cluster_name_to_queries):
        """Assert the subjects' gene clusters are exactly the given mapping's keys."""
        subject_clusters = set()
        for subject in subjects:
            self.assertTrue(subject.genecluster in cluster_name_to_queries)
            subject_clusters.add(subject.genecluster)
        self.assertEqual(sorted(subject_clusters), sorted(cluster_name_to_queries))

    # patch decorators apply bottom-up, so the Record mock is the first argument
    @patch.object(core, 'get_cds_lengths', return_value={})
    @patch.object(Record, 'get_cds_by_name', return_value=DummyCDS(1, 101))
    def test_blastparse(self, _mocked_record, _mocked_core):
        queries, clusters = core.blastparse(self.sample_data, Record(), 0, 0)

        # check we process the right number of queries
        self.assertEqual(len(queries),
                         len({i[0] for i in self.sample_data_as_lists}))

        # check we have entries for every gene_cluster we found
        subjects = [
            self.parse_subject_wrapper(i) for i in self.sample_data_as_lists
        ]
        self.verify_subjects_and_clusters_represented(subjects, clusters)

        # test perc_coverage threshold (value arbitrary due to mocking)
        coverage_threshold = 650
        queries, clusters = core.blastparse(self.sample_data, Record(),
                                            coverage_threshold, 0)
        new_subjects = [
            s for s in subjects if s.perc_coverage > coverage_threshold
        ]
        assert new_subjects and len(new_subjects) < len(
            subjects), "coverage test has become meaningless"
        self.verify_subjects_and_clusters_represented(new_subjects, clusters)

        # test perc_identity threshold
        ident_threshold = 35
        queries, clusters = core.blastparse(self.sample_data, Record(), 0,
                                            ident_threshold)
        new_subjects = [s for s in subjects if s.perc_ident > ident_threshold]
        assert new_subjects and len(new_subjects) < len(
            subjects), "identity% test has become meaningless"
        self.verify_subjects_and_clusters_represented(new_subjects, clusters)

        # test combo threshold
        queries, clusters = core.blastparse(self.sample_data, Record(),
                                            coverage_threshold, ident_threshold)
        new_subjects = [
            s for s in subjects
            if s.perc_ident > ident_threshold
            and s.perc_coverage > coverage_threshold
        ]
        assert new_subjects and len(new_subjects) < len(
            subjects), "combo test has become meaningless"
        self.verify_subjects_and_clusters_represented(new_subjects, clusters)

    def test_blastparse_on_empty(self):
        for blast in ["", "\n", "\r\n", "\n\n"]:
            queries, clusters = core.blastparse(blast, Record(), 0, 0)
            self.assertEqual(len(queries), 0)
            self.assertEqual(len(clusters), 0)

    # patch decorators apply bottom-up, so the Record mock is the first argument
    @patch.object(core, 'get_cds_lengths', return_value={})
    @patch.object(Record, 'get_cds_by_name', return_value=DummyCDS(1, 101))
    def test_parse_all_single_cluster(self, _mocked_record, _mocked_core):
        # single cluster to test the thresholds and content
        def parse_all_wrapper(coverage_threshold, ident_threshold):
            clusters_by_number, queries_by_number = core.parse_all_clusters(
                self.sample_data, Record(), coverage_threshold, ident_threshold)
            # make sure we only found one cluster number
            self.assertEqual(len(clusters_by_number), 1)
            self.assertEqual(list(clusters_by_number), [24])
            self.assertEqual(len(queries_by_number), 1)
            self.assertEqual(list(queries_by_number), [24])

            # now test the values of those queries
            queries = queries_by_number[24]
            clusters = clusters_by_number[24]
            return queries, clusters

        queries, clusters = parse_all_wrapper(0, 0)

        # check we process the right number of queries
        self.assertEqual(len(queries),
                         len({i[0] for i in self.sample_data_as_lists}))

        # check we have entries for every gene_cluster we found
        subjects = [
            self.parse_subject_wrapper(i) for i in self.sample_data_as_lists
        ]
        self.verify_subjects_and_clusters_represented(subjects, clusters)

        # test perc_coverage threshold (value arbitrary due to mocking)
        coverage_threshold = 650
        queries, clusters = parse_all_wrapper(coverage_threshold, 0)
        new_subjects = [
            s for s in subjects if s.perc_coverage > coverage_threshold
        ]
        assert new_subjects and len(new_subjects) < len(
            subjects), "coverage test has become meaningless"
        self.verify_subjects_and_clusters_represented(new_subjects, clusters)

        # test perc_identity threshold
        ident_threshold = 35
        queries, clusters = parse_all_wrapper(0, ident_threshold)
        new_subjects = [s for s in subjects if s.perc_ident > ident_threshold]
        assert new_subjects and len(new_subjects) < len(
            subjects), "identity% test has become meaningless"
        self.verify_subjects_and_clusters_represented(new_subjects, clusters)

        # test combo threshold
        queries, clusters = parse_all_wrapper(coverage_threshold,
                                              ident_threshold)
        new_subjects = [
            s for s in subjects
            if s.perc_ident > ident_threshold
            and s.perc_coverage > coverage_threshold
        ]
        assert new_subjects and len(new_subjects) < len(
            subjects), "combo test has become meaningless"
        self.verify_subjects_and_clusters_represented(new_subjects, clusters)

    # patch decorators apply bottom-up, so the Record mock is the first argument
    @patch.object(core, 'get_cds_lengths', return_value={})
    @patch.object(Record, 'get_cds_by_name', return_value=DummyCDS(1, 101))
    def test_parse_all_multi_cluster(self, _mocked_record, _mocked_core):
        # test we partition correctly by cluster number
        sample_data = self.read_sample_data(
            "data/diamond_output_sample_multicluster.txt")
        clusters_by_number, queries_by_number = core.parse_all_clusters(
            sample_data, Record(), 0, 0)
        self.assertEqual(len(clusters_by_number), 3)
        self.assertEqual(sorted(clusters_by_number), [1, 2, 4])
        self.assertEqual(len(queries_by_number), 3)
        self.assertEqual(sorted(queries_by_number), [1, 2, 4])
        # the sample is expected to hold i entries for cluster number i
        for i in [1, 2, 4]:
            self.assertEqual(len(clusters_by_number[i]), i)
            self.assertEqual(len(queries_by_number[i]), i)

    def test_parse_all_empty(self):
        for sample_data in ["", "\n", "\r\n", "\n\n"]:
            clusters, queries = core.parse_all_clusters(
                sample_data, Record(), 0, 0)
            self.assertEqual(len(clusters), 0)
            self.assertEqual(len(queries), 0)
def setUp(self):
    """Install the lookup mocks, then load the sample data as text and as field lists."""
    # used by parse_subject, every sequence will be 100 long
    mock('Record.get_cds_by_name', returns=DummyCDS(1, 101))
    mock('core.get_cds_lengths', returns={})

    raw_data = self.read_sample_data()
    self.sample_data = raw_data
    self.sample_data_as_lists = self.file_data_to_lists(raw_data)
def test_missing_modules(self):
    """Combining must yield nothing when either side has no modules, in either order."""
    empty_info = CDSModuleInfo(DummyCDS(start=50, end=110), [])
    populated_info = CDSModuleInfo(DummyCDS(start=150, end=210), [self.generic_tail])

    # the side without modules comes first
    forward = combine_modules(empty_info, populated_info)
    assert not forward
    # and the side without modules comes second
    backward = combine_modules(populated_info, empty_info)
    assert not backward