def test_genbank(self):
    """ Writes a region to a GenBank file, reads it back, and checks that
        the rebuilt region matches the original (with its location shifted
        by the start of the original region).
    """
    record = Record(Seq("A" * 100, generic_dna))

    clusters = [
        create_cluster(3, 20, "prodA"),
        create_cluster(25, 41, "prodB"),
    ]
    for cluster in clusters:
        record.add_cluster(cluster)

    subregion = SubRegion(FeatureLocation(35, 71), "test", 0.7)
    record.add_subregion(subregion)

    supercluster = SuperCluster(SuperCluster.kinds.NEIGHBOURING, clusters)
    record.add_supercluster(supercluster)

    region = Region(superclusters=[supercluster], subregions=[subregion])
    record.add_region(region)

    # the temporary file only needs to live long enough to be parsed
    with NamedTemporaryFile(suffix=".gbk") as output:
        region.write_to_genbank(output.name)
        parsed = list(seqio.parse(output.name))

    assert len(parsed) == 1
    reloaded = Record.from_biopython(parsed[0], taxon="bacteria")
    assert len(reloaded.get_regions()) == 1

    rebuilt = reloaded.get_region(0)
    # coordinates in the written file are compared against the originals
    # minus the original region's start
    assert rebuilt.location.start == 3 - region.location.start
    assert rebuilt.location.end == 71 - region.location.start
    assert rebuilt.products == region.products
    assert rebuilt.probabilities == region.probabilities
def test_prepeptide_adjustment(self):
    """ Checks that the leader/tail location qualifiers of a prepeptide
        motif are shifted along with the region when it is written out.
    """
    record = Record(Seq("A"*400, generic_dna))
    subregion = DummySubRegion(start=100, end=300)
    record.add_subregion(subregion)
    region = Region(subregions=[subregion])
    record.add_region(region)

    # ensure both FeatureLocation and CompoundLocations are handled appropriately
    leader_loc = FeatureLocation(200, 210, 1)
    tail_loc = CompoundLocation([
        FeatureLocation(220, 223, -1),
        FeatureLocation(227, 230, -1),
    ])
    prepeptide = DummyFeature(200, 230, 1, "CDS_motif")
    prepeptide._qualifiers["leader_location"] = [str(leader_loc)]
    prepeptide._qualifiers["tail_location"] = [str(tail_loc)]
    record.add_feature(prepeptide)

    # and add a CDS_motif without either qualifier (e.g. NRPS/PKS motif)
    # to ensure that doesn't break
    record.add_feature(DummyFeature(250, 280, 1, "CDS_motif"))

    with NamedTemporaryFile(suffix=".gbk") as output:
        region.write_to_genbank(output.name)
        bio = list(seqio.parse(output.name))[0]

    assert len(bio.features) == 4

    adjusted = []
    for feature in bio.features:
        leader = feature.qualifiers.get("leader_location")
        tail = feature.qualifiers.get("tail_location")
        if not (tail and leader):
            continue
        # the part locations should now be adjusted backwards 100 bases
        assert leader == ["[100:110](+)"]
        assert tail == ["join{[120:123](-), [127:130](-)}"]
        adjusted.append(feature)
    assert adjusted, "prepeptide feature missing in conversion"
def test_probabilities(self):
    """ Checks that a region reports the probabilities of its subregions,
        skipping subregions whose probability is None.
    """
    location = FeatureLocation(0, 10)
    candidates = [DummyCandidateCluster([create_protocluster(0, 10)])]

    # with no subregions at all, there are no probabilities to report
    assert Region(candidate_clusters=candidates).probabilities == []

    # each step adds one more subregion and states the expected result
    steps = (
        (None, []),
        (0.1, [0.1]),
        (0.7, [0.1, 0.7]),
    )
    subregions = []
    for probability, expected in steps:
        subregions.append(SubRegion(location, "testtool", probability=probability))
        region = Region(candidate_clusters=candidates, subregions=subregions)
        assert region.probabilities == expected
def test_unique_clusters(self):
    """ Checks that a protocluster shared by two candidate clusters is only
        reported once by Region.get_unique_protoclusters().
    """
    protoclusters = [
        create_protocluster(index, 10, product=product)
        for index, product in enumerate("ABC")
    ]
    kind = CandidateCluster.kinds.INTERLEAVED
    first = CandidateCluster(kind, protoclusters[:2])
    second = CandidateCluster(kind, protoclusters[1:])

    # the middle protocluster must genuinely be shared for the test to mean anything
    shared = protoclusters[1]
    assert shared in first.protoclusters
    assert shared in second.protoclusters

    region = Region(candidate_clusters=[first, second])
    unique = region.get_unique_protoclusters()
    # if the protocluster in both candidates is repeated, there'll be an extra
    assert len(unique) == 3
    assert unique == protoclusters
def test_probabilities(self):
    """ Checks that a region reports the probabilities of its subregions,
        skipping subregions whose probability is None.
    """
    location = FeatureLocation(0, 10)
    superclusters = [
        SuperCluster(SuperCluster.kinds.SINGLE, [create_cluster(0, 10)]),
    ]

    # with no subregions at all, there are no probabilities to report
    assert Region(superclusters=superclusters).probabilities == []

    # each step adds one more subregion and states the expected result
    steps = (
        (None, []),
        (0.1, [0.1]),
        (0.7, [0.1, 0.7]),
    )
    subregions = []
    for probability, expected in steps:
        subregions.append(SubRegion(location, "testtool", probability=probability))
        region = Region(superclusters=superclusters, subregions=subregions)
        assert region.probabilities == expected
def test_unique_clusters(self):
    """ Checks that a cluster shared by two superclusters is only reported
        once by Region.get_unique_clusters().
    """
    clusters = [
        create_cluster(index, 10, product=product)
        for index, product in enumerate("ABC")
    ]
    kind = SuperCluster.kinds.INTERLEAVED
    first = SuperCluster(kind, clusters[:2])
    second = SuperCluster(kind, clusters[1:])

    # the middle cluster must genuinely be shared for the test to mean anything
    shared = clusters[1]
    assert shared in first.clusters
    assert shared in second.clusters

    region = Region(superclusters=[first, second])
    unique = region.get_unique_clusters()
    # if the cluster in both superclusters is repeated, there'll be an extra
    assert len(unique) == 3
    assert unique == clusters
def test_limited_add_cds_propagation(self):
    """ Checks that a CDS added to the region is passed on to the cluster
        and supercluster, but not to a subregion located elsewhere.
    """
    cds = DummyCDS(0, 10)
    # replace the fixture's subregion with one at 20-30, away from the CDS at 0-10
    self.sub = SubRegion(FeatureLocation(20, 30), "testtool")
    self.region = Region(superclusters=[self.super], subregions=[self.sub])

    # ensure all empty to start with
    for collection in (self.cluster, self.super, self.sub, self.region):
        assert not collection.cds_children
    assert not cds.region

    self.region.add_cds(cds)

    expected = (cds, )
    assert self.cluster.cds_children == expected
    assert self.super.cds_children == expected
    assert not self.sub.cds_children
    assert self.region.cds_children == expected
    assert cds.region is self.region
def test_sideloaded(self):
    """ Checks that Region.get_sideloaded_areas() returns exactly the
        sideloaded protocluster and the sideloaded subregion, in that order.
    """
    native_protocluster = create_protocluster(3, 20, "prodA")
    external_protocluster = SideloadedProtocluster(
        FeatureLocation(25, 41), FeatureLocation(25, 41), "external", "prodB",
    )
    candidate = CandidateCluster(
        CandidateCluster.kinds.NEIGHBOURING,
        [native_protocluster, external_protocluster],
    )

    native_subregion = SubRegion(FeatureLocation(35, 71), "test", 0.7)
    external_subregion = SideloadedSubRegion(FeatureLocation(45, 61), "external")

    region = Region(
        candidate_clusters=[candidate],
        subregions=[native_subregion, external_subregion],
    )

    sideloaded = region.get_sideloaded_areas()
    assert len(sideloaded) == 2
    # identity, not just equality: the very same objects must be returned
    assert sideloaded[0] is external_protocluster
    assert sideloaded[1] is external_subregion
def score_against_protoclusters(label: str, region: Region,
                                hits_by_reference: HitsByReference,
                                query_components: Dict[CDSCollection, Components],
                                mode: Mode) -> VariantResults:
    """ Compares each protocluster of the query region with each protocluster
        of every reference region that has hits within the query area

        Arguments:
            label: the name to attach to the results
            region: the query Region
            hits_by_reference: a dictionary mapping ReferenceRecord to
                               a dictionary mapping reference CDS name to Hit
            query_components: a dictionary mapping the region and each contained
                              protocluster to a Components instance with the relevant data
            mode: the Mode in which to run the analysis

        Returns:
            a VariantResults instance
    """
    local_hits = filter_by_query_area(region, hits_by_reference)

    # query protocluster number -> reference region -> reference protocluster -> scorer
    score_matrix: Dict[int, Dict[ReferenceRegion, Dict[
        ReferenceProtocluster, ReferenceScorer]]] = defaultdict(lambda: defaultdict(dict))
    # query protocluster -> reference region -> highest final score seen so far
    reference_best_scores: Dict[Protocluster, Dict[
        ReferenceRegion, float]] = defaultdict(lambda: defaultdict(float))

    for ref_region in local_hits:
        # restrict the hits to this single reference region before narrowing
        # them further to each of its protoclusters
        single_region_hits = {ref_region: local_hits[ref_region]}
        for ref_protocluster in ref_region.protoclusters:
            hits = filter_by_reference_protocluster(ref_protocluster, single_region_hits)
            for protocluster in region.get_unique_protoclusters():
                scorers = score_query_area(protocluster, hits,
                                           query_components[protocluster], mode)
                for scorer in scorers:
                    best_by_reference = reference_best_scores[protocluster]
                    best_by_reference[ref_region] = max(scorer.final_score,
                                                        best_by_reference[ref_region])
                    number = protocluster.get_protocluster_number()
                    score_matrix[number][ref_region][ref_protocluster] = scorer

    # a reference region's total is the sum of its best score for each query protocluster
    reference_total_scores: Dict[ReferenceRegion, float] = defaultdict(float)
    for best_by_reference in reference_best_scores.values():
        for ref_region, best in best_by_reference.items():
            reference_total_scores[ref_region] += best

    ranking = sorted(reference_total_scores.items(),
                     key=lambda pair: pair[1], reverse=True)
    limited_ranking, limited_matrix, best_hits = apply_limits_to_rankings(
        ranking, score_matrix, local_hits)
    return VariantResults(label, limited_ranking,
                          ProtoToProtoScores(limited_matrix), best_hits)
def test_products(self):
    """ Checks the product list and product string for regions built from
        one and from two candidate clusters.
    """
    single = [DummyCandidateCluster([create_protocluster(0, 10)])]
    region = Region(candidate_clusters=single)
    assert region.products == ["a"]
    assert region.get_product_string() == "a"

    # two adjacent ten-base candidates whose products are out of alphabetical order
    candidates = [
        DummyCandidateCluster([
            create_protocluster(index*10, (index+1)*10, product=product),
        ])
        for index, product in enumerate("ba")
    ]
    region = Region(candidate_clusters=candidates)
    assert region.products == ["b", "a"]
    assert region.get_product_string() == "a,b"
def test_products(self):
    """ Checks the product list and product string for regions built from
        one and from two superclusters.
    """
    single = [SuperCluster(SuperCluster.kinds.SINGLE, [create_cluster(0, 10)])]
    region = Region(superclusters=single)
    assert region.products == ["a"]
    assert region.get_product_string() == "a"

    # two adjacent ten-base superclusters whose products are out of alphabetical order
    superclusters = [
        SuperCluster(
            SuperCluster.kinds.SINGLE,
            [create_cluster(index * 10, (index + 1) * 10, product=product)],
        )
        for index, product in enumerate("ba")
    ]
    region = Region(superclusters=superclusters)
    assert region.products == ["b", "a"]
    assert region.get_product_string() == "a-b"
def score_as_protoclusters(label: str, region: Region,
                           hits_by_reference: HitsByReference,
                           query_components: Dict[CDSCollection, Components],
                           mode: Mode) -> VariantResults:
    """ Compares each protocluster of the query region with whole reference regions

        Arguments:
            label: the name to attach to the results
            region: the query Region
            hits_by_reference: a dictionary mapping ReferenceRecord to
                               a dictionary mapping reference CDS name to Hit
            query_components: a dictionary mapping the region and each contained
                              protocluster to a Components instance with the relevant data
            mode: the Mode in which to run the analysis

        Returns:
            a VariantResults instance
    """
    local_hits = filter_by_query_area(region, hits_by_reference)

    # query protocluster number -> reference -> scorer
    scores: Dict[int, Dict[ReferenceRegion, ReferenceScorer]] = defaultdict(dict)
    # reference -> ranking value summed over every query protocluster
    total_scores: Dict[ReferenceRegion, float] = defaultdict(float)

    for protocluster in region.get_unique_protoclusters():
        scorers = score_query_area(protocluster, local_hits,
                                   query_components[protocluster], mode)
        for scorer in scorers:
            reference = scorer.reference
            total_scores[reference] += calculate_protocluster_ranking(scorer)
            scores[protocluster.get_protocluster_number()][reference] = scorer

    ordered = sorted(total_scores.items(), key=lambda pair: pair[1], reverse=True)
    ranking, limited_scores, best_hits = apply_limits_to_rankings(
        ordered, scores, local_hits)
    return VariantResults(label, ranking, ProtoToRegionScores(limited_scores), best_hits)
def test_missing_children(self):
    """ Checks that a Region cannot be built without any children, whether
        the arguments are omitted or given as empty lists.
    """
    childless_arguments = (
        {},
        {"superclusters": [], "subregions": []},
    )
    for kwargs in childless_arguments:
        with self.assertRaisesRegex(ValueError, "at least one"):
            Region(**kwargs)
def test_incorrect_args(self):
    """ Checks that passing a subregion as a supercluster, or a supercluster
        as a subregion, raises an AssertionError.
    """
    swapped_arguments = (
        {"superclusters": [self.sub]},
        {"subregions": [self.super]},
    )
    for kwargs in swapped_arguments:
        with self.assertRaises(AssertionError):
            Region(**kwargs)
def setUp(self):
    """ Builds a region holding one single-cluster supercluster and one
        subregion that shares the cluster's location.
    """
    cluster = create_cluster(0, 10)
    supercluster = SuperCluster(SuperCluster.kinds.SINGLE, [cluster])
    subregion = SubRegion(cluster.location, "testtool")

    self.cluster = cluster
    self.super = supercluster
    self.sub = subregion
    self.region = Region(superclusters=[supercluster], subregions=[subregion])
class TestRegionChildren(unittest.TestCase):
    """ Tests covering how a Region exposes its child superclusters,
        subregions and CDS features.
    """

    def setUp(self):
        """ Builds a region holding one single-cluster supercluster and one
            subregion that shares the cluster's location.
        """
        self.cluster = create_cluster(0, 10)
        self.super = SuperCluster(SuperCluster.kinds.SINGLE, [self.cluster])
        self.sub = SubRegion(self.cluster.location, "testtool")
        self.region = Region(superclusters=[self.super], subregions=[self.sub])

    def test_children_accessible(self):
        """ The children given at construction are exposed as tuples. """
        assert self.region.subregions == (self.sub, )
        assert self.region.superclusters == (self.super, )

    def test_children_immutable(self):
        """ The child containers cannot be reassigned from outside. """
        # Python 3.11 reworded the error raised when assigning to a property
        # without a setter ("property 'x' of 'T' object has no setter"), so
        # both the old and the new wording are accepted here.
        # NOTE(review): assumes these attributes are setter-less properties,
        # as the original expected message implies - confirm against Region.
        read_only = r"can't set attribute|has no setter"
        with self.assertRaisesRegex(AttributeError, read_only):
            self.region.subregions = (self.super, )
        with self.assertRaisesRegex(AttributeError, read_only):
            self.region.superclusters = (self.sub, )
        with self.assertRaisesRegex(AttributeError, read_only):
            self.region.cds_children = []

    def test_incorrect_args(self):
        """ Children passed under the wrong keyword raise an AssertionError. """
        with self.assertRaises(AssertionError):
            Region(superclusters=[self.sub])
        with self.assertRaises(AssertionError):
            Region(subregions=[self.super])

    def test_missing_children(self):
        """ A region needs at least one child, omitted or empty lists fail. """
        with self.assertRaisesRegex(ValueError, "at least one"):
            Region()
        with self.assertRaisesRegex(ValueError, "at least one"):
            Region(superclusters=[], subregions=[])

    def test_add_cds_propagation(self):
        """ A CDS added to the region reaches every child that contains it. """
        cds = DummyCDS(0, 10)
        assert cds.is_contained_by(self.region)
        # ensure all empty to start with
        assert not self.cluster.cds_children
        assert not self.super.cds_children
        assert not self.sub.cds_children
        assert not self.region.cds_children
        assert not cds.region

        self.region.add_cds(cds)
        assert self.cluster.cds_children == (cds, )
        assert self.super.cds_children == (cds, )
        assert self.sub.cds_children == (cds, )
        assert self.region.cds_children == (cds, )
        assert cds.region is self.region

    def test_limited_add_cds_propagation(self):
        """ A CDS added to the region is not passed to a subregion located
            elsewhere.
        """
        cds = DummyCDS(0, 10)
        # replace the fixture's subregion with one at 20-30, away from the CDS at 0-10
        self.sub = SubRegion(FeatureLocation(20, 30), "testtool")
        self.region = Region(superclusters=[self.super], subregions=[self.sub])

        # ensure all empty to start with
        assert not self.cluster.cds_children
        assert not self.super.cds_children
        assert not self.sub.cds_children
        assert not self.region.cds_children
        assert not cds.region

        self.region.add_cds(cds)
        assert self.cluster.cds_children == (cds, )
        assert self.super.cds_children == (cds, )
        assert not self.sub.cds_children
        assert self.region.cds_children == (cds, )
        assert cds.region is self.region

    def test_adding_invalid_cds(self):
        """ Adding a CDS that lies outside the region raises a ValueError. """
        cds = DummyCDS(50, 60)
        assert not cds.is_contained_by(self.region)
        with self.assertRaisesRegex(ValueError, "not contained by"):
            self.region.add_cds(cds)

    def test_unique_clusters(self):
        """ A cluster shared by two superclusters is only reported once. """
        clusters = [
            create_cluster(i, 10, product=prod)
            for i, prod in enumerate("ABC")
        ]
        superclusters = [
            SuperCluster(SuperCluster.kinds.INTERLEAVED, clusters[:2]),
            SuperCluster(SuperCluster.kinds.INTERLEAVED, clusters[1:]),
        ]
        # the middle cluster must genuinely be shared for the test to mean anything
        assert clusters[1] in superclusters[0].clusters
        assert clusters[1] in superclusters[1].clusters

        region = Region(superclusters=superclusters)
        unique_clusters = region.get_unique_clusters()
        # if the cluster in both superclusters is repeated, there'll be an extra
        assert len(unique_clusters) == 3
        assert unique_clusters == clusters
def test_incorrect_args(self):
    """ Checks that passing a subregion as a candidate cluster, or a
        candidate cluster as a subregion, raises an AssertionError.
    """
    swapped_arguments = (
        {"candidate_clusters": [self.sub]},
        {"subregions": [self.candidate]},
    )
    for kwargs in swapped_arguments:
        with self.assertRaises(AssertionError):
            Region(**kwargs)
def setUp(self):
    """ Builds a region holding one candidate cluster wrapping a dummy
        protocluster, plus one subregion sharing the protocluster's location.
    """
    protocluster = DummyProtocluster()
    candidate = DummyCandidateCluster([protocluster])
    subregion = SubRegion(protocluster.location, "testtool")

    self.protocluster = protocluster
    self.candidate = candidate
    self.sub = subregion
    self.region = Region(candidate_clusters=[candidate], subregions=[subregion])