Exemplo n.º 1
0
 def test_genbank(self):
     """ Writes a region to a genbank file, parses the file back into a
         record, and checks the reloaded region against the original
     """
     record = Record(Seq("A" * 100, generic_dna))
     cluster_a = create_cluster(3, 20, "prodA")
     cluster_b = create_cluster(25, 41, "prodB")
     record.add_cluster(cluster_a)
     record.add_cluster(cluster_b)

     sub = SubRegion(FeatureLocation(35, 71), "test", 0.7)
     record.add_subregion(sub)

     neighbours = SuperCluster(SuperCluster.kinds.NEIGHBOURING,
                               [cluster_a, cluster_b])
     record.add_supercluster(neighbours)

     region = Region(superclusters=[neighbours], subregions=[sub])
     record.add_region(region)

     # the file must still exist while it is parsed, so parse inside the block
     with NamedTemporaryFile(suffix=".gbk") as handle:
         path = handle.name
         region.write_to_genbank(path)
         parsed = list(seqio.parse(path))
     assert len(parsed) == 1

     reloaded = Record.from_biopython(parsed[0], taxon="bacteria")
     assert len(reloaded.get_regions()) == 1
     restored = reloaded.get_region(0)

     # expected coordinates are the original ones shifted by the region start
     offset = region.location.start
     assert restored.location.start == 3 - offset
     assert restored.location.end == 71 - offset
     assert restored.products == region.products
     assert restored.probabilities == region.probabilities
Exemplo n.º 2
0
    def test_prepeptide_adjustment(self):
        """ Checks that leader/tail location qualifiers of a CDS_motif are
            shifted along with the region when it is written to genbank
        """
        record = Record(Seq("A"*400, generic_dna))
        sub = DummySubRegion(start=100, end=300)
        record.add_subregion(sub)
        region = Region(subregions=[sub])
        record.add_region(region)

        # one qualifier holds a simple location and the other a compound one,
        # so that both location kinds are exercised
        leader = FeatureLocation(200, 210, 1)
        tail = CompoundLocation([FeatureLocation(220, 223, -1), FeatureLocation(227, 230, -1)])
        prepeptide = DummyFeature(200, 230, 1, "CDS_motif")
        prepeptide._qualifiers["leader_location"] = [str(leader)]
        prepeptide._qualifiers["tail_location"] = [str(tail)]
        record.add_feature(prepeptide)
        # a CDS_motif lacking both qualifiers (e.g. NRPS/PKS motif) must not break anything
        record.add_feature(DummyFeature(250, 280, 1, "CDS_motif"))

        with NamedTemporaryFile(suffix=".gbk") as handle:
            region.write_to_genbank(handle.name)
            parsed = list(seqio.parse(handle.name))[0]
        assert len(parsed.features) == 4

        adjusted = []
        for feature in parsed.features:
            new_leader = feature.qualifiers.get("leader_location")
            new_tail = feature.qualifiers.get("tail_location")
            if not (new_tail and new_leader):
                continue
            # both parts are expected 100 bases earlier than they started
            assert new_leader == ["[100:110](+)"]
            assert new_tail == ["join{[120:123](-), [127:130](-)}"]
            adjusted.append(feature)
        assert adjusted, "prepeptide feature missing in conversion"
Exemplo n.º 3
0
 def test_probabilities(self):
     """ Checks that a region reports the probabilities of its subregions,
         in order, skipping subregions without a probability
     """
     location = FeatureLocation(0, 10)
     candidates = [DummyCandidateCluster([create_protocluster(0, 10)])]
     # with no subregions at all there is nothing to report
     assert Region(candidate_clusters=candidates).probabilities == []

     # each step adds one more subregion and rebuilds the region
     steps = [
         (None, []),
         (0.1, [0.1]),
         (0.7, [0.1, 0.7]),
     ]
     subregions = []
     for probability, expected in steps:
         subregions.append(SubRegion(location, "testtool", probability=probability))
         region = Region(candidate_clusters=candidates, subregions=subregions)
         assert region.probabilities == expected
Exemplo n.º 4
0
 def test_unique_clusters(self):
     """ Checks that a protocluster shared by two candidate clusters is
         only reported once by the region
     """
     protoclusters = []
     for start, product in enumerate("ABC"):
         protoclusters.append(create_protocluster(start, 10, product=product))
     first = CandidateCluster(CandidateCluster.kinds.INTERLEAVED, protoclusters[:2])
     second = CandidateCluster(CandidateCluster.kinds.INTERLEAVED, protoclusters[1:])
     # the middle protocluster has to belong to both candidates
     shared = protoclusters[1]
     assert shared in first.protoclusters
     assert shared in second.protoclusters

     region = Region(candidate_clusters=[first, second])
     unique = region.get_unique_protoclusters()
     # a repeated shared protocluster would show up as a fourth entry
     assert len(unique) == 3
     assert unique == protoclusters
Exemplo n.º 5
0
 def test_probabilities(self):
     """ Checks that a region reports the probabilities of its subregions,
         in order, skipping subregions without a probability
     """
     location = FeatureLocation(0, 10)
     single = SuperCluster(SuperCluster.kinds.SINGLE, [create_cluster(0, 10)])
     supers = [single]
     # with no subregions at all there is nothing to report
     assert Region(superclusters=supers).probabilities == []

     # each step adds one more subregion and rebuilds the region
     steps = [
         (None, []),
         (0.1, [0.1]),
         (0.7, [0.1, 0.7]),
     ]
     subregions = []
     for probability, expected in steps:
         subregions.append(SubRegion(location, "testtool", probability=probability))
         region = Region(superclusters=supers, subregions=subregions)
         assert region.probabilities == expected
Exemplo n.º 6
0
 def test_unique_clusters(self):
     """ Checks that a cluster shared by two superclusters is only
         reported once by the region
     """
     clusters = []
     for start, product in enumerate("ABC"):
         clusters.append(create_cluster(start, 10, product=product))
     first = SuperCluster(SuperCluster.kinds.INTERLEAVED, clusters[:2])
     second = SuperCluster(SuperCluster.kinds.INTERLEAVED, clusters[1:])
     # the middle cluster has to belong to both superclusters
     shared = clusters[1]
     assert shared in first.clusters
     assert shared in second.clusters

     region = Region(superclusters=[first, second])
     unique = region.get_unique_clusters()
     # a repeated shared cluster would show up as a fourth entry
     assert len(unique) == 3
     assert unique == clusters
Exemplo n.º 7
0
    def test_limited_add_cds_propagation(self):
        """ Checks that a CDS added to the region is not handed to a
            subregion located elsewhere (20-30 versus a CDS at 0-10)
        """
        cds = DummyCDS(0, 10)
        # swap the fixture's subregion for one away from the CDS
        distant = SubRegion(FeatureLocation(20, 30), "testtool")
        self.sub = distant
        self.region = Region(superclusters=[self.super], subregions=[distant])

        # nothing may hold a CDS before the addition
        for area in (self.cluster, self.super, self.sub, self.region):
            assert not area.cds_children
        assert not cds.region

        self.region.add_cds(cds)
        only_cds = (cds, )
        assert self.cluster.cds_children == only_cds
        assert self.super.cds_children == only_cds
        assert not self.sub.cds_children
        assert self.region.cds_children == only_cds
        assert cds.region is self.region
Exemplo n.º 8
0
    def test_sideloaded(self):
        """ Checks that only the sideloaded protocluster and the sideloaded
            subregion are returned as the region's sideloaded areas
        """
        native_proto = create_protocluster(3, 20, "prodA")
        external_proto = SideloadedProtocluster(FeatureLocation(25, 41),
                                                FeatureLocation(25, 41),
                                                "external", "prodB")
        candidate = CandidateCluster(CandidateCluster.kinds.NEIGHBOURING,
                                     [native_proto, external_proto])

        native_sub = SubRegion(FeatureLocation(35, 71), "test", 0.7)
        external_sub = SideloadedSubRegion(FeatureLocation(45, 61), "external")

        region = Region(candidate_clusters=[candidate],
                        subregions=[native_sub, external_sub])
        areas = region.get_sideloaded_areas()
        assert len(areas) == 2
        # expected order: the protocluster first, then the subregion
        assert areas[0] is external_proto
        assert areas[1] is external_sub
Exemplo n.º 9
0
def score_against_protoclusters(label: str, region: Region,
                                hits_by_reference: HitsByReference,
                                query_components: Dict[CDSCollection,
                                                       Components],
                                mode: Mode) -> VariantResults:
    """ Compares every unique protocluster of the query region with every
        protocluster of each reference region that has hits in the query area

        Arguments:
            label: the name to attach to the results
            region: the query Region
            hits_by_reference: a dictionary mapping ReferenceRecord to
                                a dictionary mapping reference CDS name to Hit
            query_components: a dictionary mapping the region and each contained protocluster to
                                a Components instance with the relevant data
            mode: the Mode in which to run the analysis

        Returns:
            a VariantResults instance
    """
    # query protocluster number -> reference region -> reference protocluster -> scorer
    scorers: Dict[int, Dict[ReferenceRegion, Dict[
        ReferenceProtocluster,
        ReferenceScorer]]] = defaultdict(lambda: defaultdict(dict))
    # query protocluster -> reference region -> highest final score seen so far
    best_scores: Dict[Protocluster, Dict[
        ReferenceRegion, float]] = defaultdict(lambda: defaultdict(float))

    local_hits = filter_by_query_area(region, hits_by_reference)
    for ref_region in local_hits:
        # restrict the hits to this one reference region before narrowing further
        single_region_hits = {ref_region: local_hits[ref_region]}
        for ref_protocluster in ref_region.protoclusters:
            protocluster_hits = filter_by_reference_protocluster(
                ref_protocluster, single_region_hits)
            for protocluster in region.get_unique_protoclusters():
                components = query_components[protocluster]
                for scorer in score_query_area(protocluster, protocluster_hits,
                                               components, mode):
                    candidate = scorer.final_score
                    previous = best_scores[protocluster][ref_region]
                    best_scores[protocluster][ref_region] = max(candidate, previous)
                    number = protocluster.get_protocluster_number()
                    scorers[number][ref_region][ref_protocluster] = scorer

    # a reference region's total is the sum of the best score each query
    # protocluster reached against it
    totals: Dict[ReferenceRegion, float] = defaultdict(float)
    for per_reference in best_scores.values():
        for ref_region, best in per_reference.items():
            totals[ref_region] += best

    ranking = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)

    ranking, scorers, best_hits = apply_limits_to_rankings(
        ranking, scorers, local_hits)
    return VariantResults(label, ranking, ProtoToProtoScores(scorers),
                          best_hits)
Exemplo n.º 10
0
    def test_products(self):
        """ Checks the product list and the product string of regions
            built from one and from two candidate clusters
        """
        single = Region(candidate_clusters=[DummyCandidateCluster([create_protocluster(0, 10)])])
        assert single.products == ["a"]
        assert single.get_product_string() == "a"

        # two adjacent candidates with products given in reverse alphabetical order
        candidates = [
            DummyCandidateCluster([create_protocluster(index*10, (index+1)*10, product=product)])
            for index, product in enumerate("ba")
        ]
        double = Region(candidate_clusters=candidates)
        assert double.products == ["b", "a"]
        assert double.get_product_string() == "a,b"
Exemplo n.º 11
0
    def test_products(self):
        """ Checks the product list and the product string of regions
            built from one and from two superclusters
        """
        lone = SuperCluster(SuperCluster.kinds.SINGLE, [create_cluster(0, 10)])
        single = Region(superclusters=[lone])
        assert single.products == ["a"]
        assert single.get_product_string() == "a"

        # two adjacent superclusters with products given in reverse alphabetical order
        supers = [
            SuperCluster(SuperCluster.kinds.SINGLE,
                         [create_cluster(index * 10, (index + 1) * 10, product=product)])
            for index, product in enumerate("ba")
        ]
        double = Region(superclusters=supers)
        assert double.products == ["b", "a"]
        assert double.get_product_string() == "a-b"
Exemplo n.º 12
0
def score_as_protoclusters(label: str, region: Region,
                           hits_by_reference: HitsByReference,
                           query_components: Dict[CDSCollection, Components],
                           mode: Mode) -> VariantResults:
    """ Compares every unique protocluster of the query region with
        whole reference regions

        Arguments:
            label: the name to attach to the results
            region: the query Region
            hits_by_reference: a dictionary mapping ReferenceRecord to
                                a dictionary mapping reference CDS name to Hit
            query_components: a dictionary mapping the region and each contained protocluster to
                                a Components instance with the relevant data
            mode: the Mode in which to run the analysis

        Returns:
            a VariantResults instance
    """
    local_hits = filter_by_query_area(region, hits_by_reference)

    # reference region -> ranking value summed over all query protoclusters
    totals: Dict[ReferenceRegion, float] = defaultdict(float)
    # query protocluster number -> reference region -> scorer
    by_number: Dict[int, Dict[ReferenceRegion,
                              ReferenceScorer]] = defaultdict(dict)

    for protocluster in region.get_unique_protoclusters():
        components = query_components[protocluster]
        for scorer in score_query_area(protocluster, local_hits, components,
                                       mode):
            reference = scorer.reference
            totals[reference] += calculate_protocluster_ranking(scorer)
            by_number[protocluster.get_protocluster_number()][reference] = scorer

    ordered = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
    ordered, by_number, best_hits = apply_limits_to_rankings(
        ordered, by_number, local_hits)
    return VariantResults(label, ordered, ProtoToRegionScores(by_number),
                          best_hits)
Exemplo n.º 13
0
 def test_missing_children(self):
     """ Checks that a region built without any children is rejected """
     # first with no arguments at all, then with explicitly empty children
     no_children = ({}, {"superclusters": [], "subregions": []})
     for kwargs in no_children:
         with self.assertRaisesRegex(ValueError, "at least one"):
             Region(**kwargs)
Exemplo n.º 14
0
 def test_incorrect_args(self):
     """ Checks that children passed under the wrong keyword are rejected """
     # a subregion offered as a supercluster
     with self.assertRaises(AssertionError):
         not_superclusters = [self.sub]
         Region(superclusters=not_superclusters)
     # a supercluster offered as a subregion
     with self.assertRaises(AssertionError):
         not_subregions = [self.super]
         Region(subregions=not_subregions)
Exemplo n.º 15
0
 def setUp(self):
     """ Builds a region from one single-cluster supercluster and one
         subregion sharing that cluster's location
     """
     cluster = create_cluster(0, 10)
     supercluster = SuperCluster(SuperCluster.kinds.SINGLE, [cluster])
     subregion = SubRegion(cluster.location, "testtool")

     self.cluster = cluster
     self.super = supercluster
     self.sub = subregion
     self.region = Region(superclusters=[supercluster], subregions=[subregion])
Exemplo n.º 16
0
class TestRegionChildren(unittest.TestCase):
    """ Tests of how a Region exposes its superclusters and subregions and
        of how CDS features added to the region reach those children
    """
    def setUp(self):
        """ Builds a region from one single-cluster supercluster and one
            subregion sharing that cluster's location
        """
        self.cluster = create_cluster(0, 10)
        self.super = SuperCluster(SuperCluster.kinds.SINGLE, [self.cluster])
        self.sub = SubRegion(self.cluster.location, "testtool")
        self.region = Region(superclusters=[self.super], subregions=[self.sub])

    def test_children_accessible(self):
        """ Checks that the children are exposed as tuples """
        assert self.region.subregions == (self.sub, )
        assert self.region.superclusters == (self.super, )

    def test_children_immutable(self):
        """ Checks that the child collections cannot be reassigned """
        # The AttributeError text for a setter-less property depends on the
        # interpreter: "can't set attribute" up to Python 3.10 and
        # "property ... has no setter" from 3.11 on, so both are accepted.
        read_only = "can't set attribute|has no setter"
        with self.assertRaisesRegex(AttributeError, read_only):
            self.region.subregions = (self.super, )
        with self.assertRaisesRegex(AttributeError, read_only):
            self.region.superclusters = (self.sub, )
        with self.assertRaisesRegex(AttributeError, read_only):
            self.region.cds_children = []

    def test_incorrect_args(self):
        """ Checks that children passed under the wrong keyword are rejected """
        with self.assertRaises(AssertionError):
            Region(superclusters=[self.sub])
        with self.assertRaises(AssertionError):
            Region(subregions=[self.super])

    def test_missing_children(self):
        """ Checks that a region built without any children is rejected """
        with self.assertRaisesRegex(ValueError, "at least one"):
            Region()
        with self.assertRaisesRegex(ValueError, "at least one"):
            Region(superclusters=[], subregions=[])

    def test_add_cds_propagation(self):
        """ Checks that a CDS added to the region reaches every child
            when all of them share the CDS location
        """
        cds = DummyCDS(0, 10)
        assert cds.is_contained_by(self.region)
        # ensure all empty to start with
        assert not self.cluster.cds_children
        assert not self.super.cds_children
        assert not self.sub.cds_children
        assert not self.region.cds_children
        assert not cds.region

        self.region.add_cds(cds)
        assert self.cluster.cds_children == (cds, )
        assert self.super.cds_children == (cds, )
        assert self.sub.cds_children == (cds, )
        assert self.region.cds_children == (cds, )
        assert cds.region is self.region

    def test_limited_add_cds_propagation(self):
        """ Checks that a CDS added to the region is not handed to a
            subregion located elsewhere (20-30 versus a CDS at 0-10)
        """
        cds = DummyCDS(0, 10)
        self.sub = SubRegion(FeatureLocation(20, 30), "testtool")
        self.region = Region(superclusters=[self.super], subregions=[self.sub])

        # ensure all empty to start with
        assert not self.cluster.cds_children
        assert not self.super.cds_children
        assert not self.sub.cds_children
        assert not self.region.cds_children
        assert not cds.region

        self.region.add_cds(cds)
        assert self.cluster.cds_children == (cds, )
        assert self.super.cds_children == (cds, )
        assert not self.sub.cds_children
        assert self.region.cds_children == (cds, )
        assert cds.region is self.region

    def test_adding_invalid_cds(self):
        """ Checks that a CDS outside the region is refused """
        cds = DummyCDS(50, 60)
        assert not cds.is_contained_by(self.region)
        with self.assertRaisesRegex(ValueError, "not contained by"):
            self.region.add_cds(cds)

    def test_unique_clusters(self):
        """ Checks that a cluster shared by two superclusters is only
            reported once by the region
        """
        clusters = [
            create_cluster(i, 10, product=prod) for i, prod in enumerate("ABC")
        ]
        superclusters = [
            SuperCluster(SuperCluster.kinds.INTERLEAVED, clusters[:2]),
            SuperCluster(SuperCluster.kinds.INTERLEAVED, clusters[1:])
        ]
        assert clusters[1] in superclusters[0].clusters and clusters[
            1] in superclusters[1].clusters
        region = Region(superclusters=superclusters)
        unique_clusters = region.get_unique_clusters()
        # if the cluster in both superclusters is repeated, there'll be an extra
        assert len(unique_clusters) == 3
        assert unique_clusters == clusters
Exemplo n.º 17
0
 def test_incorrect_args(self):
     """ Checks that children passed under the wrong keyword are rejected """
     # a subregion offered as a candidate cluster
     with self.assertRaises(AssertionError):
         not_candidates = [self.sub]
         Region(candidate_clusters=not_candidates)
     # a candidate cluster offered as a subregion
     with self.assertRaises(AssertionError):
         not_subregions = [self.candidate]
         Region(subregions=not_subregions)
Exemplo n.º 18
0
 def setUp(self):
     """ Builds a region from one dummy candidate cluster and one
         subregion sharing the dummy protocluster's location
     """
     protocluster = DummyProtocluster()
     candidate = DummyCandidateCluster([protocluster])
     subregion = SubRegion(protocluster.location, "testtool")

     self.protocluster = protocluster
     self.candidate = candidate
     self.sub = subregion
     self.region = Region(candidate_clusters=[candidate], subregions=[subregion])