Пример #1
0
    def test_hybrid_interactions(self):
        """ A chemical hybrid overlapped by another protocluster should give an
            interleaved candidate along with the chemical hybrid candidate
        """
        first = create_cluster(3, 8, 171, 176, "a")
        second = create_cluster(3, 8, 50, 55, "b")
        # will form part of hybrid
        contained = create_cluster(80, 90, 100, 110, "c")

        # a single CDS carrying both products is added to both protoclusters
        shared_cds = create_cds(8, 50, ["a", "b"])
        first.add_cds(shared_cds)
        second.add_cds(shared_cds)

        # both variants are built up front, before any candidates are created
        variants = (
            create_cluster(120, 130, 200, 250, "d"),
            create_cluster(60, 70, 200, 250, "d"),
        )
        for extra in variants:
            candidates = creator([first, second, contained, extra])
            assert len(candidates) == 2

            interleaved = candidates[0]
            assert interleaved.location == FeatureLocation(3, 250)
            assert interleaved.kind == CandidateCluster.kinds.INTERLEAVED
            assert interleaved.protoclusters == tuple(
                sorted([first, second, contained, extra]))

            chemical = candidates[1]
            assert chemical.location == FeatureLocation(3, 176)
            assert chemical.kind == CandidateCluster.kinds.CHEMICAL_HYBRID
            assert chemical.protoclusters == (first, second, contained)
Пример #2
0
    def test_prepeptide_adjustment(self):
        """ Leader and tail location qualifiers of a prepeptide should be
            shifted when the containing region is written to a genbank file
        """
        record = Record(Seq("A"*400, generic_dna))
        subregion = DummySubRegion(start=100, end=300)
        record.add_subregion(subregion)
        region = Region(subregions=[subregion])
        record.add_region(region)

        prepeptide = DummyFeature(200, 230, 1, "CDS_motif")
        # use one FeatureLocation and one CompoundLocation so both forms are covered
        leader_part = FeatureLocation(200, 210, 1)
        tail_part = CompoundLocation([FeatureLocation(220, 223, -1), FeatureLocation(227, 230, -1)])
        prepeptide._qualifiers["leader_location"] = [str(leader_part)]
        prepeptide._qualifiers["tail_location"] = [str(tail_part)]
        record.add_feature(prepeptide)
        # a CDS_motif with neither qualifier (e.g. NRPS/PKS motif) must not break the conversion
        record.add_feature(DummyFeature(250, 280, 1, "CDS_motif"))

        with NamedTemporaryFile(suffix=".gbk") as handle:
            region.write_to_genbank(handle.name)
            converted = list(seqio.parse(handle.name))[0]
        assert len(converted.features) == 4

        seen_prepeptide = False
        for feature in converted.features:
            qualifiers = feature.qualifiers
            tail = qualifiers.get("tail_location")
            leader = qualifiers.get("leader_location")
            if not (tail and leader):
                continue
            # the part locations should now be adjusted backwards 100 bases
            assert leader == ["[100:110](+)"]
            assert tail == ["join{[120:123](-), [127:130](-)}"]
            seen_prepeptide = True
        assert seen_prepeptide, "prepeptide feature missing in conversion"
    def test_conversion(self):
        """ A sideloaded protocluster should survive a round trip through
            biopython features, via either class's from_biopython()
        """
        extras = {"a": ["5", "c"], "b": ["something"]}
        source = SideloadedProtocluster(FeatureLocation(8, 71, strand=1),
                                        FeatureLocation(3, 76, strand=1),
                                        "tool name", "some-product",
                                        extra_qualifiers=extras)
        assert source.neighbourhood_range == 5

        bio_features = source.to_biopython()
        assert len(bio_features) == 2
        first_feature = bio_features[0]
        for name, values in extras.items():
            assert first_feature.qualifiers[name] == values

        for regenerator in (SideloadedProtocluster, Protocluster):
            rebuilt = regenerator.from_biopython(first_feature)
            assert isinstance(rebuilt, SideloadedProtocluster)
            assert rebuilt.extra_qualifiers == source.extra_qualifiers
            assert source.extra_qualifiers == extras
            assert rebuilt.tool == source.tool
            assert rebuilt.product == source.product
            assert rebuilt.location == source.location
            assert rebuilt.core_location == source.core_location
            assert rebuilt.neighbourhood_range == source.neighbourhood_range

            # the extras should not be reachable as regular qualifiers
            for name in extras:
                assert not rebuilt.get_qualifier(name)
Пример #4
0
def create_cluster(start, end, product='a'):
    """ Builds a Cluster for testing, with core and surrounding locations
        covering the same start and end
    """
    core = FeatureLocation(start, end)
    surrounds = FeatureLocation(start, end)
    return Cluster(core, surrounds, tool="testing", product=product,
                   cutoff=1, neighbourhood_range=0,
                   detection_rule="some rule text")
Пример #5
0
def create_cluster():
    """ Builds a fixed Cluster for testing, with a core of 8 to 71 and
        surrounds of 3 to 76, both on the forward strand
    """
    core = FeatureLocation(8, 71, strand=1)
    surrounds = FeatureLocation(3, 76, strand=1)
    return Cluster(core, surrounds, tool="test", cutoff=17,
                   neighbourhood_range=5, product='a',
                   detection_rule="some rule text")
Пример #6
0
def create_cluster(n_start, start, end, n_end, product='a'):
    """ Builds a Cluster for testing and adds a CDS that covers the core

        Arguments:
            n_start: the start of the surrounding location
            start: the start of the core location
            end: the end of the core location
            n_end: the end of the surrounding location
            product: the product of both the cluster and the added CDS

        Returns:
            the new Cluster
    """
    core = FeatureLocation(start, end)
    surrounds = FeatureLocation(n_start, n_end)
    built = Cluster(core, surrounds, tool="testing", product=product,
                    cutoff=1, neighbourhood_range=0,
                    detection_rule="some rule text")
    built.add_cds(create_cds(start, end, [product]))
    return built
Пример #7
0
    def test_edge_overlap_before(self):
        """ A CDS ending at 10000 should give a distance of -1, while one ending
            a single base later should give 39999 on either strand
        """
        def distance():
            """ The distance reported for the query with the current CDS location """
            return utils.distance_to_pfam(self.record, self.query, ["l.edge"])

        cds = self.create_cds(9000, 10000, profiles=["l.edge"])
        self.record.add_cds_feature(cds)
        assert distance() == -1

        # extend the CDS by one base, first forward then reverse strand
        for strand in (1, -1):
            cds.location = FeatureLocation(9000, 10001, strand=strand)
            assert distance() == 39999
Пример #8
0
 def test_construction(self):
     """ Values given to the constructor should be available as attributes """
     location = FeatureLocation(1, 15, 1)
     within_protein = FeatureLocation(0, 3)
     built = Domain(location, "test_type", tool="test",
                    protein_location=within_protein, locus_tag="locus")

     assert built.type == "test_type"
     assert built.location == location
     assert built.created_by_antismash
     assert built.tool == "test"
     # no domain name was given at construction
     assert built.domain is None
     assert built.protein_location == within_protein
Пример #9
0
    def test_interleaving(self):
        """ Two overlapping chemical hybrids and a lone protocluster should
            form one interleaved candidate, with a further hybrid kept apart
        """
        def share_cds(members, start, end, products):
            """ Creates one CDS and adds it to every member """
            shared = create_cds(start, end, products)
            for member in members:
                member.add_cds(shared)

        # these first two hybrid clumps should be interleaved
        first_pair = [
            create_cluster(30, 60, 120, 150, "a"),
            create_cluster(60, 90, 150, 180, "b"),
        ]
        share_cds(first_pair, 90, 120, ["a", "b"])

        second_pair = [
            create_cluster(90, 120, 250, 280, "c"),
            create_cluster(190, 220, 280, 310, "d"),
        ]
        share_cds(second_pair, 220, 250, ["c", "d"])

        # this non-hybrid should also be included in the interleaved
        lone = create_cluster(230, 250, 410, 430, "e")
        # this hybrid should not
        separate_pair = [
            create_cluster(1000, 1100, 1400, 1500, "f"),
            create_cluster(1100, 1200, 1500, 1600, "g"),
        ]
        share_cds(separate_pair, 1300, 1400, ["f", "g"])

        interleaved_members = first_pair + second_pair + [lone]
        created = creator(interleaved_members + separate_pair)
        assert len(created) == 4

        interleaved = created[0]
        assert interleaved.location == FeatureLocation(30, 430)
        assert interleaved.core_location == FeatureLocation(60, 410)
        assert interleaved.kind == CandidateCluster.kinds.INTERLEAVED
        assert interleaved.protoclusters == tuple(interleaved_members)

        first_hybrid = created[1]
        assert first_hybrid.location == FeatureLocation(30, 180)
        assert first_hybrid.protoclusters == tuple(first_pair)
        second_hybrid = created[2]
        assert second_hybrid.location == FeatureLocation(90, 310)
        assert second_hybrid.protoclusters == tuple(second_pair)
        assert first_hybrid.kind == CandidateCluster.kinds.CHEMICAL_HYBRID
        assert second_hybrid.kind == CandidateCluster.kinds.CHEMICAL_HYBRID

        separate = created[3]
        assert separate.location == FeatureLocation(1000, 1600)
        assert separate.kind == CandidateCluster.kinds.CHEMICAL_HYBRID
Пример #10
0
 def test_interleaving_order(self):
     """ Candidates should come out in the expected order when a large
         protocluster sorts between two smaller ones
     """
     small_first = create_cluster(1000, 1100, 1400, 1500, "a")
     # sorts second due to neighbouring
     large = create_cluster(1050, 2000, 3000, 4000, "b")
     small_last = create_cluster(1100, 1200, 1500, 1600, "c")
     clusters = [small_first, large, small_last]
     # the input must already be in sorted order for the test to be meaningful
     assert sorted(clusters) == clusters

     created = creator(clusters)
     assert len(created) == 3

     kinds = CandidateCluster.kinds
     expected = [
         (kinds.NEIGHBOURING, FeatureLocation(1000, 4000)),
         (kinds.INTERLEAVED, FeatureLocation(1000, 1600)),
         (kinds.SINGLE, FeatureLocation(1050, 4000)),
     ]
     for index, (kind, location) in enumerate(expected):
         assert created[index].kind == kind
         assert created[index].location == location
Пример #11
0
def create_subregions(anchor: str, cluster_preds: List[ClusterPrediction],
                      record: Record) -> List[SubRegion]:
    """ Builds a SubRegion for each cluster prediction of an anchor gene

        Arguments:
            anchor: the name of the anchor gene the predictions were made for
            cluster_preds: the predictions to convert, the first of which is
                           noted as the best prediction
            record: the record containing the genes named by the predictions

        Returns:
            a list of SubRegions, one for each prediction and in the same order
    """
    if not cluster_preds:
        return []

    subregions = []  # type: List[SubRegion]
    for index, prediction in enumerate(cluster_preds):
        # clusters returned by hmmdetect are based on CDS features
        # in contrast, subregions returned by cassis are based on gene features
        # --> hmmdetect derived clusters have exact locations, like the CDSs have
        # --> cassis derived subregions may have fuzzy locations, like the genes have
        first = prediction.start
        last = prediction.end
        left_name = first.gene
        right_name = last.gene

        # find the gene features matching the names at each border
        left = None
        right = None
        for gene in record.get_genes():
            name = gene.get_name()
            if name == left_name:
                left = gene
            if name == right_name:
                right = gene
            if left and right:
                break
        assert left and right, "boundary genes no longer present in Record"

        bio_feature = SeqFeature(FeatureLocation(left.location.start,
                                                 right.location.end),
                                 type="subregion")
        qualifiers = {
            "aStool": ["cassis"],
            "anchor": [anchor],
            "abundance": [first.abundance + last.abundance],
            "motif_score": ["{:.1e}".format(first.score + last.score)],
            "gene_left": [first.gene],
            "promoter_left": [first.promoter],
            "abundance_left": [first.abundance],
            "motif_left": [first.pairing_string],
            "motif_score_left": ["{:.1e}".format(first.score)],
            "gene_right": [last.gene],
            "promoter_right": [last.promoter],
            "abundance_right": [last.abundance],
            "motif_right": [last.pairing_string],
            "motif_score_right": ["{:.1e}".format(last.score)],
            "genes": [prediction.genes],
            "promoters": [prediction.promoters],
        }
        bio_feature.qualifiers = qualifiers

        # only the first prediction is labelled as the best one
        if index == 0:
            note = "best prediction (most abundant) for anchor gene {}".format(anchor)
        else:
            note = "alternative prediction ({}) for anchor gene {}".format(index, anchor)
        qualifiers["note"] = [note]

        subregions.append(SubRegion.from_biopython(bio_feature))
    return subregions
Пример #12
0
    def test_add_to_record(self):
        """ Gene ontologies should only be attached to PFAM domains once the
            results are added to the record
        """
        record = record_processing.parse_input_sequence(helpers.get_path_to_nisin_genbank())[0]
        assert not record.get_pfam_domains()

        # add a test PFAM
        test_pfam = PFAMDomain(FeatureLocation(2, 5), description="test",
                               protein_start=5, protein_end=10,
                               identifier="PF00005", domain="PF00005",
                               tool="test")
        test_pfam.domain_id = "test"
        record.add_pfam_domain(test_pfam)
        assert len(record.get_pfam_domains()) == 1

        # run pfam2go, which should find ontologies but not yet annotate
        results = pfam2go.run_on_record(record, None, self.options)
        assert test_pfam in results.pfam_domains_with_gos
        assert not test_pfam.gene_ontologies

        # adding the results is what annotates the domain
        results.add_to_record(record)
        assert test_pfam.gene_ontologies

        # check the contents of the annotation
        for annotated in record.get_pfam_domains():
            ontologies = annotated.gene_ontologies
            assert ontologies
            expected_ids = sorted(results.get_all_gos(annotated))
            assert sorted(annotated.gene_ontologies.ids) == expected_ids
Пример #13
0
    def test_angstrom(self):
        """ Checks the 34 amino acid signature extracted for the "query" domain """
        query = AntismashDomain(FeatureLocation(1, 2), "test")
        query.domain_id = "query"
        # the stored alignment contains gaps, which a translation cannot
        query.translation = self.aligns[query.domain_id].replace("-", "")

        signature = nrps_predictor.get_34_aa_signature(query)
        assert signature == "L--SFDASLFEMYLLTGGDRNMYGPTEATMCATW"
Пример #14
0
 def test_genbank(self):
     """ A region written to genbank should be recoverable from that file,
         with its location offset by the original region's start
     """
     record = Record(Seq("A" * 100, generic_dna))
     clusters = [
         create_cluster(3, 20, "prodA"),
         create_cluster(25, 41, "prodB"),
     ]
     for member in clusters:
         record.add_cluster(member)
     subregion = SubRegion(FeatureLocation(35, 71), "test", 0.7)
     record.add_subregion(subregion)
     supercluster = SuperCluster(SuperCluster.kinds.NEIGHBOURING, clusters)
     record.add_supercluster(supercluster)
     region = Region(superclusters=[supercluster], subregions=[subregion])
     record.add_region(region)

     # parse the file while the temporary file still exists
     with NamedTemporaryFile(suffix=".gbk") as handle:
         region.write_to_genbank(handle.name)
         parsed = list(seqio.parse(handle.name))
     assert len(parsed) == 1

     rebuilt_record = Record.from_biopython(parsed[0], taxon="bacteria")
     assert len(rebuilt_record.get_regions()) == 1
     rebuilt = rebuilt_record.get_region(0)
     assert rebuilt.location.start == 3 - region.location.start
     assert rebuilt.location.end == 71 - region.location.start
     assert rebuilt.products == region.products
     assert rebuilt.probabilities == region.probabilities
Пример #15
0
def apply_cluster_rules(record: Record, results_by_id: Dict[str, List[HSP]],
                        rules: List[rule_parser.DetectionRule]
                        ) -> Tuple[Dict[str, Dict[str, Set[str]]],
                                   Dict[str, Set[str]]]:
    """
        Run detection rules over each CDS with hits and record which rules
        each CDS satisfies. A CDS can satisfy multiple rules. If so, each
        satisfied rule is recorded separately under the name of that rule.

        A rule only counts as satisfied for a CDS if the rule is met and
        has at least one matching domain.

        Args:
            record: the record being checked
            results_by_id: A dict of CDS ID to a list of HSP results
            rules: A list of DetectionRule instances

        Returns:
            A tuple of
                a dictionary mapping CDS ID to
                    a dictionary mapping rule name to
                        a set of domains used to satisfy the rule
                and a dictionary mapping rule name to
                    a set of CDS feature names that matched the rule
    """
    if not results_by_id:
        return {}, {}

    # look up each CDS only once, then process them in order of start position
    features = {cds_name: record.get_cds_by_name(cds_name) for cds_name in results_by_id}
    cds_with_hits = sorted(features, key=lambda cds_name: features[cds_name].location.start)

    cds_domains_by_cluster_type = {}  # type: Dict[str, Dict[str, Set[str]]]
    cluster_type_hits = defaultdict(set)  # type: Dict[str, Set[str]]
    for cds_name in cds_with_hits:
        feature = features[cds_name]
        feature_start, feature_end = sorted([feature.location.start, feature.location.end])
        # the neighbourhood depends only on the cutoff, so rules with the same cutoff share it
        info_by_range = {}  # type: Dict[int, Tuple[Dict[str, CDSFeature], Dict[str, List[HSP]]]]
        domains_by_cluster = {}  # type: Dict[str, Set[str]]
        for rule in rules:
            if rule.cutoff not in info_by_range:
                location = FeatureLocation(feature_start - rule.cutoff, feature_end + rule.cutoff)
                nearby = record.get_cds_features_within_location(location, with_overlapping=True)
                nearby_features = {neighbour.get_name(): neighbour for neighbour in nearby}
                nearby_results = {neighbour: results_by_id[neighbour]
                                  for neighbour in nearby_features if neighbour in results_by_id}
                info_by_range[rule.cutoff] = (nearby_features, nearby_results)
            nearby_features, nearby_results = info_by_range[rule.cutoff]
            matching = rule.detect(cds_name, nearby_features, nearby_results)
            if matching.met and matching.matches:
                domains_by_cluster[rule.name] = matching.matches
                cluster_type_hits[rule.name].add(cds_name)
        # a CDS that satisfied no rules is left out of the results entirely
        if domains_by_cluster:
            cds_domains_by_cluster_type[cds_name] = domains_by_cluster
    return cds_domains_by_cluster_type, cluster_type_hits
Пример #16
0
 def setUp(self):
     """ Prepares a cluster with a core of 30 to 50, along with CDSs positioned
         at 40-45, 20-25 and 120-125
     """
     cluster = create_cluster()
     cluster.core_location = FeatureLocation(30, 50)
     self.cluster = cluster

     self.inside_cds = DummyCDS(40, 45)
     self.neighbour_cds = DummyCDS(20, 25)
     self.outside_cds = DummyCDS(120, 125)

     # a fresh cluster must not hold any CDS features yet
     assert not cluster.cds_children
     assert not cluster.definition_cdses
Пример #17
0
 def test_construction(self):
     """ Values given to the constructor should be available as attributes """
     location = FeatureLocation(1, 15, 1)
     built = Domain(location, "test_type", tool="test")

     assert built.type == "test_type"
     assert built.location == location
     assert built.created_by_antismash
     assert built.tool == "test"
     # no domain name was given at construction
     assert built.domain is None
Пример #18
0
 def test_probabilities(self):
     """ A region's probabilities should list only the probabilities of
         subregions that have one, in the order the subregions were given
     """
     location = FeatureLocation(0, 10)
     candidates = [DummyCandidateCluster([create_protocluster(0, 10)])]
     # with no subregions at all, there are no probabilities
     assert Region(candidate_clusters=candidates).probabilities == []

     # each step adds one more subregion to the same list before rebuilding the region
     steps = [
         (None, []),
         (0.1, [0.1]),
         (0.7, [0.1, 0.7]),
     ]
     subs = []
     for probability, expected in steps:
         subs.append(SubRegion(location, "testtool", probability=probability))
         region = Region(candidate_clusters=candidates, subregions=subs)
         assert region.probabilities == expected
Пример #19
0
 def test_core(self):
     """ Checks the core location of a candidate built from two protoclusters """
     first = create_cluster(5, 10, 20, 25, "a")
     second = create_cluster(30, 40, 50, 60, "b")
     candidate = CandidateCluster(CandidateClusterKind.NEIGHBOURING,
                                  [first, second],
                                  smiles="dummy", polymer="dummy")
     expected = FeatureLocation(10, 50)
     assert candidate.core_location == expected
Пример #20
0
 def __init__(self, name=None, function="other", components=None, location=None, start=None, strand=1):
     """ Fills in defaults for any argument not given, then defers to the
         parent constructor

         Arguments:
             name: the name to use, if not given a new numbered name is generated
             function: passed on to the parent unchanged
             components: if not given, a dictionary with empty "secmet" and
                         "modules" lists is used
             location: the location to use, if not given one of length 20
                       is built from start and strand
             start: the start of a generated location, defaulting to 20
                    and ignored if a location is given
             strand: the strand of a generated location
     """
     if name is None:
         # the class-level counter keeps generated names distinct
         DummyReferenceCDS.counter += 1
         name = f"test_ref_{DummyReferenceCDS.counter}"
     if components is None:
         components = {"secmet":[], "modules":[]}
     if location is None:
         begin = 20 if start is None else start
         location = FeatureLocation(begin, begin + 20, strand)
     super().__init__(name, function, components, location)
Пример #21
0
    def test_translation(self):
        """ Translations should be unreadable until set, and reject values
            with stop codons, non-string values and empty strings
        """
        subject = Domain(FeatureLocation(1, 15, 1), "test_type", tool="test",
                         protein_location=FeatureLocation(0, 3),
                         locus_tag="locus")

        # reading the translation before it has been set is an error
        with self.assertRaisesRegex(ValueError, "has no translation"):
            assert subject.translation is None

        subject.translation = "AAA"
        assert subject.translation == "AAA"

        with self.assertRaisesRegex(ValueError, "stop codons"):
            subject.translation = "A*A"

        # values that are not strings fail an assertion instead
        for invalid in (7, None, Domain):
            with self.assertRaises(AssertionError):
                subject.translation = invalid

        with self.assertRaisesRegex(ValueError, "empty"):
            subject.translation = ""
Пример #22
0
    def test_sideloaded(self):
        """ Only the sideloaded protocluster and the sideloaded subregion
            should be reported as sideloaded areas, in that order
        """
        native_proto = create_protocluster(3, 20, "prodA")
        external_proto = SideloadedProtocluster(FeatureLocation(25, 41),
                                                FeatureLocation(25, 41),
                                                "external", "prodB")
        candidate = CandidateCluster(CandidateCluster.kinds.NEIGHBOURING,
                                     [native_proto, external_proto])

        native_sub = SubRegion(FeatureLocation(35, 71), "test", 0.7)
        external_sub = SideloadedSubRegion(FeatureLocation(45, 61), "external")

        region = Region(candidate_clusters=[candidate],
                        subregions=[native_sub, external_sub])
        areas = region.get_sideloaded_areas()
        assert len(areas) == 2
        # identity matters here, not just equality
        assert areas[0] is external_proto
        assert areas[1] is external_sub
Пример #23
0
    def test_creation_mixed(self):
        """ A mix of hybrid, overlapping, neighbouring and isolated clusters
            should give one supercluster of each expected kind, in order
        """
        base = create_cluster(3, 8, 71, 76, 'a')
        hybrid = create_cluster(50, 60, 120, 170, 'b')
        overlap = create_cluster(80, 90, 130, 180, 'o')
        neighbour = create_cluster(50, 210, 260, 270, 'a')
        isolated = create_cluster(450, 500, 550, 600, 'alone')

        # insert the cds that will cause the hybrid call
        shared_cds = create_cds(60, 65, ["a", "b"])
        for member in (base, hybrid):
            member.add_cds(shared_cds)

        created = creator([base, hybrid, overlap, neighbour, isolated])
        print(created)
        assert len(created) == 5

        kinds = SuperCluster.kinds
        expected = [
            (FeatureLocation(3, 270), kinds.NEIGHBOURING, (base, hybrid, overlap, neighbour)),
            (FeatureLocation(3, 180), kinds.INTERLEAVED, (base, hybrid, overlap)),
            (FeatureLocation(3, 170), kinds.CHEMICAL_HYBRID, (base, hybrid)),
            (FeatureLocation(50, 270), kinds.SINGLE, (neighbour, )),
            (FeatureLocation(450, 600), kinds.SINGLE, (isolated, )),
        ]
        for index, (location, kind, members) in enumerate(expected):
            supercluster = created[index]
            assert supercluster.location == location
            assert supercluster.kind == kind
            assert supercluster.clusters == members
Пример #24
0
 def test_creation_neighbours(self):
     """ Two clusters sharing no CDS should give a neighbouring supercluster
         over both, followed by a single supercluster for each
     """
     first = create_cluster(3, 8, 71, 76, 'a')
     second = create_cluster(50, 100, 120, 170, 'b')
     created = creator([first, second])
     print(created)
     assert len(created) == 3

     kinds = SuperCluster.kinds
     combined = FeatureLocation(first.location.start, second.location.end)
     # each kind is checked before the matching location
     assert created[0].kind == kinds.NEIGHBOURING
     assert created[0].location == combined
     assert created[1].kind == kinds.SINGLE
     assert created[1].location == first.location
     assert created[2].kind == kinds.SINGLE
     assert created[2].location == second.location
Пример #25
0
    def test_creation_coreoverlap(self):
        """ Clusters sharing a CDS with the product of only one of them
            should give a single interleaved supercluster
        """
        first = create_cluster(3, 8, 71, 76, 'a')
        second = create_cluster(50, 60, 120, 170, 'b')
        # create a CDS within both clusters that has a product from only one cluster
        shared_cds = create_cds(60, 65, ["a"])
        for member in (first, second):
            member.add_cds(shared_cds)

        created = creator([first, second])
        print(created)
        assert len(created) == 1
        only = created[0]
        assert only.kind == SuperCluster.kinds.INTERLEAVED
        assert only.location == FeatureLocation(3, 170)
 def test_product(self):
     """ Malformed product names should be rejected at construction """
     location = FeatureLocation(1, 6, strand=1)
     invalid_products = (
         "-", "-like", "NRPS-", "NRPS PKS", "NRPS/PKS", "NRPS,PKS",
         "NRPS.PKS",
     )
     for product in invalid_products:
         with self.assertRaisesRegex(ValueError, "invalid protocluster product"):
             # the same location serves as both core and surrounds
             Protocluster(location, location, tool="test", cutoff=17,
                          neighbourhood_range=5, product=product,
                          detection_rule="some rule text")
Пример #27
0
    def test_creation_hybrid(self):
        """ Clusters sharing a CDS with the products of both should give a
            single chemical hybrid supercluster
        """
        first = create_cluster(3, 8, 71, 76, 'a')
        second = create_cluster(50, 60, 120, 170, 'b')

        # insert the cds that will cause the hybrid call
        shared_cds = create_cds(60, 65, ["a", "b"])
        for member in (first, second):
            member.add_cds(shared_cds)

        created = creator([first, second])
        print(created)
        assert len(created) == 1
        only = created[0]
        assert only.kind == SuperCluster.kinds.CHEMICAL_HYBRID
        assert only.location == FeatureLocation(3, 170)
Пример #28
0
 def test_probabilities(self):
     """ A region's probabilities should list only the probabilities of
         subregions that have one, in the order the subregions were given
     """
     location = FeatureLocation(0, 10)
     supers = [
         SuperCluster(SuperCluster.kinds.SINGLE, [create_cluster(0, 10)]),
     ]
     # with no subregions at all, there are no probabilities
     assert Region(superclusters=supers).probabilities == []

     # each step adds one more subregion to the same list before rebuilding the region
     steps = [
         (None, []),
         (0.1, [0.1]),
         (0.7, [0.1, 0.7]),
     ]
     subs = []
     for probability, expected in steps:
         subs.append(SubRegion(location, "testtool", probability=probability))
         region = Region(superclusters=supers, subregions=subs)
         assert region.probabilities == expected
Пример #29
0
    def test_limited_add_cds_propagation(self):
        """ A CDS at 0-10 added to the region should reach the supercluster and
            cluster, but not the subregion at 20-30
        """
        cds = DummyCDS(0, 10)
        self.sub = SubRegion(FeatureLocation(20, 30), "testtool")
        self.region = Region(superclusters=[self.super], subregions=[self.sub])

        # ensure all empty to start with
        for container in (self.cluster, self.super, self.sub, self.region):
            assert not container.cds_children
        assert not cds.region

        self.region.add_cds(cds)

        for container in (self.cluster, self.super, self.region):
            assert container.cds_children == (cds, )
        # the subregion must have been skipped
        assert not self.sub.cds_children
        assert cds.region is self.region
Пример #30
0
def find_clusters(record: Record, cds_by_cluster_type: Dict[str, Set[str]],
                  rules_by_name: Dict[str, rule_parser.DetectionRule]) -> List[Cluster]:
    """ Builds clusters from the core CDS features found for each rule.

        For each cluster type, the core CDSs are walked in sorted order and
        merged into one core for as long as the next CDS overlaps the current
        core padded by the cutoff of the rule. Each core then becomes a Cluster
        with surrounds reaching the extent of the rule on either side, limited
        to the bounds of the record.

        Arguments:
            record: the record containing the CDS features
            cds_by_cluster_type: a dictionary mapping rule name to
                                 the names of the CDS features matching that rule
            rules_by_name: a dictionary mapping rule name to the rule itself

        Returns:
            the list of clusters as returned by remove_redundant_clusters()
    """
    def neighbourhood(core, extent):
        """ The core padded by extent on both sides, kept within the record """
        return FeatureLocation(max(0, core.start - extent),
                               min(core.end + extent, len(record)))

    def build(core, surrounds, rule, product):
        """ Constructs the Cluster for a finished core """
        return Cluster(core, surrounding_location=surrounds,
                       tool="rule-based-clusters", cutoff=rule.cutoff,
                       neighbourhood_range=rule.extent, product=product,
                       detection_rule=str(rule.conditions))

    clusters = []  # type: List[Cluster]
    features_by_name = record.get_cds_name_mapping()

    for product, names in cds_by_cluster_type.items():
        cores = sorted([features_by_name[name] for name in names])
        rule = rules_by_name[product]
        cutoff = rule.cutoff
        current = cores[0].location
        for cds in cores[1:]:
            reach = FeatureLocation(current.start - cutoff, current.end + cutoff)
            if not cds.overlaps_with(reach):
                # too far from the current core, so create the previous
                # cluster and start a new location
                surrounds = neighbourhood(current, rule.extent)
                within = record.get_cds_features_within_location(surrounds, with_overlapping=False)
                # shrink the surrounds to the bounds of the features found
                first_start = min(feature.location.start for feature in within)
                last_end = max(feature.location.end for feature in within)
                surrounds = FeatureLocation(first_start, last_end)
                clusters.append(build(current, surrounds, rule, product))
                current = cds.location
                continue
            # close enough, so grow the core to include this CDS
            current = FeatureLocation(min(cds.location.start, current.start),
                                      max(cds.location.end, current.end))
            assert current.start >= 0 and current.end <= len(record)

        # finalise the last cluster
        # NOTE(review): unlike the clusters created above, the surrounds here
        # are not shrunk to the bounds of the contained features - confirm intended
        clusters.append(build(current, neighbourhood(current, rule.extent), rule, product))

    # fit to record if outside
    for cluster in clusters:
        bounded = FeatureLocation(max(0, cluster.location.start),
                                  min(cluster.location.end, len(record)))
        if bounded != cluster.location:
            cluster.location = bounded

    clusters = remove_redundant_clusters(clusters, rules_by_name)

    logging.debug("%d rule-based cluster(s) found in record", len(clusters))
    return clusters