def test_hsp_overlap_size(self):
    """Exercise hsp_overlap_size on disjoint, touching, partial and nested hits.

    Also checks that a hit whose end precedes its start trips an assertion.
    """
    measure = hmm_detection.hsp_overlap_size
    moving = FakeHSPHit("A", "A", 50, 60, 0., None)
    fixed = FakeHSPHit("B", "B", 70, 100, 0., None)

    # disjoint hits share nothing
    assert measure(moving, fixed) == 0

    # hits that merely touch still share nothing, since the end is exclusive
    moving.hit_end = 70
    assert measure(moving, fixed) == 0

    # the fixed hit starts inside the moving hit; the shared region grows
    # by one for every position the moving hit's end advances
    for expected in range(1, 30):
        moving.hit_end = 70 + expected
        assert measure(moving, fixed) == expected

    # the fixed hit lies wholly within the moving hit (50-110 vs 70-100)
    moving.hit_end = 110
    assert measure(moving, fixed) == 30

    # the moving hit now starts inside the fixed hit and runs past its end
    # (75-110 vs 70-100)
    moving.hit_start = 75
    assert measure(moving, fixed) == 25

    # an end before the start is invalid and must be rejected
    moving.hit_end = 50
    with self.assertRaises(AssertionError):
        measure(moving, fixed)
def setUp(self):
    """Create the CDS features, record, fake HMM hits and signature names
    used by the tests of this class.
    """
    # (locus tag, start, end) for every dummy CDS, in insertion order
    cds_layout = [
        ("GENE_1", 0, 30000),
        ("GENE_2", 30000, 50000),
        ("GENE_3", 70000, 90000),
        ("GENE_X", 95000, 100000),
        ("GENE_4", 125000, 140000),
        ("GENE_5", 145000, 150000),
    ]
    self.feature_by_id = {
        tag: DummyCDS(start, end, locus_tag=tag)
        for tag, start, end in cds_layout
    }
    # explicit ordering by start position; noted as vital for py3 < 3.5
    self.features = sorted(self.feature_by_id.values(),
                           key=lambda cds: cds.location.start)
    self.record = DummyRecord(self.features)

    # the two profile names hit within each gene; GENE_X has no hits
    profiles_by_gene = {
        "GENE_1": ("a", "b"),
        "GENE_2": ("a", "c"),
        "GENE_3": ("b", "c"),
        "GENE_4": ("e", "f"),
        "GENE_5": ("f", "g"),
    }
    # NOTE(review): every hit is created with hit_id "GENE_1", whatever gene
    # it is filed under - confirm whether that is intentional
    self.results_by_id = {
        gene: [FakeHSPHit(profile, "GENE_1", 0, 10, 50, 0) for profile in profiles]
        for gene, profiles in profiles_by_gene.items()
    }
    self.signature_names = {"a", "b", "c", "d", "e", "f", "g", "modelA", "modelB"}
def test_filter_multiple(self):
    """Check filter_result_multiple: non-overlapping hits in one CDS that
    share a query_id are culled down to the best scoring one.
    """
    # presumably the fifth argument is the score (0.1 vs 0.5) - the hit
    # with 0.5 is the one that survives the culling below
    weak = FakeHSPHit("AMP-binding", "A", 50, 60, 0.1, None)
    strong = FakeHSPHit("A-OX", "A", 70, 100, 0.5, None)
    hits = [weak, strong]
    grouped = {"A": [weak, strong]}

    # same CDS, no overlap, different query_ids: nothing is removed
    kept, grouped = hmm_detection.filter_result_multiple(list(hits), dict(grouped))
    assert kept == [weak, strong]
    assert grouped == {"A": [weak, strong]}

    # same CDS, no overlap, same query_id: only one hit survives
    weak.query_id = "A-OX"
    kept, grouped = hmm_detection.filter_result_multiple(list(hits), dict(grouped))
    assert kept == [strong]
    assert grouped == {"A": [strong]}

    # same query_id but different CDSs: both hits are retained
    strong.hit_id = "B"
    grouped = {"A": [weak], "B": [strong]}
    kept, grouped = hmm_detection.filter_result_multiple(list(hits), dict(grouped))
    assert kept == [weak, strong]
    assert grouped == {"A": [weak], "B": [strong]}
def test_filter(self):
    """Check filter_results: hits in one CDS that overlap by more than 20
    and whose query_ids share an equivalence group are reduced to one.
    """
    early = FakeHSPHit("AMP-binding", "A", 50, 90, 0.1, None)
    late = FakeHSPHit("A-OX", "A", 70, 100, 0.5, None)

    def run_filter(hits_by_cds):
        # always filters a fresh list of both hits with the class fixtures
        return hmm_detection.filter_results([early, late], hits_by_cds,
                                            self.filter_file, self.signature_names)

    # overlap is exactly 20, which is not enough to trigger filtering
    kept, grouped = run_filter({"A": [early, late]})
    assert kept == [early, late]
    assert grouped == {"A": [early, late]}

    # overlap of 21 within the same group: one hit is dropped
    early.hit_end = 91
    assert hmm_detection.hsp_overlap_size(early, late) == 21
    kept, grouped = run_filter({"A": [early, late]})
    assert kept == [late]
    assert grouped == {"A": [late]}

    # overlapping, but the query_ids are no longer in the same group
    late.query_id = "none"
    kept, grouped = run_filter({"A": [early, late]})
    assert kept == [early, late]
    assert grouped == {"A": [early, late]}

    # in the same group again, but the hits are in different CDSs
    late.hit_id = "B"
    late.query_id = "A-OX"
    kept, grouped = run_filter({"A": [early], "B": [late]})
    assert kept == [early, late]
    assert grouped == {"A": [early], "B": [late]}
def setUp(self):
    """Prepare config, data file paths, fake HMM hits, CDS features,
    detection rules and a populated record for the tests of this class.
    """
    def beside(*parts):
        # resolve a path relative to the parent directory of this file
        return path.get_full_path(__file__, "..", *parts)

    self.config = build_config([])
    self.rules_file = beside("cluster_rules", "strict.txt")
    self.signature_file = beside("data", "hmmdetails.txt")
    self.signature_names = {sig.name for sig in core.get_signature_profiles()}
    self.valid_categories = {cat.name for cat in core.get_rule_categories()}
    self.filter_file = beside("filterhmmdetails.txt")

    # the two models hit within each gene; each hit names its own gene
    models_by_gene = {
        "GENE_1": ("modelA", "modelB"),
        "GENE_2": ("modelC", "modelB"),
        "GENE_3": ("modelC", "modelF"),
        "GENE_4": ("modelA", "modelE"),
        "GENE_5": ("modelA", "modelG"),
    }
    self.results_by_id = {
        gene: [FakeHSPHit(model, gene, 0, 10, 50, 0) for model in models]
        for gene, models in models_by_gene.items()
    }

    # (locus tag, start, end); GENE_X has no hits
    cds_layout = [
        ("GENE_1", 0, 30000),
        ("GENE_2", 30000, 50000),
        ("GENE_3", 70000, 90000),
        ("GENE_X", 95000, 100000),
        ("GENE_4", 125000, 140000),
        ("GENE_5", 130000, 150000),
    ]
    self.feature_by_id = {
        tag: DummyCDS(start, end, locus_tag=tag)
        for tag, start, end in cds_layout
    }

    self.test_names = {"modelA", "modelB", "modelC", "modelF", "modelG",
                       "a", "b", "c", "d"}
    self.categories = {"Cat"}

    # one rule per line of the text handed to the parser
    rule_lines = [
        "RULE MetaboliteA CATEGORY Cat CUTOFF 10 NEIGHBOURHOOD 5 CONDITIONS modelA",
        "RULE MetaboliteB CATEGORY Cat CUTOFF 10 NEIGHBOURHOOD 5 CONDITIONS cds(modelA and modelB)",
        "RULE MetaboliteC CATEGORY Cat CUTOFF 10 NEIGHBOURHOOD 5 CONDITIONS (modelA and modelB)",
        "RULE MetaboliteD CATEGORY Cat CUTOFF 20 NEIGHBOURHOOD 5 CONDITIONS minimum(2,[modelC,modelB]) and modelA",
        "RULE Metabolite0 CATEGORY Cat CUTOFF 1 NEIGHBOURHOOD 3 CONDITIONS modelF",
        "RULE Metabolite1 CATEGORY Cat CUTOFF 1 NEIGHBOURHOOD 3 CONDITIONS modelG",
    ]
    parser = rule_parser.Parser("\n".join(rule_lines), self.test_names, self.categories)
    self.rules = parser.rules

    # explicit ordering by start position; noted as vital for py3 < 3.5
    self.features = sorted(self.feature_by_id.values(),
                           key=lambda cds: cds.location.start)

    # the record gets a sequence spanning all of the features before
    # the features themselves are added
    self.record = Record()
    self.record._record.seq = Seq("A"*150000)
    for cds in self.features:
        self.record.add_cds_feature(cds)
def test_chained_and_a(self):
    """Check that "a and b and not c" only reports GENE_1 once GENE_2 has
    lost its own c hit.
    """
    # replace GENE_2's hits so that only the "a" hit remains
    only_a = FakeHSPHit("a", "GENE_1", 0, 10, 50, 0)
    self.results_by_id["GENE_2"] = [only_a]

    matched = self.run_test("A", 25, 20, "a and b and not c")

    # GENE_1 holds both a and b itself
    # GENE_2 is expected to be excluded - presumably because it still
    # reaches the c hit in GENE_3 (TODO confirm against run_test's ranges)
    self.expect(matched, ["GENE_1"])
def test_single_gene(self):
    """Check that a record reduced to a single gene with two hits satisfies
    each of several differently phrased conditions.
    """
    # cut the fixtures down to one gene carrying both models
    self.results_by_id = {
        "GENE_1": [FakeHSPHit(model, "GENE_1", 0, 10, 50, 0)
                   for model in ("modelA", "modelB")],
    }
    self.feature_by_id = {"GENE_1": DummyCDS(0, 30000, locus_tag="GENE_1")}

    # every one of these conditions must be met by GENE_1 alone
    conditions = (
        "minimum(2, [modelA,modelB])",
        "cds(modelA and modelB)",
        "cds(modelA or modelB)",
        "modelA and modelB",
        "modelA or modelB",
    )
    for condition in conditions:
        matched = self.run_test("A", 10, 20, condition)
        self.expect(matched, ["GENE_1"])