def setUp(self):
    """Build the Goldilocks instance used by each test.

    Counts the tracks A, C, G, T and N over ``sequence_data`` using
    regions of length 3 and a stride of 1, and stores 29 as
    ``TOTAL_REGIONS``.
    """
    tracked_bases = ["A", "C", "G", "T", "N"]
    strategy = NucleotideCounterStrategy(tracked_bases)
    self.g = Goldilocks(strategy, sequence_data, length=3, stride=1)
    self.TOTAL_REGIONS = 29
"""Execute Goldilocks search."""
from goldilocks.goldilocks import Goldilocks
from goldilocks.strategies import (
    VariantCounterStrategy,
    GCRatioStrategy,
    NucleotideCounterStrategy,
    KMerCounterStrategy,
)

# TODO Methods may take a list of locations or may need to actually analyze
# a proper genomic sequence

# Search over a list of integers (is_seq=False) rather than a sequence string.
data = {"ONE": {1: [1, 2, 5]}}
g = Goldilocks(VariantCounterStrategy(), data,
               is_seq=False, stride=1, length=3)
candidates = g._filter("max", actual_distance=1)
# print(...) with a single argument behaves the same on Python 2 and 3.
print(candidates)

#########################################
# GC ratio search over a sequence string, with the candidates exported.
data = {"ONE": {1: "CCCGGGAGATTT"}}
g = Goldilocks(GCRatioStrategy(), data, 3, 1)
candidates = g._filter("max", actual_distance=1)
print(candidates)
candidates.export_fasta(["ONE"])

#########################################
# K-mer search for "AAA" and "CCC", filtering on the "AAA" track.
data = {"ONE": {1: "AAACCCGGGCCCGGGAGAAAAAAA"}}
g = Goldilocks(KMerCounterStrategy(["AAA", "CCC"]), data, 6, 1)
candidates = g._filter("max", actual_distance=1, track="AAA")
print(candidates)
def setUpClass(cls):
    """Prepare the shared fixtures for the five-base counter tests.

    Stores the input sequences, a Goldilocks instance counting
    A, C, G, T and N over regions of length 3 with stride 1, the group
    and track names, and the expected counters for every region.
    """
    def tally(a, c, t, g, n, default):
        # Expected counters for one region; a fresh dict on every call so
        # that no two regions share state.
        return {'A': a, 'C': c, 'T': t, 'G': g, 'N': n, "default": default}

    def table(*rows):
        # Key each (A, C, T, G, N, default) row by its region index, from 0.
        return {index: tally(*row) for index, row in enumerate(rows)}

    cls.sequence_data = {
        "my_sample": {
            2: "NANANANANA",
            "X": "GATTACAGATTACAN",
            "one": "CATCANCAT",
            "three": "..A",
        },
        "my_other_sample": {
            2: "GANGANGAN",
            "X": "GATTACAGATTACAN",
            "one": "TATANTATA",
            "three": ".N.",
        }
    }
    bases = ["A", "C", "G", "T", "N"]
    cls.g = Goldilocks(NucleotideCounterStrategy(bases),
                       cls.sequence_data, length=3, stride=1)

    cls.GROUPS = ["my_sample", "my_other_sample", "total"]
    cls.TRACKS = ["A", "C", "G", "T", "N", "default"]

    # Both samples carry the same sequence on "X", so the same rows are
    # expanded twice (into separate dicts) below.
    x_sample_rows = [
        (1, 0, 1, 1, 0, 3),
        (1, 0, 2, 0, 0, 3),
        (1, 0, 2, 0, 0, 3),
        (1, 1, 1, 0, 0, 3),
        (2, 1, 0, 0, 0, 3),
        (1, 1, 0, 1, 0, 3),
        (2, 0, 0, 1, 0, 3),
        (1, 0, 1, 1, 0, 3),
        (1, 0, 2, 0, 0, 3),
        (1, 0, 2, 0, 0, 3),
        (1, 1, 1, 0, 0, 3),
        (2, 1, 0, 0, 0, 3),
        (1, 1, 0, 0, 1, 3),
    ]

    # Rows are (A, C, T, G, N, default), one row per region.
    cls.EXPECTED_REGIONS = {
        2: {
            "my_sample": table(
                (1, 0, 0, 0, 2, 3),
                (2, 0, 0, 0, 1, 3),
                (1, 0, 0, 0, 2, 3),
                (2, 0, 0, 0, 1, 3),
                (1, 0, 0, 0, 2, 3),
                (2, 0, 0, 0, 1, 3),
                (1, 0, 0, 0, 2, 3),
                (2, 0, 0, 0, 1, 3),
            ),
            "my_other_sample": table(
                (1, 0, 0, 1, 1, 3),
                (1, 0, 0, 1, 1, 3),
                (1, 0, 0, 1, 1, 3),
                (1, 0, 0, 1, 1, 3),
                (1, 0, 0, 1, 1, 3),
                (1, 0, 0, 1, 1, 3),
                (1, 0, 0, 1, 1, 3),
                (1, 0, 0, 0, 1, 2),
            ),
            "total": table(
                (2, 0, 0, 1, 3, 6),
                (3, 0, 0, 1, 2, 6),
                (2, 0, 0, 1, 3, 6),
                (3, 0, 0, 1, 2, 6),
                (2, 0, 0, 1, 3, 6),
                (3, 0, 0, 1, 2, 6),
                (2, 0, 0, 1, 3, 6),
                (3, 0, 0, 0, 2, 5),
            ),
        },
        "X": {
            "my_sample": table(*x_sample_rows),
            "my_other_sample": table(*x_sample_rows),
            "total": table(
                (2, 0, 2, 2, 0, 6),
                (2, 0, 4, 0, 0, 6),
                (2, 0, 4, 0, 0, 6),
                (2, 2, 2, 0, 0, 6),
                (4, 2, 0, 0, 0, 6),
                (2, 2, 0, 2, 0, 6),
                (4, 0, 0, 2, 0, 6),
                (2, 0, 2, 2, 0, 6),
                (2, 0, 4, 0, 0, 6),
                (2, 0, 4, 0, 0, 6),
                (2, 2, 2, 0, 0, 6),
                (4, 2, 0, 0, 0, 6),
                (2, 2, 0, 0, 2, 6),
            ),
        },
        "one": {
            "my_sample": table(
                (1, 1, 1, 0, 0, 3),
                (1, 1, 1, 0, 0, 3),
                (1, 1, 1, 0, 0, 3),
                (1, 1, 0, 0, 1, 3),
                (1, 1, 0, 0, 1, 3),
                (1, 1, 0, 0, 1, 3),
                (1, 1, 1, 0, 0, 3),
            ),
            "my_other_sample": table(
                (1, 0, 2, 0, 0, 3),
                (2, 0, 1, 0, 0, 3),
                (1, 0, 1, 0, 1, 3),
                (1, 0, 1, 0, 1, 3),
                (1, 0, 1, 0, 1, 3),
                (1, 0, 2, 0, 0, 3),
                (2, 0, 1, 0, 0, 3),
            ),
            "total": table(
                (2, 1, 3, 0, 0, 6),
                (3, 1, 2, 0, 0, 6),
                (2, 1, 2, 0, 1, 6),
                (2, 1, 1, 0, 2, 6),
                (2, 1, 1, 0, 2, 6),
                (2, 1, 2, 0, 1, 6),
                (3, 1, 2, 0, 0, 6),
            ),
        },
        "three": {
            "my_sample": table((1, 0, 0, 0, 0, 1)),
            "my_other_sample": table((0, 0, 0, 0, 1, 1)),
            "total": table((1, 0, 0, 0, 1, 2)),
        }
    }

    # The tables above hold 8 + 13 + 7 + 1 = 29 regions.
    # 29 regions * 5 bases * (2+1) groups (two samples + total)
    cls.EXPECTED_NUM_REGION = 29
    cls.EXPECTED_REGION_COUNT = cls.EXPECTED_NUM_REGION * 5 * 3

    # On top of the per-base counters, every region has one extra
    # ("default") counter in each of the three groups.
    cls.EXPECTED_COUNTERS_COUNT = (
        cls.EXPECTED_REGION_COUNT + cls.EXPECTED_NUM_REGION * 3)
def setUpClass(cls):
    """Prepare the shared fixtures for the GC ratio tests.

    Stores two samples with one chromosome each, a Goldilocks instance
    using GCRatioStrategy over regions of length 8 with stride 8, the
    group and track names, and the expected ratio for every region.
    """
    def ratios(*values):
        # Key each expected ratio by its region index, from 0, under the
        # single "default" track.
        return {index: {"default": value}
                for index, value in enumerate(values)}

    cls.sequence_data = {
        "my_sample": {
            1: "GCGCGCGC..GCGCGC....GCGC......GC",
        },
        "my_other_sample": {
            1: "GC......GCGC....GCGCGC..GCGCGCGC",
        }
    }
    cls.g = Goldilocks(GCRatioStrategy(), cls.sequence_data,
                       length=8, stride=8)

    cls.GROUPS = ["my_sample", "my_other_sample", "total"]
    cls.TRACKS = ["default"]

    cls.EXPECTED_REGIONS = {
        1: {
            "my_sample": ratios(1.0, 0.75, 0.5, 0.25),
            "my_other_sample": ratios(0.25, 0.5, 0.75, 1.0),
            "total": ratios(0.625, 0.625, 0.625, 0.625),
        },
    }

    # 4 regions * (2+1) groups (two samples + total)
    cls.EXPECTED_NUM_REGION = 4
    cls.EXPECTED_REGION_COUNT = cls.EXPECTED_NUM_REGION * 3

    # "default" is the only track here, so the expected counter count is
    # simply the region count.
    cls.EXPECTED_COUNTERS_COUNT = cls.EXPECTED_REGION_COUNT
def setUpClass(cls):
    """Prepare the shared fixtures for the A/N counter tests.

    Stores two samples with three chromosomes each, a Goldilocks instance
    counting A and N over regions of length 3 with stride 3, the group
    and track names, and the expected counters for every region.
    """
    def tally(a, n, default):
        # Expected counters for one region; a fresh dict on every call.
        return {'A': a, 'N': n, "default": default}

    def table(*rows):
        # Key each (A, N, default) row by its region index, from 0.
        return {index: tally(*row) for index, row in enumerate(rows)}

    cls.sequence_data = {
        "my_sample": {
            1: "..N..N..N",
            2: "A.A.AA..A",
            3: "NNN.NN...",
        },
        "my_other_sample": {
            1: "N..NN.NNN",
            2: "A..AA....",
            3: "AAA.AA...",
        }
    }
    cls.g = Goldilocks(NucleotideCounterStrategy(["A", "N"]),
                       cls.sequence_data, length=3, stride=3)

    cls.GROUPS = ["my_sample", "my_other_sample", "total"]
    cls.TRACKS = ["A", "N", "default"]

    # Rows are (A, N, default), one row per region.
    cls.EXPECTED_REGIONS = {
        1: {
            "my_sample": table((0, 1, 1), (0, 1, 1), (0, 1, 1)),
            "my_other_sample": table((0, 1, 1), (0, 2, 2), (0, 3, 3)),
            "total": table((0, 2, 2), (0, 3, 3), (0, 4, 4)),
        },
        2: {
            "my_sample": table((2, 0, 2), (2, 0, 2), (1, 0, 1)),
            "my_other_sample": table((1, 0, 1), (2, 0, 2), (0, 0, 0)),
            "total": table((3, 0, 3), (4, 0, 4), (1, 0, 1)),
        },
        3: {
            "my_sample": table((0, 3, 3), (0, 2, 2), (0, 0, 0)),
            "my_other_sample": table((3, 0, 3), (2, 0, 2), (0, 0, 0)),
            "total": table((3, 3, 6), (2, 2, 4), (0, 0, 0)),
        }
    }

    # 9 regions * 2 bases * (2+1) groups (two samples + total)
    cls.EXPECTED_NUM_REGION = 9
    cls.EXPECTED_REGION_COUNT = cls.EXPECTED_NUM_REGION * 2 * 3

    # On top of the per-base counters, every region has one extra
    # ("default") counter in each of the three groups.
    cls.EXPECTED_COUNTERS_COUNT = (
        cls.EXPECTED_REGION_COUNT + cls.EXPECTED_NUM_REGION * 3)
class TestGoldilocks(unittest.TestCase):
    """Tests for Goldilocks argument validation, exclusions and limits."""

    def setUp(self):
        """Build a fresh five-base Goldilocks instance for each test."""
        self.g = Goldilocks(
            NucleotideCounterStrategy(["A", "C", "G", "T", "N"]),
            sequence_data, length=3, stride=1)
        # Number of candidates expected when nothing is excluded; used as
        # the upper bound in the limit and exclusion checks.
        self.TOTAL_REGIONS = 29

    def __test_simple_exclusions(self, EXCLUSIONS, limit=0):
        """Check single-criterion exclusions for every operation in OPS.

        EXCLUSIONS maps a case name to a dict holding the "filter" name,
        the "value" to exclude by and, optionally, an "expect_none" key
        marking cases in which no candidate should be returned.  A
        positive ``limit`` is forwarded to ``_filter`` and the number of
        returned candidates is checked against it.
        """
        # Filter name -> (candidate property, check applied to survivors).
        FILTER_TO_PROPERTY = {
            "start_lte": ("pos_start", "lt"),
            "start_gte": ("pos_start", "gt"),
            "end_lte": ("pos_end", "lt"),
            "end_gte": ("pos_end", "gt"),
            "chr": ("chr", "nin")
        }

        for exclusion in EXCLUSIONS.values():
            cproperty, test_type = FILTER_TO_PROPERTY[exclusion["filter"]]
            value = exclusion["value"]
            expect_none = "expect_none" in exclusion

            for op in OPS:
                # A fresh exclusions dict is passed on every call.
                filter_kwargs = {
                    "exclusions": {exclusion["filter"]: value},
                }
                if limit > 0:
                    filter_kwargs["limit"] = limit
                candidates = self.g._filter(op, **filter_kwargs).candidates

                for c in candidates:
                    if test_type == "lt":
                        # "<=" exclusion: survivors lie strictly above.
                        self.assertGreater(c[cproperty], value)
                    elif test_type == "gt":
                        # ">=" exclusion: survivors lie strictly below.
                        self.assertLess(c[cproperty], value)
                    elif test_type == "nin":
                        self.assertNotIn(c[cproperty], value)
                    else:
                        self.fail("Incorrect test_type")

                if len(candidates) == 0 and not expect_none:
                    self.fail(
                        "No candidates returned but at least one expected...")

                if limit:
                    if limit > self.TOTAL_REGIONS:
                        # Don't test if limit is larger than number of regions
                        pass
                    elif not expect_none:
                        self.assertEqual(limit, len(candidates))
                    else:
                        self.assertEqual(0, len(candidates))

    def __check_exclusions_at_all_limits(self, EXCLUSIONS):
        """Run the simple exclusion checks unlimited and at limits 1, 5, 100."""
        self.__test_simple_exclusions(EXCLUSIONS)
        for limit in (1, 5, 100):
            self.__test_simple_exclusions(EXCLUSIONS, limit=limit)

    def test_missing_length(self):
        """Omitting ``length`` raises TypeError."""
        self.assertRaises(TypeError, Goldilocks,
                          NucleotideCounterStrategy([]), sequence_data,
                          stride=1)

    def test_missing_stride(self):
        """Omitting ``stride`` raises TypeError."""
        self.assertRaises(TypeError, Goldilocks,
                          NucleotideCounterStrategy([]), sequence_data,
                          length=1)

    def test_invalid_stride(self):
        """A zero or negative ``stride`` raises ValueError."""
        for stride in (0, -1, -1000):
            self.assertRaises(ValueError, Goldilocks,
                              NucleotideCounterStrategy([]), sequence_data,
                              length=1, stride=stride)

    def test_invalid_length(self):
        """A zero or negative ``length`` raises ValueError."""
        for length in (0, -1, -1000):
            self.assertRaises(ValueError, Goldilocks,
                              NucleotideCounterStrategy([]), sequence_data,
                              length=length, stride=1)

    def test_invalid_filter_distance(self):
        """Giving both actual and percentile distance raises ValueError."""
        for op in OPS:
            self.assertRaises(ValueError, self.g._filter, op,
                              actual_distance=1, percentile_distance=1)

    def test_invalid_sort_operation(self):
        """An unknown operation name raises TypeError."""
        # The operation under test is fixed, so there is nothing to loop over.
        self.assertRaises(TypeError, self.g._filter, "hoot")

    def test_unimplemented_strategy(self):
        """Using the bare BaseStrategy raises NotImplementedError."""
        self.assertRaises(NotImplementedError, Goldilocks, BaseStrategy(),
                          sequence_data, length=1, stride=1)

    def test_exclude_chr(self):
        """Candidates never come from an excluded chromosome."""
        EXCLUSIONS = {
            "simple_chr_str": {
                "filter": "chr",
                "value": ["one"],
            },
            "simple_chr_int": {
                "filter": "chr",
                "value": [2],
            },
            "simple_all_chr": {
                "filter": "chr",
                "value": ["one", "X", 2, "three"],
                "expect_none": True
            },
        }
        self.__check_exclusions_at_all_limits(EXCLUSIONS)

    def test_exclude_start_gte(self):
        """Candidates start strictly below a ``start_gte`` exclusion value."""
        EXCLUSIONS = {
            "simple_start_gte": {
                "filter": "start_gte",
                "value": 5,
            },
            "prevent_start_gte": {
                "filter": "start_gte",
                "value": 100,
            },
            "all_start_gte": {
                "filter": "start_gte",
                "value": 1,
                "expect_none": True
            },
        }
        self.__check_exclusions_at_all_limits(EXCLUSIONS)

    def test_exclude_start_lte(self):
        """Candidates start strictly above a ``start_lte`` exclusion value."""
        EXCLUSIONS = {
            "simple_start_lte": {
                "filter": "start_lte",
                "value": 5,
            },
            "prevent_start_lte": {
                "filter": "start_lte",
                "value": 0,
            },
            "all_start_lte": {
                "filter": "start_lte",
                "value": 100,
                "expect_none": True
            },
        }
        self.__check_exclusions_at_all_limits(EXCLUSIONS)

    def test_exclude_end_gte(self):
        """Candidates end strictly below an ``end_gte`` exclusion value."""
        EXCLUSIONS = {
            "simple_end_gte": {
                "filter": "end_gte",
                "value": 5,
            },
            "prevent_end_gte": {
                "filter": "end_gte",
                "value": 100,
            },
            "all_end_gte": {
                "filter": "end_gte",
                "value": 1,
                "expect_none": True
            },
        }
        self.__check_exclusions_at_all_limits(EXCLUSIONS)

    def test_exclude_end_lte(self):
        """Candidates end strictly above an ``end_lte`` exclusion value."""
        EXCLUSIONS = {
            "simple_end_lte": {
                "filter": "end_lte",
                "value": 5,
            },
            "prevent_end_lte": {
                "filter": "end_lte",
                "value": 0,
            },
            "all_end_lte": {
                "filter": "end_lte",
                "value": 100,
                "expect_none": True
            },
        }
        self.__check_exclusions_at_all_limits(EXCLUSIONS)

    def test_exclude_and(self):
        """With ``use_and``, no candidate matches both criteria at once."""
        for op in OPS:
            candidates = self.g._filter(op, exclusions={
                "start_gte": 5,
                "end_lte": 9,
            }, use_and=True).candidates

            for c in candidates:
                self.assertFalse(c["pos_start"] >= 5 and c["pos_end"] <= 9)
            self.assertLess(len(candidates), self.TOTAL_REGIONS)

    def test_exclude_and_with_chr(self):
        """With ``use_and`` and a chr list, only "X" loses the 5-9 window."""
        for op in OPS:
            candidates = self.g._filter(op, exclusions={
                "start_gte": 5,
                "end_lte": 9,
                "chr": ["X"],
            }, use_and=True).candidates

            # Candidates inside the window that are not on "X".
            non_x_count = 0
            for c in candidates:
                in_window = c["pos_start"] >= 5 and c["pos_end"] <= 9
                if c["chr"] == "X":
                    self.assertFalse(in_window)
                elif in_window:
                    non_x_count += 1
            self.assertGreater(non_x_count, 0)
            self.assertLess(len(candidates), self.TOTAL_REGIONS)

    def test_exclude_chr_specific_chr(self):
        # TODO: not implemented yet.
        pass

    def test_exclude_chr_specific_start(self):
        # TODO: not implemented yet.
        pass

    def test_exclude_chr_specific_end(self):
        # TODO: not implemented yet.
        pass

    def test_exclude_chr_specific_and(self):
        # TODO: not implemented yet.
        pass

    def test_limit(self):
        """``limit`` caps the candidate count at the number of regions."""
        for op in OPS:
            candidates = self.g.query(op, limit=1).candidates
            self.assertEqual(1, len(candidates))

            candidates = self.g.query(op, limit=10).candidates
            self.assertEqual(10, len(candidates))

            # A limit above the number of regions returns every region.
            candidates = self.g.query(op, limit=100).candidates
            self.assertEqual(self.TOTAL_REGIONS, len(candidates))

    def test_distance_upper(self):
        # TODO: not implemented yet.
        pass

    def test_distance_lower(self):
        # TODO: not implemented yet.
        pass

    def test_distance_around(self):
        # TODO: not implemented yet.
        pass
"""Execute Goldilocks search."""
from goldilocks.goldilocks import Goldilocks
from goldilocks.strategies import (
    VariantCounterStrategy,
    GCRatioStrategy,
    NucleotideCounterStrategy,
    KMerCounterStrategy,
)

# TODO Methods may take a list of locations or may need to actually analyze
# a proper genomic sequence

# Search over a list of integers (is_seq=False) rather than a sequence string.
data = {"ONE": {1: [1, 2, 5]}}
g = Goldilocks(VariantCounterStrategy(), data,
               is_seq=False, stride=1, length=3)
candidates = g._filter("max", actual_distance=1)
# print(...) with a single argument behaves the same on Python 2 and 3.
print(candidates)

#########################################
# GC ratio search over a sequence string, with the candidates exported.
data = {"ONE": {1: "CCCGGGAGATTT"}}
g = Goldilocks(GCRatioStrategy(), data, 3, 1)
candidates = g._filter("max", actual_distance=1)
print(candidates)
candidates.export_fasta(["ONE"])

#########################################
# K-mer counter for "AAA" and "CCC" over regions of length 6, stride 1.
data = {"ONE": {1: "AAACCCGGGCCCGGGAGAAAAAAA"}}
g = Goldilocks(KMerCounterStrategy(["AAA", "CCC"]), data, 6, 1)
def setUp(self):
    """Build the Goldilocks instance used by each test.

    Counts the tracks A, C, G, T and N over ``sequence_data`` using
    regions of length 3 and a stride of 1, and stores 29 as
    ``TOTAL_REGIONS``.
    """
    tracked_bases = ["A", "C", "G", "T", "N"]
    strategy = NucleotideCounterStrategy(tracked_bases)
    self.g = Goldilocks(strategy, sequence_data, length=3, stride=1)
    self.TOTAL_REGIONS = 29
class TestGoldilocks(unittest.TestCase):
    """Tests for Goldilocks argument validation, exclusions and limits."""

    def setUp(self):
        """Build a fresh five-base Goldilocks instance for each test."""
        self.g = Goldilocks(
            NucleotideCounterStrategy(["A", "C", "G", "T", "N"]),
            sequence_data, length=3, stride=1)
        # Number of candidates expected when nothing is excluded; used as
        # the upper bound in the limit and exclusion checks.
        self.TOTAL_REGIONS = 29

    def __test_simple_exclusions(self, EXCLUSIONS, limit=0):
        """Check single-criterion exclusions for every operation in OPS.

        EXCLUSIONS maps a case name to a dict holding the "filter" name,
        the "value" to exclude by and, optionally, an "expect_none" key
        marking cases in which no candidate should be returned.  A
        positive ``limit`` is forwarded to ``_filter`` and the number of
        returned candidates is checked against it.
        """
        # Filter name -> (candidate property, check applied to survivors).
        FILTER_TO_PROPERTY = {
            "start_lte": ("pos_start", "lt"),
            "start_gte": ("pos_start", "gt"),
            "end_lte": ("pos_end", "lt"),
            "end_gte": ("pos_end", "gt"),
            "chr": ("chr", "nin")
        }

        for exclusion in EXCLUSIONS.values():
            cproperty, test_type = FILTER_TO_PROPERTY[exclusion["filter"]]
            value = exclusion["value"]
            expect_none = "expect_none" in exclusion

            for op in OPS:
                # A fresh exclusions dict is passed on every call.
                filter_kwargs = {
                    "exclusions": {exclusion["filter"]: value},
                }
                if limit > 0:
                    filter_kwargs["limit"] = limit
                candidates = self.g._filter(op, **filter_kwargs).candidates

                for c in candidates:
                    if test_type == "lt":
                        # "<=" exclusion: survivors lie strictly above.
                        self.assertGreater(c[cproperty], value)
                    elif test_type == "gt":
                        # ">=" exclusion: survivors lie strictly below.
                        self.assertLess(c[cproperty], value)
                    elif test_type == "nin":
                        self.assertNotIn(c[cproperty], value)
                    else:
                        self.fail("Incorrect test_type")

                if len(candidates) == 0 and not expect_none:
                    self.fail(
                        "No candidates returned but at least one expected...")

                if limit:
                    if limit > self.TOTAL_REGIONS:
                        # Don't test if limit is larger than number of regions
                        pass
                    elif not expect_none:
                        self.assertEqual(limit, len(candidates))
                    else:
                        self.assertEqual(0, len(candidates))

    def __check_exclusions_at_all_limits(self, EXCLUSIONS):
        """Run the simple exclusion checks unlimited and at limits 1, 5, 100."""
        self.__test_simple_exclusions(EXCLUSIONS)
        for limit in (1, 5, 100):
            self.__test_simple_exclusions(EXCLUSIONS, limit=limit)

    def test_missing_length(self):
        """Omitting ``length`` raises TypeError."""
        self.assertRaises(TypeError, Goldilocks,
                          NucleotideCounterStrategy([]), sequence_data,
                          stride=1)

    def test_missing_stride(self):
        """Omitting ``stride`` raises TypeError."""
        self.assertRaises(TypeError, Goldilocks,
                          NucleotideCounterStrategy([]), sequence_data,
                          length=1)

    def test_invalid_stride(self):
        """A zero or negative ``stride`` raises ValueError."""
        for stride in (0, -1, -1000):
            self.assertRaises(ValueError, Goldilocks,
                              NucleotideCounterStrategy([]), sequence_data,
                              length=1, stride=stride)

    def test_invalid_length(self):
        """A zero or negative ``length`` raises ValueError."""
        for length in (0, -1, -1000):
            self.assertRaises(ValueError, Goldilocks,
                              NucleotideCounterStrategy([]), sequence_data,
                              length=length, stride=1)

    def test_invalid_filter_distance(self):
        """Giving both actual and percentile distance raises ValueError."""
        for op in OPS:
            self.assertRaises(ValueError, self.g._filter, op,
                              actual_distance=1, percentile_distance=1)

    def test_invalid_sort_operation(self):
        """An unknown operation name raises TypeError."""
        # The operation under test is fixed, so there is nothing to loop over.
        self.assertRaises(TypeError, self.g._filter, "hoot")

    def test_unimplemented_strategy(self):
        """Using the bare BaseStrategy raises NotImplementedError."""
        self.assertRaises(NotImplementedError, Goldilocks, BaseStrategy(),
                          sequence_data, length=1, stride=1)

    def test_exclude_chr(self):
        """Candidates never come from an excluded chromosome."""
        EXCLUSIONS = {
            "simple_chr_str": {
                "filter": "chr",
                "value": ["one"],
            },
            "simple_chr_int": {
                "filter": "chr",
                "value": [2],
            },
            "simple_all_chr": {
                "filter": "chr",
                "value": ["one", "X", 2, "three"],
                "expect_none": True
            },
        }
        self.__check_exclusions_at_all_limits(EXCLUSIONS)

    def test_exclude_start_gte(self):
        """Candidates start strictly below a ``start_gte`` exclusion value."""
        EXCLUSIONS = {
            "simple_start_gte": {
                "filter": "start_gte",
                "value": 5,
            },
            "prevent_start_gte": {
                "filter": "start_gte",
                "value": 100,
            },
            "all_start_gte": {
                "filter": "start_gte",
                "value": 1,
                "expect_none": True
            },
        }
        self.__check_exclusions_at_all_limits(EXCLUSIONS)

    def test_exclude_start_lte(self):
        """Candidates start strictly above a ``start_lte`` exclusion value."""
        EXCLUSIONS = {
            "simple_start_lte": {
                "filter": "start_lte",
                "value": 5,
            },
            "prevent_start_lte": {
                "filter": "start_lte",
                "value": 0,
            },
            "all_start_lte": {
                "filter": "start_lte",
                "value": 100,
                "expect_none": True
            },
        }
        self.__check_exclusions_at_all_limits(EXCLUSIONS)

    def test_exclude_end_gte(self):
        """Candidates end strictly below an ``end_gte`` exclusion value."""
        EXCLUSIONS = {
            "simple_end_gte": {
                "filter": "end_gte",
                "value": 5,
            },
            "prevent_end_gte": {
                "filter": "end_gte",
                "value": 100,
            },
            "all_end_gte": {
                "filter": "end_gte",
                "value": 1,
                "expect_none": True
            },
        }
        self.__check_exclusions_at_all_limits(EXCLUSIONS)

    def test_exclude_end_lte(self):
        """Candidates end strictly above an ``end_lte`` exclusion value."""
        EXCLUSIONS = {
            "simple_end_lte": {
                "filter": "end_lte",
                "value": 5,
            },
            "prevent_end_lte": {
                "filter": "end_lte",
                "value": 0,
            },
            "all_end_lte": {
                "filter": "end_lte",
                "value": 100,
                "expect_none": True
            },
        }
        self.__check_exclusions_at_all_limits(EXCLUSIONS)

    def test_exclude_and(self):
        """With ``use_and``, no candidate matches both criteria at once."""
        for op in OPS:
            candidates = self.g._filter(op, exclusions={
                "start_gte": 5,
                "end_lte": 9,
            }, use_and=True).candidates

            for c in candidates:
                self.assertFalse(c["pos_start"] >= 5 and c["pos_end"] <= 9)
            self.assertLess(len(candidates), self.TOTAL_REGIONS)

    def test_exclude_and_with_chr(self):
        """With ``use_and`` and a chr list, only "X" loses the 5-9 window."""
        for op in OPS:
            candidates = self.g._filter(op, exclusions={
                "start_gte": 5,
                "end_lte": 9,
                "chr": ["X"],
            }, use_and=True).candidates

            # Candidates inside the window that are not on "X".
            non_x_count = 0
            for c in candidates:
                in_window = c["pos_start"] >= 5 and c["pos_end"] <= 9
                if c["chr"] == "X":
                    self.assertFalse(in_window)
                elif in_window:
                    non_x_count += 1
            self.assertGreater(non_x_count, 0)
            self.assertLess(len(candidates), self.TOTAL_REGIONS)

    def test_exclude_chr_specific_chr(self):
        # TODO: not implemented yet.
        pass

    def test_exclude_chr_specific_start(self):
        # TODO: not implemented yet.
        pass

    def test_exclude_chr_specific_end(self):
        # TODO: not implemented yet.
        pass

    def test_exclude_chr_specific_and(self):
        # TODO: not implemented yet.
        pass

    def test_limit(self):
        """``limit`` caps the candidate count at the number of regions."""
        for op in OPS:
            candidates = self.g.query(op, limit=1).candidates
            self.assertEqual(1, len(candidates))

            candidates = self.g.query(op, limit=10).candidates
            self.assertEqual(10, len(candidates))

            # A limit above the number of regions returns every region.
            candidates = self.g.query(op, limit=100).candidates
            self.assertEqual(self.TOTAL_REGIONS, len(candidates))

    def test_distance_upper(self):
        # TODO: not implemented yet.
        pass

    def test_distance_lower(self):
        # TODO: not implemented yet.
        pass

    def test_distance_around(self):
        # TODO: not implemented yet.
        pass