def test_get_chromosome_1_loop_through_size_20_lower_pval(self):
    """Page through chromosome 1 in batches of 20 under a p-value interval.

    Every non-empty batch must hold exactly 20 entries per queried dataset
    and only studies from s1/s3. Paging stops at the first empty batch.
    """
    batch_size = 20
    pvals = FloatInterval().set_tuple(0.00001, 0.00001)
    offset = 0
    # Starts at 1, so it ends one above the number of non-empty batches.
    batches_seen = 1
    collected = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)

    while True:
        datasets, index_marker = self.searcher.search_chromosome(
            chromosome=1, start=offset, size=batch_size, pval_interval=pvals)
        collected = utils.extend_dsets_with_subset(collected, datasets)
        if len(datasets[REFERENCE_DSET]) <= 0:
            break

        assert_studies_in_list(datasets, ['s1', 's3'])
        assert_datasets_have_size(datasets, TO_QUERY_DSETS, batch_size)
        # start changes on each loop!
        offset += index_marker
        batches_seen += 1

    assert batches_seen == 5
    # The terminating (empty) query is expected to report a marker of 0.
    assert index_marker == 0
    # 80 unique variants
    assert len(set(collected[SNP_DSET])) == len(collected[SNP_DSET]) == 80
def test_loop_through_t2_size_42(self):
    """Page through trait 't2' in batches of 42 and check the study mix.

    Expected per batch: first 42 x s3, second 8 x s3 plus 34 x s4, any later
    batch 16 x s4. The collected SNP values must all be unique.
    """
    page_size = 42
    offset = 0
    page_no = 1
    collected = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)

    while True:
        datasets, index_marker = self.searcher.search_trait(
            trait='t2', start=offset, size=page_size)
        collected = utils.extend_dsets_with_subset(collected, datasets)
        if not len(datasets[REFERENCE_DSET]) > 0:
            break

        # (study, expected occurrences) pairs for the current batch
        if page_no <= 1:
            study_counts = [('s3', 42)]
        elif page_no == 2:
            study_counts = [('s3', 8), ('s4', 34)]
        else:
            study_counts = [('s4', 16)]

        for study, times in study_counts:
            assert_number_of_times_study_is_in_datasets(datasets, study, times)
        assert_studies_from_list(datasets, [study for study, _ in study_counts])

        page_no += 1
        offset += index_marker

    assert page_no == 4
    assert len(set(collected[SNP_DSET])) == len(collected[SNP_DSET])
def test_get_chromosome_1_loop_through_size_20_s3(self):
    """Page through chromosome 1 restricted to study 's3' in batches of 20.

    The first two batches must be full (20 entries); any later non-empty
    batch must hold 10 entries. Only study s3 may appear.
    """
    batch_size = 20
    offset = 0
    # Starts at 1, so it ends one above the number of non-empty batches.
    batches_seen = 1
    collected = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)

    while True:
        datasets, index_marker = self.searcher.search_chromosome(
            chromosome=1, start=offset, size=batch_size, study='s3')
        collected = utils.extend_dsets_with_subset(collected, datasets)
        if len(datasets[REFERENCE_DSET]) <= 0:
            break

        assert_studies_from_list(datasets, ['s3'])
        expected_size = 20 if batches_seen <= 2 else 10
        assert_datasets_have_size(datasets, TO_QUERY_DSETS, expected_size)

        offset += index_marker
        batches_seen += 1

    assert batches_seen == 4
    # The terminating (empty) query is expected to report a marker of 0.
    assert index_marker == 0
    # 50 unique variants
    assert len(set(collected[SNP_DSET])) == len(collected[SNP_DSET]) == 50
def __init__(self, snp, start, size, config_properties=None, chromosome=None):
    """Store the query parameters, load the properties and pick the service.

    :param snp: SNP identifier to search for (stored as-is)
    :param start: start offset of the query (stored as-is)
    :param size: requested number of results (stored as-is)
    :param config_properties: passed to properties_handler.get_properties
    :param chromosome: optional; decides how the service is obtained
    """
    self.snp = snp
    self.chromosome = chromosome
    self.start = start
    self.size = size

    props = properties_handler.get_properties(config_properties)
    self.properties = props
    self.search_path = properties_handler.get_search_path(props)
    self.chr_dir = props.chr_dir
    self.snp_dir = props.snp_dir
    self.bp_step = props.bp_step

    self.datasets = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)
    self.index_marker = 0

    # Without a chromosome the service has to be calculated; with one it is
    # fetched directly. Only the selected factory is looked up and called.
    service_factory = (self._calculate_snp_service if chromosome is None
                       else self._get_snp_service)
    self.service = service_factory()
def test_loop_through_w_restrinction_and_always_get_size_20_results(self):
    """Page through all associations in batches of 20 under a p-value interval.

    Batches 1-2 must contain only s2 (20 entries), batch 3 must be split
    10 x s2 / 10 x s3, and later batches must contain only s3 (20 entries).
    In total 100 unique SNP values are expected.
    """
    page_size = 20
    offset = 0
    page_no = 1
    # s2 and s3 p-value limits
    pvals = FloatInterval().set_tuple(0.0002, 0.06)
    collected = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)

    while True:
        print("start", offset)
        datasets, next_index = self.searcher.search_all_assocs(
            start=offset, size=page_size, pval_interval=pvals)
        print("next index", next_index)
        print(datasets[STUDY_DSET])
        collected = utils.extend_dsets_with_subset(collected, datasets)
        if not len(datasets[REFERENCE_DSET]) > 0:
            break

        if page_no == 3:
            # the batch that straddles the two studies
            assert_studies_from_list(datasets, ['s2', 's3'])
            assert_number_of_times_study_is_in_datasets(datasets, 's2', 10)
            assert_number_of_times_study_is_in_datasets(datasets, 's3', 10)
        else:
            only_study = 's2' if page_no <= 2 else 's3'
            assert_studies_from_list(datasets, [only_study])
            assert_datasets_have_size(datasets, TO_QUERY_DSETS, 20)

        page_no += 1
        offset += next_index

    assert len(set(collected[SNP_DSET])) == len(collected[SNP_DSET]) == 100
def test_loop_through_t2_size_5_w_restriction_to_s4(self):
    """Page through trait 't2' in batches of 5 under a p-value interval.

    Every non-empty batch must consist of 5 entries of study s4, and the
    marker returned by the very first query must be 55.
    """
    page_size = 5
    offset = 0
    page_no = 1
    pvals = FloatInterval().set_tuple(0.06, 0.3)
    collected = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)

    while True:
        datasets, index_marker = self.searcher.search_trait(
            trait='t2', start=offset, size=page_size, pval_interval=pvals)
        collected = utils.extend_dsets_with_subset(collected, datasets)
        if not len(datasets[REFERENCE_DSET]) > 0:
            break

        # already on the first loop I want to have reached s4
        is_first_page = page_no == 1
        if is_first_page:
            # all of s3 + the first 5 elements of s4
            assert index_marker == 55
        assert_number_of_times_study_is_in_datasets(datasets, 's4', 5)
        assert_studies_from_list(datasets, ['s4'])

        page_no += 1
        offset += index_marker

    assert page_no == 11
    assert len(set(collected[SNP_DSET])) == len(collected[SNP_DSET])
def test_get_chr_1_second_range_loop_20_upper_pval(self):
    """Query chromosome 1 with both a base-pair and a p-value interval.

    The first batch of 20 must be full and drawn from studies s1/s3; the
    follow-up query must come back empty, leaving 20 unique SNP values.
    """
    # index 25-40 for first non-empty block: 48500000
    # index 40-50 for second non-empty block: 49200000
    bp_interval = IntInterval().set_string_tuple("1200001:49200000")
    # index 20-35
    pval_interval = FloatInterval().set_tuple(0.1, 0.1)

    def fetch(offset):
        # Same query every time; only the start offset moves.
        return self.searcher.search_chromosome(
            chromosome=1, start=offset, size=20,
            bp_interval=bp_interval, pval_interval=pval_interval)

    offset = 0
    batches_seen = 1
    collected = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)

    datasets, index_marker = fetch(offset)
    collected = utils.extend_dsets_with_subset(collected, datasets)

    while len(datasets[REFERENCE_DSET]) > 0:
        assert_studies_in_list(datasets, ['s1', 's3'])
        assert_datasets_have_size(datasets, TO_QUERY_DSETS, 20)

        offset += index_marker
        datasets, index_marker = fetch(offset)
        # The query after a non-empty batch is expected to return nothing.
        assert_datasets_have_size(datasets, TO_QUERY_DSETS, 0)
        collected = utils.extend_dsets_with_subset(collected, datasets)
        batches_seen += 1

    assert batches_seen == 2
    assert len(set(collected[SNP_DSET])) == len(collected[SNP_DSET]) == 20
def test_get_snp_loop_through_filter_lower_pval(self):
    """Fetch SNP rs138808727 in two pages of size 2 under a p-value interval.

    Page one must hold 2 entries from studies s1/s3 and report a marker of 6;
    page two must hold the single remaining entry from study s5.
    """
    pval_interval = FloatInterval().set_tuple(0.01, 0.01)

    def fetch(offset):
        # Same query for both pages; only the start offset differs.
        return self.searcher.search_snp(
            snp='rs138808727', start=offset, size=2, pval_interval=pval_interval)

    offset = 0
    collected = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)

    first_page, index_marker = fetch(offset)
    collected = utils.extend_dsets_with_subset(collected, first_page)
    assert_datasets_have_size(first_page, TO_QUERY_DSETS, 2)
    assert_studies_from_list(first_page, ['s1', 's3'])
    assert index_marker == 6

    offset += index_marker
    second_page, index_marker = fetch(offset)
    assert_datasets_have_size(second_page, TO_QUERY_DSETS, 1)
    assert_studies_from_list(second_page, ['s5'])
    collected = utils.extend_dsets_with_subset(collected, second_page)

    assert len(collected[REFERENCE_DSET]) == 3
def test_create_dictionary_of_empty_dsets(self):
    """The helper must return one empty Dataset per requested name."""
    names = ('dset1', 'dset2')
    datasets = utils.create_dictionary_of_empty_dsets(list(names))

    assert len(datasets) == 2
    for name in names:
        created = datasets[name]
        assert isinstance(created, Dataset)
        assert len(created) == 0
def __init__(self, start, size, config_properties=None):
    """Store the query window and load the search properties.

    :param start: start offset of the query; also kept as starting_point
    :param size: requested number of results (stored as-is)
    :param config_properties: passed to properties_handler.get_properties
    """
    self.starting_point = start
    self.start = start
    self.size = size

    props = properties_handler.get_properties(config_properties)
    self.properties = props
    self.search_path = properties_handler.get_search_path(props)
    self.trait_dir = props.trait_dir

    self.datasets = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)

    # index marker will be returned along with the datasets
    # it is the number that when added to the 'start' value that we started the query with
    # will pinpoint where the next search needs to continue from
    self.index_marker = 0
    self.search_traversed = 0
def test_get_all_loop_through_size_20(self):
    """Page through all associations in batches of 20.

    A batch whose follow-up offset reaches 240 must hold 10 entries; every
    other non-empty batch must hold 20. Collected SNP values must be unique.
    """
    batch_size = 20
    offset = 0
    collected = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)

    while True:
        datasets, index_marker = self.searcher.search_all_assocs(
            start=offset, size=batch_size)
        collected = utils.extend_dsets_with_subset(collected, datasets)
        if len(datasets[REFERENCE_DSET]) <= 0:
            break

        next_offset = offset + index_marker
        expected_size = 10 if next_offset >= 240 else 20
        assert_datasets_have_size(datasets, TO_QUERY_DSETS, expected_size)
        offset = next_offset

    assert len(set(collected[SNP_DSET])) == len(collected[SNP_DSET])
def __init__(self, chromosome, start, size, config_properties=None):
    """Store the query parameters and open the service for one chromosome.

    :param chromosome: chromosome whose HDF5 file is searched
    :param start: start offset of the query (stored as-is)
    :param size: requested number of results (stored as-is)
    :param config_properties: passed to properties_handler.get_properties
    :raises NotFoundError: if the chromosome's file does not exist
    """
    self.chromosome = chromosome
    self.start = start
    self.size = size

    props = properties_handler.get_properties(config_properties)
    self.properties = props
    self.search_path = properties_handler.get_search_path(props)
    self.chr_dir = props.chr_dir

    self.datasets = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)
    self.index_marker = 0

    h5_path = fsutils.create_h5file_path(
        path=self.search_path, dir_name=self.chr_dir, file_name=chromosome)
    self.h5file = h5_path
    # Fail before constructing the service when there is no file to read.
    if not os.path.isfile(h5_path):
        raise NotFoundError("Chromosome " + str(chromosome))

    self.service = chromosome_service.ChromosomeService(h5_path)
def test_get_snp_loop_through_size_1(self):
    """Fetch SNP rs138808727 one entry at a time until a query is empty.

    Every non-empty page must hold exactly 1 entry, and 5 entries must be
    collected in total.
    """
    page_size = 1
    offset = 0
    collected = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)

    while True:
        datasets, index_marker = self.searcher.search_snp(
            snp='rs138808727', start=offset, size=page_size)
        collected = utils.extend_dsets_with_subset(collected, datasets)
        if len(datasets[REFERENCE_DSET]) <= 0:
            break

        assert_datasets_have_size(datasets, TO_QUERY_DSETS, 1)
        offset += index_marker

    assert len(collected[REFERENCE_DSET]) == 5
def test_get_chr_1_second_range_loop_5(self):
    """Page through a base-pair range of chromosome 1 in batches of 5.

    Every non-empty batch must hold 5 entries drawn from studies s1/s3, and
    the collected SNP values must all be unique.
    """
    batch_size = 5
    bp_interval = IntInterval().set_string_tuple("1200001:49200000")
    offset = 0
    # Starts at 1, so it ends one above the number of non-empty batches.
    batches_seen = 1
    collected = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)

    while True:
        datasets, index_marker = self.searcher.search_chromosome(
            chromosome=1, start=offset, size=batch_size, bp_interval=bp_interval)
        collected = utils.extend_dsets_with_subset(collected, datasets)
        if len(datasets[REFERENCE_DSET]) <= 0:
            break

        assert_studies_in_list(datasets, ['s1', 's3'])
        assert_datasets_have_size(datasets, TO_QUERY_DSETS, 5)

        offset += index_marker
        batches_seen += 1

    assert batches_seen == 11
    assert len(set(collected[SNP_DSET])) == len(collected[SNP_DSET])
def __init__(self, trait, start, size, config_properties=None):
    """Store the query parameters and open the study service for one trait.

    :param trait: trait whose HDF5 file is searched
    :param start: start offset of the query (stored as-is)
    :param size: requested number of results (stored as-is)
    :param config_properties: passed to properties_handler.get_properties
    :raises NotFoundError: if the trait's file does not exist
    """
    self.trait = trait
    self.start = start
    self.size = size

    self.properties = properties_handler.get_properties(config_properties)
    self.search_path = properties_handler.get_search_path(self.properties)
    self.trait_dir = self.properties.trait_dir

    self.datasets = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)
    # index marker will be returned along with the datasets
    # it is the number that when added to the 'start' value that we started the query with
    # will pinpoint where the next search needs to continue from
    self.index_marker = 0

    self.h5file = fsutils.create_h5file_path(self.search_path, dir_name=self.trait_dir, file_name=trait)
    if not os.path.isfile(self.h5file):
        # str() keeps a non-string trait from raising TypeError here and
        # masking the intended NotFoundError.
        raise NotFoundError("Trait " + str(trait))
    self.service = study_service.StudyService(self.h5file)
def test_get_dsets_group(self):
    """Collect the datasets of every sub-group of one block of chromosome 2.

    Each sub-group must yield as many datasets as TO_STORE_DSETS has names.
    Across all sub-groups the study dataset must contain 3 distinct values
    and every other dataset exactly 1.
    """
    chr_group_2 = gu.Group(self.f.get("/2"))

    bp_interval = IntInterval().set_tuple(48500000, 48500000)
    block = bk.Block(bp_interval)
    block_groups = block.get_block_groups_from_parent(chr_group_2)
    # Only the first block group produced for the interval is inspected.
    block_group = next(block_groups)
    block_sub_groups = block_group.get_all_subgroups()

    d = du.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)
    for block_sub_group in block_sub_groups:
        datasets = query.get_dsets_from_group(block_sub_group, self.start, self.size)
        assert len(datasets) == len(TO_STORE_DSETS)
        d = du.extend_dsets_with_subset(d, datasets)

    for dset_name, dset in d.items():
        # Compare names by value: identity ('is') only matches when both
        # strings happen to be the very same object.
        if dset_name == STUDY_DSET:
            assert len(set(dset)) == 3
        else:
            assert len(set(dset)) == 1