def test_get_chromosome_1_loop_through_size_20_s3(self):
    """Page through chromosome 1 restricted to study s3, 20 rows at a time."""
    start, size = 0, 20
    iteration = 1
    gathered = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)
    datasets, index_marker = self.searcher.search_chromosome(
        chromosome=1, start=start, size=size, study='s3')
    gathered = utils.extend_dsets_with_subset(gathered, datasets)
    while len(datasets[REFERENCE_DSET]) > 0:
        assert_studies_from_list(datasets, ['s3'])
        # first two pages are full; the third holds the 10-row remainder
        expected = 20 if iteration <= 2 else 10
        assert_datasets_have_size(datasets, TO_QUERY_DSETS, expected)
        start += index_marker
        datasets, index_marker = self.searcher.search_chromosome(
            chromosome=1, start=start, size=size, study='s3')
        gathered = utils.extend_dsets_with_subset(gathered, datasets)
        iteration += 1
    assert iteration == 4
    assert index_marker == 0
    # 50 unique variants
    assert len(set(gathered[SNP_DSET])) == len(gathered[SNP_DSET]) == 50
def test_get_chromosome_1_loop_through_size_20_lower_pval(self):
    """Page through chromosome 1 under a low p-value interval, 20 rows at a time."""
    start, size = 0, 20
    pval_interval = FloatInterval().set_tuple(0.00001, 0.00001)
    iteration = 1
    gathered = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)
    datasets, index_marker = self.searcher.search_chromosome(
        chromosome=1, start=start, size=size, pval_interval=pval_interval)
    gathered = utils.extend_dsets_with_subset(gathered, datasets)
    while len(datasets[REFERENCE_DSET]) > 0:
        assert_studies_in_list(datasets, ['s1', 's3'])
        assert_datasets_have_size(datasets, TO_QUERY_DSETS, 20)
        start += index_marker
        datasets, index_marker = self.searcher.search_chromosome(
            chromosome=1, start=start, size=size, pval_interval=pval_interval)
        gathered = utils.extend_dsets_with_subset(gathered, datasets)
        iteration += 1
    # start changes on each loop!
    assert iteration == 5
    assert index_marker == 0
    # 80 unique variants
    assert len(set(gathered[SNP_DSET])) == len(gathered[SNP_DSET]) == 80
def test_get_chr_1_second_range_loop_20_upper_pval(self):
    """One full page of 20 comes back for this bp range + upper p-value filter;
    the follow-up query inside the loop must return an empty page, so the loop
    runs exactly once."""
    start = 0
    size = 20
    # index 25-40 for first non-empty block: 48500000
    # index 40-50 for second non-empty block: 49200000
    bp_interval = IntInterval().set_string_tuple("1200001:49200000")
    # index 20-35
    pval_interval = FloatInterval().set_tuple(0.1, 0.1)
    looped_through = 1
    d = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)
    datasets, index_marker = self.searcher.search_chromosome(chromosome=1, start=start, size=size,
                                                             bp_interval=bp_interval,
                                                             pval_interval=pval_interval)
    d = utils.extend_dsets_with_subset(d, datasets)
    while len(datasets[REFERENCE_DSET]) > 0:
        assert_studies_in_list(datasets, ['s1','s3'])
        assert_datasets_have_size(datasets, TO_QUERY_DSETS, 20)
        start = start + index_marker
        datasets, index_marker = self.searcher.search_chromosome(chromosome=1, start=start, size=size,
                                                                 bp_interval=bp_interval,
                                                                 pval_interval=pval_interval)
        # the second query is expected to come back empty (all matching rows
        # were delivered in the first page), which also terminates the loop
        assert_datasets_have_size(datasets, TO_QUERY_DSETS, 0)
        d = utils.extend_dsets_with_subset(d, datasets)
        looped_through += 1
    assert looped_through == 2
    assert len(set(d[SNP_DSET])) == len(d[SNP_DSET]) == 20
def test_get_snp_loop_through_filter_lower_pval(self):
    """Query one SNP in pages of 2 under a p-value restriction; 3 rows survive."""
    start, size = 0, 2
    pval_interval = FloatInterval().set_tuple(0.01, 0.01)
    gathered = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)
    datasets, index_marker = self.searcher.search_snp(
        snp='rs138808727', start=start, size=size, pval_interval=pval_interval)
    gathered = utils.extend_dsets_with_subset(gathered, datasets)
    assert_datasets_have_size(datasets, TO_QUERY_DSETS, 2)
    assert_studies_from_list(datasets, ['s1', 's3'])
    assert index_marker == 6
    start += index_marker
    datasets, index_marker = self.searcher.search_snp(
        snp='rs138808727', start=start, size=size, pval_interval=pval_interval)
    assert_datasets_have_size(datasets, TO_QUERY_DSETS, 1)
    assert_studies_from_list(datasets, ['s5'])
    gathered = utils.extend_dsets_with_subset(gathered, datasets)
    assert len(gathered[REFERENCE_DSET]) == 3
def test_loop_through_w_restrinction_and_always_get_size_20_results(self):
    """Traverse all associations under a p-value restriction; every page is full."""
    start, size = 0, 20
    iteration = 1
    # s2 and s3 p-value limits
    pval_interval = FloatInterval().set_tuple(0.0002, 0.06)
    gathered = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)
    while True:
        print("start", start)
        datasets, next_index = self.searcher.search_all_assocs(
            start=start, size=size, pval_interval=pval_interval)
        print("next index", next_index)
        print(datasets[STUDY_DSET])
        gathered = utils.extend_dsets_with_subset(gathered, datasets)
        if len(datasets[REFERENCE_DSET]) <= 0:
            break
        if iteration <= 2:
            assert_studies_from_list(datasets, ['s2'])
            assert_datasets_have_size(datasets, TO_QUERY_DSETS, 20)
        elif iteration == 3:
            # this page straddles the s2/s3 boundary
            assert_studies_from_list(datasets, ['s2', 's3'])
            assert_number_of_times_study_is_in_datasets(datasets, 's2', 10)
            assert_number_of_times_study_is_in_datasets(datasets, 's3', 10)
        else:
            assert_studies_from_list(datasets, ['s3'])
            assert_datasets_have_size(datasets, TO_QUERY_DSETS, 20)
        iteration += 1
        start += next_index
    assert len(set(gathered[SNP_DSET])) == len(gathered[SNP_DSET]) == 100
def test_loop_through_t2_size_42(self):
    """Page through trait t2 in pages of 42, crossing the s3/s4 boundary."""
    start, size = 0, 42
    iteration = 1
    gathered = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)
    while True:
        datasets, index_marker = self.searcher.search_trait(trait='t2', start=start, size=size)
        gathered = utils.extend_dsets_with_subset(gathered, datasets)
        if len(datasets[REFERENCE_DSET]) <= 0:
            break
        if iteration <= 1:
            # first page is all s3
            assert_number_of_times_study_is_in_datasets(datasets, 's3', 42)
            assert_studies_from_list(datasets, ['s3'])
        elif iteration == 2:
            # second page straddles the s3/s4 boundary
            assert_number_of_times_study_is_in_datasets(datasets, 's3', 8)
            assert_number_of_times_study_is_in_datasets(datasets, 's4', 34)
            assert_studies_from_list(datasets, ['s3', 's4'])
        else:
            # trailing page: remainder of s4
            assert_number_of_times_study_is_in_datasets(datasets, 's4', 16)
            assert_studies_from_list(datasets, ['s4'])
        iteration += 1
        start += index_marker
    assert iteration == 4
    assert len(set(gathered[SNP_DSET])) == len(gathered[SNP_DSET])
def test_loop_through_t2_size_5_w_restriction_to_s4(self):
    """Page through trait t2 in pages of 5; the restriction skips straight to s4."""
    start, size = 0, 5
    iteration = 1
    pval_interval = FloatInterval().set_tuple(0.06, 0.3)
    gathered = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)
    while True:
        datasets, index_marker = self.searcher.search_trait(
            trait='t2', start=start, size=size, pval_interval=pval_interval)
        gathered = utils.extend_dsets_with_subset(gathered, datasets)
        if len(datasets[REFERENCE_DSET]) <= 0:
            break
        # already on the first loop I want to have reached s4
        if iteration == 1:
            # all of s3 + the first 5 elements of s4
            assert index_marker == 55
            assert_number_of_times_study_is_in_datasets(datasets, 's4', 5)
            assert_studies_from_list(datasets, ['s4'])
        iteration += 1
        start += index_marker
    assert iteration == 11
    assert len(set(gathered[SNP_DSET])) == len(gathered[SNP_DSET])
def test_get_all_loop_through_size_20(self):
    """Traverse every association in pages of 20; the final page holds 10 rows."""
    start, size = 0, 20
    gathered = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)
    datasets, index_marker = self.searcher.search_all_assocs(start=start, size=size)
    gathered = utils.extend_dsets_with_subset(gathered, datasets)
    while len(datasets[REFERENCE_DSET]) > 0:
        # 250 rows total, so the page that reaches index 240+ is the 10-row tail
        expected = 10 if start + index_marker >= 240 else 20
        assert_datasets_have_size(datasets, TO_QUERY_DSETS, expected)
        start += index_marker
        datasets, index_marker = self.searcher.search_all_assocs(start=start, size=size)
        gathered = utils.extend_dsets_with_subset(gathered, datasets)
    assert len(set(gathered[SNP_DSET])) == len(gathered[SNP_DSET])
def test_get_snp_loop_through_size_1(self):
    """Page through one SNP's associations a single row at a time."""
    start, size = 0, 1
    gathered = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)
    datasets, index_marker = self.searcher.search_snp(snp='rs138808727', start=start, size=size)
    gathered = utils.extend_dsets_with_subset(gathered, datasets)
    while len(datasets[REFERENCE_DSET]) > 0:
        assert_datasets_have_size(datasets, TO_QUERY_DSETS, 1)
        start += index_marker
        datasets, index_marker = self.searcher.search_snp(
            snp='rs138808727', start=start, size=size)
        gathered = utils.extend_dsets_with_subset(gathered, datasets)
    # the SNP has 5 associations in total
    assert len(gathered[REFERENCE_DSET]) == 5
def general_search(search_obj, max_size, arguments, restriction_dictionary=None):
    """
    Traverse a dataset group in chunks, filtering each chunk, until the group is
    exhausted or enough elements have been gathered.

    :param search_obj: an object that has a 'query' method and that will perform the actual query
    :param max_size: the max size of the datasets that we are traversing/querying
    :param arguments: the arguments to be passed to the query
    :param restriction_dictionary: a dictionary of restriction objects (see sumstats.utils.restrictions)
    that will be applied to the datasets returned by the query
    :return: a tuple (datasets, index_marker) where 'datasets' is a dictionary with the names of the
    datasets and the data to be returned (the result of the query after applying restrictions) and
    index_marker is an integer indicating up to where the query went in the dataset so that the next
    query can calculate its next start based on the index_marker. The index marker is needed as we
    are applying filtering (restrictions) to the data and the start/end size used in a query might
    not be the real indicators of up-till where we have been in the dataset.
    """
    iteration_size = search_obj.size
    search_id = str(search_obj.__class__.__name__) + str(arguments) + str(restriction_dictionary)
    # Bug fix: the default None was unpacked below as **restriction_dictionary,
    # which raises TypeError. Normalize to an empty dict (after building the
    # search_id so the logged id is unchanged for callers that pass None).
    if restriction_dictionary is None:
        restriction_dictionary = {}
    logger.info("Searching with search id %s starting...", search_id)
    logger.debug("Search %s - max size is %s", search_id, max_size)
    while True:
        logger.debug("Search %s - loop with start %s and size %s", search_id,
                     str(search_obj.start), str(iteration_size))
        arguments['size'] = iteration_size
        arguments['start'] = search_obj.start
        # call the query function
        search_obj.service.query(**arguments)
        result_before_filtering = search_obj.service.get_result()
        logger.debug("Search %s - result size before filtering is %s...", search_id,
                     str(len(result_before_filtering[REFERENCE_DSET])))
        if _traversed(start=search_obj.start, result=result_before_filtering, max_size=max_size):
            logger.debug("Search %s - traverse of group complete...", search_id)
            break
        # Advance the index marker on the UNfiltered result so the next start
        # reflects how far we actually got in the underlying dataset.
        search_obj.index_marker = _increase_search_index(index_marker=search_obj.index_marker,
                                                         start=search_obj.start,
                                                         iteration_size=iteration_size,
                                                         max_size=max_size,
                                                         result=result_before_filtering)
        # after search index is increased, we can apply restrictions
        search_obj.service.apply_restrictions(**restriction_dictionary)
        result_after_filtering = search_obj.service.get_result()
        logger.debug("Search %s - result size after filtering is %s...", search_id,
                     str(len(result_after_filtering[REFERENCE_DSET])))
        search_obj.datasets = dataset_utils.extend_dsets_with_subset(search_obj.datasets,
                                                                     result_after_filtering)
        search_obj.start = search_obj.start + iteration_size
        iteration_size = _next_iteration_size(size=search_obj.size, datasets=search_obj.datasets)
        if _search_complete(size=search_obj.size, datasets=search_obj.datasets):
            logger.debug("Search %s - search complete, gathered plethora of elements needed...", search_id)
            break
    logger.debug("Search %s - search completed. Returning index marker %s", search_id,
                 str(search_obj.index_marker))
    search_obj.service.close_file()
    return search_obj.datasets, search_obj.index_marker
def test_get_chr_1_second_range_loop_5(self):
    """Page through a base-pair range on chromosome 1 in pages of 5."""
    start, size = 0, 5
    bp_interval = IntInterval().set_string_tuple("1200001:49200000")
    iteration = 1
    gathered = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)
    datasets, index_marker = self.searcher.search_chromosome(
        chromosome=1, start=start, size=size, bp_interval=bp_interval)
    gathered = utils.extend_dsets_with_subset(gathered, datasets)
    while len(datasets[REFERENCE_DSET]) > 0:
        assert_studies_in_list(datasets, ['s1', 's3'])
        assert_datasets_have_size(datasets, TO_QUERY_DSETS, 5)
        start += index_marker
        datasets, index_marker = self.searcher.search_chromosome(
            chromosome=1, start=start, size=size, bp_interval=bp_interval)
        gathered = utils.extend_dsets_with_subset(gathered, datasets)
        iteration += 1
    assert iteration == 11
    assert len(set(gathered[SNP_DSET])) == len(gathered[SNP_DSET])
def test_get_dsets_group(self):
    """Collect datasets from each sub-group of the first matching block group."""
    chr_group_2 = gu.Group(self.f.get("/2"))
    bp_interval = IntInterval().set_tuple(48500000, 48500000)
    block_group = next(bk.Block(bp_interval).get_block_groups_from_parent(chr_group_2))
    gathered = du.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)
    for sub_group in block_group.get_all_subgroups():
        datasets = query.get_dsets_from_group(sub_group, self.start, self.size)
        assert len(datasets) == len(TO_STORE_DSETS)
        gathered = du.extend_dsets_with_subset(gathered, datasets)
    for dset_name, dset in gathered.items():
        # NOTE: original compares with `is` (identity), preserved as-is
        if dset_name is STUDY_DSET:
            # three distinct studies contribute to the block
            assert len(set(dset)) == 3
        else:
            # every other dataset carries a single repeated value
            assert len(set(dset)) == 1
def _extend_datasets(self, result):
    """Fold *result* into the accumulated datasets dictionary."""
    merged = utils.extend_dsets_with_subset(self.datasets, result)
    self.datasets = merged