def _get_dict_of_h5_to_study_groups(self, h5file, hf_study_dict):
    """
    Record, for one HDF5 file, the names of the study groups that match self.study.

    The file is walked three levels deep (named chromosome -> block -> study
    by the local variables). Every third-level group whose last path
    component equals ``self.study`` has its full name appended to
    ``hf_study_dict[h5file]``.

    :param h5file: path of the HDF5 file to inspect; also used as the dictionary key
    :param hf_study_dict: mapping of file path to a list of group names; it is
        mutated in place. The entry for ``h5file`` must already exist or be
        created on access (e.g. a ``defaultdict(list)``) -- presumably the
        caller guarantees this, verify against caller.
    :return: the same ``hf_study_dict`` object that was passed in
    """
    # The context manager guarantees the handle is released even if walking
    # the groups or appending to the dictionary raises.
    with h5py.File(h5file, 'r') as h5_handle:
        file_group = gu.Group(h5_handle)
        chr_groups = file_group.get_all_subgroups()
        block_groups = gu.generate_subgroups_from_generator_of_subgroups(
            chr_groups)
        study_groups = gu.generate_subgroups_from_generator_of_subgroups(
            block_groups)
        for study_group in study_groups:
            group_name = study_group.get_name()
            # The study identifier is the last component of the group path.
            if self.study == group_name.split("/")[-1]:
                hf_study_dict[h5file].append(group_name)
    return hf_study_dict
def query(self, chromosome, start, size, study=None):
    """
    Load the datasets of one chromosome into ``self.datasets``.

    :param chromosome: name of the chromosome subgroup to query
    :param start: forwarded unchanged to ``query.load_datasets_from_groups``
    :param size: forwarded unchanged to ``query.load_datasets_from_groups``
    :param study: optional study identifier; when given, the file is first
        visited with ``self.check_study_is_group`` and the query is aborted
        if that visit yields a falsy result
    :raises NotFoundError: if ``study`` is given and the visit does not find it
    """
    logger.debug("starting query")
    logger.debug("Starting query for chromosome %s, start %s, and size %s",
                 chromosome, start, size)
    chr_group = self.file_group.get_subgroup(chromosome)
    # self.study is assigned before the visit; presumably
    # check_study_is_group reads it -- verify against that callback.
    self.study = study
    logger.debug("got chr group")

    if study and not self.file.visit(self.check_study_is_group):
        raise NotFoundError("Study " + str(self.study))

    logger.debug("getting subs")
    all_chr_sub_groups = chr_group.get_all_subgroups()
    # we need to get all the study level subgroups from the bp range subgroups
    logger.debug("getting sub-subs")
    all_subgroups = gu.generate_subgroups_from_generator_of_subgroups(
        all_chr_sub_groups)
    logger.debug("getting datasets")
    self.datasets = query.load_datasets_from_groups(
        all_subgroups, start, size, study)
    logger.debug("Query for chromosome %s, start %s, and size %s done...",
                 chromosome, start, size)
def test_get_dsets_from_plethora_of_blocks(self):
    """
    Load datasets for two bp intervals on chromosome 2 and check their sizes.

    A wide interval is expected to yield six values per queried dataset,
    while a single-position interval is expected to yield none.
    """
    chromosome_2 = gu.Group(self.f.get("/2"))

    def block_groups_for(floor, ceil):
        # Block groups under chromosome 2 that cover the interval [floor, ceil].
        interval = IntInterval().set_tuple(floor, ceil)
        return bk.Block(interval).get_block_groups_from_parent(chromosome_2)

    def load(groups):
        return query.load_datasets_from_groups(groups, self.start, self.size)

    wide_block_groups = block_groups_for(48500000, 49200000)
    wide_result = load(
        gu.generate_subgroups_from_generator_of_subgroups(wide_block_groups))

    assert wide_result.__class__ is dict
    for name in TO_QUERY_DSETS:
        # 2 values for each of 3 studies that we loaded
        assert len(wide_result[name]) == 6

    # NOTE(review): here the block groups themselves are passed to the
    # loader, not their study-level subgroups as above -- confirm intended.
    point_result = load(block_groups_for(48600000, 48600000))
    for name in TO_QUERY_DSETS:
        # no SNP bp falls into this group
        assert len(point_result[name]) == 0
def get_chromosome_size(self, chromosome):
    """
    Return the value of the "size" attribute stored on a chromosome group.

    :param chromosome: name of the chromosome subgroup to look up
    :return: whatever the chromosome group holds in its "size" attribute
    """
    chromosome_group = self.file_group.get_subgroup(chromosome)
    size = chromosome_group.get_attribute("size")
    logger.debug("Chromosome %s group size is %s", chromosome, size)
    return size
def _get_dict_of_h5_to_study_groups(self, h5file, hf_study_dict):
    """
    Record, for one HDF5 file, the groups that belong to self.study.

    The file is walked two levels deep (named snp -> study by the local
    variables). For every second-level group whose last path component
    equals ``self.study``, one name is appended to ``hf_study_dict[h5file]``:
    the parent group's name when that parent has exactly one subgroup key,
    otherwise the study group's own name.

    :param h5file: path of the HDF5 file to inspect; also used as the dictionary key
    :param hf_study_dict: mapping of file path to a list of group names; it is
        mutated in place. The entry for ``h5file`` must already exist or be
        created on access (e.g. a ``defaultdict(list)``) -- presumably the
        caller guarantees this, verify against caller.
    :return: the same ``hf_study_dict`` object that was passed in
    """
    # The context manager guarantees the handle is released even if walking
    # the groups or appending to the dictionary raises.
    with h5py.File(h5file, 'r') as h5_handle:
        file_group = gu.Group(h5_handle)
        snp_groups = file_group.get_all_subgroups()
        study_groups = gu.generate_subgroups_from_generator_of_subgroups(
            snp_groups)
        for study_group in study_groups:
            # The study identifier is the last component of the group path.
            if self.study != study_group.get_name().split("/")[-1]:
                continue
            snp_group = study_group.get_parent()
            if len(snp_group.get_all_subgroups_keys()) == 1:
                # This study is the parent's only subgroup, so the parent
                # itself is recorded.
                hf_study_dict[h5file].append(snp_group.get_name())
            else:
                hf_study_dict[h5file].append(study_group.get_name())
    return hf_study_dict
def query(self, chromosome, bp_interval, start, size):
    """
    Load the datasets of a chromosome restricted to a base-pair interval.

    The block groups covering ``bp_interval`` are loaded, then masked down
    to the requested bounds wherever those bounds do not coincide with the
    block boundaries. The result is stored in ``self.datasets``.

    :param chromosome: name of the chromosome subgroup to query
    :param bp_interval: interval object exposing ``floor()`` and ``ceil()``
    :param start: forwarded unchanged to ``query.load_datasets_from_groups``
    :param size: forwarded unchanged to ``query.load_datasets_from_groups``
    """
    lower = bp_interval.floor()
    upper = bp_interval.ceil()
    logger.debug(
        "Starting query for chromosome: %s, bp floor: %s, bp ceil: %s start: %s, size: %s",
        chromosome, lower, upper, start, size)

    chr_group = self.file_group.get_subgroup(chromosome)
    block = bk.Block(bp_interval)
    logger.debug("Block interval floor and ceiling: %s, %s",
                 block.floor_block, block.ceil_block)

    # With a block size of 100, asking for bp range 250 - 350 means reading
    # block 300 (200-300) and block 400 (300-400).
    block_groups = block.get_block_groups_from_parent(chr_group)

    # The blocks may cover more than was asked for (200-400 in the example
    # above), so a bound that is not on a block edge becomes a filter limit.
    filter_block_floor = lower if block.floor_block != lower else None
    filter_block_ceil = upper if block.ceil_block != upper else None

    # A single-position interval sitting exactly on the block edges is
    # still filtered on both sides.
    no_filter_chosen = filter_block_floor is None and filter_block_ceil is None
    if no_filter_chosen and lower == upper:
        filter_block_floor = lower
        filter_block_ceil = upper

    study_level_groups = gu.generate_subgroups_from_generator_of_subgroups(
        block_groups)
    loaded = query.load_datasets_from_groups(study_level_groups, start, size)

    bp_mask = loaded[BP_DSET].interval_mask(filter_block_floor,
                                            filter_block_ceil)
    logger.debug("BP mask is: %s", bp_mask)

    if bp_mask is not None:
        logger.debug("Applying bp mask, size before filter: %s",
                     len(loaded[REFERENCE_DSET]))
        loaded = utils.filter_dictionary_by_mask(loaded, bp_mask)
        logger.debug("Applying bp mask, size after filter: %s",
                     len(loaded[REFERENCE_DSET]))

    logger.debug(
        "Starting query for chromosome: %s, bp floor: %s, bp ceil: %s start: %s, size: %s done...",
        chromosome, lower, upper, start, size)
    self.datasets = loaded
def get_block_range_size(self, chromosome, bp_interval):
    """
    Total the maximum group sizes of all groups below the blocks of an interval.

    A Block is built from the bp interval and asked for the block groups
    under the chromosome group; the subgroups of those block groups are
    then visited and their ``get_max_group_size()`` values added up.

    :param chromosome: the chromosome number we are interested in
    :param bp_interval: the bp interval we are interested in (needs to be an IntInterval)
    :return: the total group size of the block range given
    """
    chr_group = self.file_group.get_subgroup(chromosome)
    block_groups = bk.Block(bp_interval).get_block_groups_from_parent(
        chr_group)

    total = 0
    for bp_group in gu.generate_subgroups_from_generator_of_subgroups(
            block_groups):
        total += bp_group.get_max_group_size()

    logger.debug("Size of block group in range: %s, %s is %s",
                 str(bp_interval.floor()), str(bp_interval.ceil()), total)
    return total
def get_study_groups(self):
    """
    Return the second-level groups of the file.

    The direct subgroups of ``self.file_group`` (named trait groups here)
    are expanded into their own subgroups (named study groups).

    :return: whatever ``gu.generate_subgroups_from_generator_of_subgroups``
        produces for the top-level subgroups
    """
    top_level_groups = self.file_group.get_all_subgroups()
    return gu.generate_subgroups_from_generator_of_subgroups(top_level_groups)