def _get_dict_of_h5_to_study_groups(self, h5file, hf_study_dict):
    """
    Collect the names of the groups in one HDF5 file that match self.study.

    The file root is wrapped in a gu.Group and its subgroups are expanded
    twice (the local names suggest chromosome -> block -> study levels;
    the exact layout is decided by the gu helpers). Every resulting group
    whose last path component equals self.study has its full name appended
    to hf_study_dict[h5file].

    :param h5file: path of the HDF5 file, opened read-only
    :param hf_study_dict: mapping from file path to a list of group names;
        an entry for h5file must already be available (e.g. a
        defaultdict(list)) -- TODO confirm against the caller
    :return: the same hf_study_dict object, updated in place
    """
    # The context manager guarantees the file handle is released even when
    # the traversal raises part-way through.
    with h5py.File(h5file, 'r') as h5_handle:
        file_group = gu.Group(h5_handle)
        chr_groups = file_group.get_all_subgroups()
        block_groups = gu.generate_subgroups_from_generator_of_subgroups(
            chr_groups)
        study_groups = gu.generate_subgroups_from_generator_of_subgroups(
            block_groups)
        for study_group in study_groups:
            group_name = study_group.get_name()
            # Only the last path component is compared with the study name.
            if self.study == group_name.split("/")[-1]:
                hf_study_dict[h5file].append(group_name)
    return hf_study_dict
示例#2
0
    def query(self, chromosome, start, size, study=None):
        """
        Load the datasets found beneath a chromosome group into self.datasets.

        :param chromosome: key of the chromosome subgroup of self.file_group
        :param start: forwarded unchanged to query.load_datasets_from_groups
        :param size: forwarded unchanged to query.load_datasets_from_groups
        :param study: optional study name; stored on self.study and forwarded
            to the loader. When given, the file is visited with
            self.check_study_is_group and the query is aborted if that visit
            yields a falsy result.
        :raises NotFoundError: if a study was requested but the visit did not
            find it
        """
        logger.debug("Starting query for chromosome %s, start %s, and size %s",
                     str(chromosome), str(start), str(size))
        chr_group = self.file_group.get_subgroup(chromosome)

        # check_study_is_group is used as the visit callback below; it is
        # presumably what reads self.study, so set it first.
        self.study = study

        logger.debug("got chr group")
        # Guard clause: nothing is loaded when the requested study is absent.
        if study and not self.file.visit(self.check_study_is_group):
            raise NotFoundError("Study " + str(self.study))

        logger.debug("getting subs")
        all_chr_sub_groups = chr_group.get_all_subgroups()

        # we need to get all the study level subgroups from the bp range subgroups
        logger.debug("getting sub-subs")
        all_subgroups = gu.generate_subgroups_from_generator_of_subgroups(
            all_chr_sub_groups)

        logger.debug("getting datasets")
        self.datasets = query.load_datasets_from_groups(
            all_subgroups, start, size, study)
        logger.debug(
            "Query for chromosome %s, start %s, and size %s done...",
            str(chromosome), str(start), str(size))
    def test_get_dsets_from_plethora_of_blocks(self):
        """
        Load datasets from the block groups of group "/2" for two bp ranges.

        A wide range is expected to give six values per queried dataset; a
        range whose floor equals its ceiling is expected to give none.
        """
        chromosome_two = gu.Group(self.f.get("/2"))

        # Wide range: expand the block groups one more level before loading.
        wide_range = IntInterval().set_tuple(48500000, 49200000)
        wide_block = bk.Block(wide_range)
        wide_block_groups = wide_block.get_block_groups_from_parent(
            chromosome_two)
        expanded_groups = gu.generate_subgroups_from_generator_of_subgroups(
            wide_block_groups)

        loaded = query.load_datasets_from_groups(
            expanded_groups, self.start, self.size)
        assert loaded.__class__ is dict

        # 2 values for each of 3 studies that we loaded
        for name in TO_QUERY_DSETS:
            assert len(loaded[name]) == 6

        # Single-position range: the block groups are handed to the loader
        # directly, without the extra expansion step.
        point_range = IntInterval().set_tuple(48600000, 48600000)
        point_block = bk.Block(point_range)
        point_block_groups = point_block.get_block_groups_from_parent(
            chromosome_two)

        loaded = query.load_datasets_from_groups(
            point_block_groups, self.start, self.size)

        # no SNP bp falls into this group
        for name in TO_QUERY_DSETS:
            assert len(loaded[name]) == 0
示例#4
0
 def get_chromosome_size(self, chromosome):
     """
     Return the "size" attribute stored on a chromosome group.

     :param chromosome: key of the chromosome subgroup of self.file_group
     :return: the value returned by the group's get_attribute("size")
     """
     chromosome_group = self.file_group.get_subgroup(chromosome)
     # The size is read from the group attribute rather than summed over the
     # subgroups, so no subgroup traversal is needed here.
     size = chromosome_group.get_attribute("size")
     logger.debug("Chromosome %s group size is %s", str(chromosome),
                  str(size))
     return size
 def _get_dict_of_h5_to_study_groups(self, h5file, hf_study_dict):
     """
     Collect the groups in one HDF5 file that hold data for self.study.

     The file root is wrapped in a gu.Group and its subgroups are expanded
     one level (the local names suggest snp -> study levels; the exact
     layout is decided by the gu helpers). For every resulting group whose
     last path component equals self.study, one name is appended to
     hf_study_dict[h5file]: the parent group's name when the parent has
     exactly one subgroup key, otherwise the study group's own name.

     :param h5file: path of the HDF5 file, opened read-only
     :param hf_study_dict: mapping from file path to a list of group names;
         an entry for h5file must already be available (e.g. a
         defaultdict(list)) -- TODO confirm against the caller
     :return: the same hf_study_dict object, updated in place
     """
     # The context manager guarantees the file handle is released even when
     # the traversal raises part-way through.
     with h5py.File(h5file, 'r') as h5_handle:
         file_group = gu.Group(h5_handle)
         snp_groups = file_group.get_all_subgroups()
         study_groups = gu.generate_subgroups_from_generator_of_subgroups(
             snp_groups)
         for study_group in study_groups:
             if self.study == study_group.get_name().split("/")[-1]:
                 snp_group = study_group.get_parent()
                 # A parent with a single subgroup key is recorded as a
                 # whole; otherwise only the matching study group is.
                 if len(snp_group.get_all_subgroups_keys()) == 1:
                     hf_study_dict[h5file].append(snp_group.get_name())
                 else:
                     hf_study_dict[h5file].append(study_group.get_name())
     return hf_study_dict
示例#6
0
    def query(self, chromosome, bp_interval, start, size):
        """
        Load the datasets for a bp interval of a chromosome into self.datasets.

        The block groups covering the interval are fetched from the
        chromosome group, expanded one level and loaded. A mask built from
        the BP dataset is then applied when the loaded range needs trimming.

        :param chromosome: key of the chromosome subgroup of self.file_group
        :param bp_interval: interval object exposing floor() and ceil()
        :param start: forwarded unchanged to query.load_datasets_from_groups
        :param size: forwarded unchanged to query.load_datasets_from_groups
        """
        lower_bp = bp_interval.floor()
        upper_bp = bp_interval.ceil()
        logger.debug(
            "Starting query for chromosome: %s, bp floor: %s, bp ceil: %s start: %s, size: %s",
            str(chromosome), str(lower_bp), str(upper_bp), str(start),
            str(size))

        chromosome_group = self.file_group.get_subgroup(chromosome)
        block = bk.Block(bp_interval)
        logger.debug("Block interval floor and ceiling: %s, %s",
                     str(block.floor_block), str(block.ceil_block))

        # for block size 100, if I say I want BP range 250 - 350 that means
        # I need to search for block 300 (200-300) and block 400 (300-400)
        covering_groups = block.get_block_groups_from_parent(chromosome_group)

        # we might need to filter further if they don't fit exactly
        # e.g. we got the snps for range 200-400 now we need to filter 250-350
        mask_floor = lower_bp if block.floor_block != lower_bp else None
        mask_ceil = upper_bp if block.ceil_block != upper_bp else None
        # A single-position interval that coincides with the block bounds
        # still gets both filter bounds set.
        if mask_ceil is None and mask_floor is None and lower_bp == upper_bp:
            mask_floor, mask_ceil = lower_bp, upper_bp

        expanded_groups = gu.generate_subgroups_from_generator_of_subgroups(
            covering_groups)

        loaded = query.load_datasets_from_groups(expanded_groups, start, size)
        bp_mask = loaded[BP_DSET].interval_mask(mask_floor, mask_ceil)

        logger.debug("BP mask is: %s", str(bp_mask))

        # A None mask means the loaded datasets are kept as they are.
        if bp_mask is not None:
            logger.debug("Applying bp mask, size before filter: %s",
                         str(len(loaded[REFERENCE_DSET])))
            loaded = utils.filter_dictionary_by_mask(loaded, bp_mask)
            logger.debug("Applying bp mask, size after filter: %s",
                         str(len(loaded[REFERENCE_DSET])))

        logger.debug(
            "Starting query for chromosome: %s, bp floor: %s, bp ceil: %s start: %s, size: %s done...",
            str(chromosome), str(lower_bp), str(upper_bp), str(start),
            str(size))
        self.datasets = loaded
示例#7
0
 def get_block_range_size(self, chromosome, bp_interval):
     """
     Add up the maximum group sizes found beneath the blocks of a bp interval.

     A Block is built from the interval and asked for the block groups it
     covers under the chromosome group; those groups are expanded one level
     and the get_max_group_size() of every resulting group is summed.
     :param chromosome: The chromosome number we are interested in
     :param bp_interval: the bp_interval we are interested in (needs to be an IntInterval)
     :return: the total group size of the block range given
     """
     parent_group = self.file_group.get_subgroup(chromosome)
     covering_groups = bk.Block(bp_interval).get_block_groups_from_parent(
         parent_group)

     total = 0
     for group in gu.generate_subgroups_from_generator_of_subgroups(
             covering_groups):
         total = total + group.get_max_group_size()

     logger.debug("Size of block group in range: %s, %s is %s",
                  str(bp_interval.floor()), str(bp_interval.ceil()), total)
     return total
 def get_study_groups(self):
     """
     Return the groups one level below the direct subgroups of self.file_group.

     :return: whatever gu.generate_subgroups_from_generator_of_subgroups
         produces for the file group's subgroups
     """
     return gu.generate_subgroups_from_generator_of_subgroups(
         self.file_group.get_all_subgroups())