def __getitem__(self, idx):
    """Read genotype probabilities for the variants/samples selected by ``idx``.

    Parameters
    ----------
    idx : tuple of slices
        One indexer per dimension (length must equal ``self.ndim``). Only
        the first two (variants, samples) are applied here.

    Returns
    -------
    np.ndarray
        Stacked probabilities for the selected variants, sliced on the
        sample dimension by ``idx[1]``.

    Raises
    ------
    IndexError
        If ``idx`` is not a tuple or has the wrong number of items.
    """
    if not isinstance(idx, tuple):
        raise IndexError(f'Indexer must be tuple (received {type(idx)})')
    if len(idx) != self.ndim:
        # Report the actual expected arity from self.ndim instead of a
        # hard-coded "two-item tuple", which disagreed with the check.
        raise IndexError(
            f'Indexer must have {self.ndim} items (received {len(idx)} slices)')
    if idx[0].start == idx[0].stop:
        # Empty variant selection -> empty result, skip all file I/O.
        return np.empty((0, 0), dtype=self.dtype)
    # Map the variant slice onto metafile partitions: which partitions to
    # read and the offsets within the first and last of them.
    start_partition = idx[0].start // self.partition_size
    start_partition_offset = idx[0].start % self.partition_size
    end_partition = (idx[0].stop - 1) // self.partition_size
    end_partition_offset = (idx[0].stop - 1) % self.partition_size
    # Collect the file offsets (vaddr) at which each selected variant's
    # genotype record begins.
    all_vaddr = []
    with bgen_metafile(self.metafile_filepath) as mf:
        for i in range(start_partition, end_partition + 1):
            partition = mf.read_partition(i)
            start_offset = start_partition_offset if i == start_partition else 0
            end_offset = (
                end_partition_offset + 1
                if i == end_partition
                else self.partition_size
            )
            vaddr = partition["vaddr"].tolist()
            all_vaddr.extend(vaddr[start_offset:end_offset])
    # Read each variant's probabilities and stack into a single array.
    with bgen_file(self.path) as bgen:
        genotypes = [bgen.read_genotype(vaddr) for vaddr in all_vaddr]
        probs = [genotype["probs"] for genotype in genotypes]
        return np.stack(probs)[:, idx[1]]
def __init__(self, path, dtype=np.float32):
    """Open a BGEN file and eagerly load its variant and sample metadata.

    Parameters
    ----------
    path : str or Path
        Location of the ``.bgen`` file. A metafile (index) is created
        next to it on first use and reused afterwards.
    dtype : numpy dtype, optional
        Element type for genotype probabilities (default ``np.float32``).
    """
    self.path = Path(path)
    # self.path is already a Path; no need to re-wrap it.
    self.metafile_filepath = _infer_metafile_filepath(self.path)
    if not self.metafile_filepath.exists():
        # Build the index/metadata file once; subsequent opens reuse it.
        create_metafile(path, self.metafile_filepath, verbose=False)
    with bgen_metafile(self.metafile_filepath) as mf:
        self.n_variants = mf.nvariants
        self.npartitions = mf.npartitions
        self.partition_size = mf.partition_size
        # This may need chunking for large numbers of variants
        variants_df = mf.create_variants().compute()
        self.variant_id = variants_df["id"].tolist()
        self.contig = variants_df["chrom"].tolist()
        self.pos = variants_df["pos"].tolist()
        allele_ids = variants_df["allele_ids"].tolist()
        # Split comma-separated allele pairs into first/second allele lists.
        # (Loop variable renamed so it no longer shadows the builtin `id`.)
        self.a1, self.a2 = tuple(
            zip(*[allele_id.split(",") for allele_id in allele_ids])
        )
    with bgen_file(self.path) as bgen:
        # Prefer an external .sample file; otherwise fall back to samples
        # embedded in the BGEN file, or generate placeholder IDs.
        sample_path = self.path.with_suffix('.sample')
        if sample_path.exists():
            self.samples = read_samples_file(sample_path, verbose=False)
        elif bgen.contain_samples:
            self.samples = bgen.read_samples()
        else:
            self.samples = generate_samples(bgen.nsamples)
    # Logical array shape: (variants, samples, genotype probabilities).
    self.shape = (self.n_variants, len(self.samples), 3)
    self.dtype = dtype
    self.ndim = 3
def __init__(
    self, path: PathType, persist: bool = True, dtype: Any = np.float32
) -> None:
    """Open a BGEN file lazily via dask, loading variant/sample metadata.

    Parameters
    ----------
    path : PathType
        Location of the ``.bgen`` file. A metafile (index) is created
        next to it on first use and reused afterwards.
    persist : bool
        If True, persist the dask variant dataframe in memory so the
        metadata arrays below are not recomputed on each access.
    dtype : Any
        Element type for genotype probabilities (default ``np.float32``).
    """
    self.path = Path(path)
    self.metafile_filepath = infer_metafile_filepath(Path(self.path))
    if not self.metafile_filepath.exists():
        # Build the index/metadata file once; subsequent opens reuse it.
        create_metafile(path, self.metafile_filepath, verbose=False)
    with bgen_metafile(self.metafile_filepath) as mf:
        self.n_variants = mf.nvariants
        self.npartitions = mf.npartitions
        self.partition_size = mf.partition_size
        df = mf.create_variants()
        if persist:
            df = df.persist()
        # Convert the dataframe columns into dask arrays keyed by name.
        variant_arrs = _to_dict(df, dtype=VARIANT_ARRAY_DTYPE)

        self.variant_id = variant_arrs["id"]
        self.contig = variant_arrs["chrom"]
        self.pos = variant_arrs["pos"]

        def split_alleles(
            alleles: np.ndarray, block_info: Any = None
        ) -> np.ndarray:
            # Split each comma-separated allele-id string into a 2-element
            # row. The block_info guard skips dask's meta/dry-run calls,
            # which pass empty or placeholder blocks.
            if block_info is None or len(block_info) == 0:
                return alleles

            def split(allele_row: np.ndarray) -> np.ndarray:
                alleles_list = allele_row[0].split(",")
                assert len(alleles_list) == 2  # bi-allelic
                return np.array(alleles_list)

            return np.apply_along_axis(split, 1, alleles[:, np.newaxis])

        variant_alleles = variant_arrs["allele_ids"].map_blocks(split_alleles)

        def max_str_len(arr: ArrayLike) -> Any:
            # Per-element string lengths, reduced to the maximum.
            # NOTE(review): int8 caps measurable length at 127 — presumably
            # allele strings are short; confirm for long indel alleles.
            return arr.map_blocks(
                lambda s: np.char.str_len(s.astype(str)), dtype=np.int8
            ).max()

        # Store alleles as fixed-width bytes sized to the longest allele,
        # so downstream consumers get a uniform dtype.
        max_allele_length = max(max_str_len(variant_alleles).compute())
        self.variant_alleles = variant_alleles.astype(f"S{max_allele_length}")

    with bgen_file(self.path) as bgen:
        # Prefer an external .sample file; otherwise fall back to samples
        # embedded in the BGEN file, or generate placeholder IDs.
        sample_path = self.path.with_suffix(".sample")
        if sample_path.exists():
            self.sample_id = read_samples_file(sample_path, verbose=False)
        else:
            if bgen.contain_samples:
                self.sample_id = bgen.read_samples()
            else:
                self.sample_id = generate_samples(bgen.nsamples)

    # Logical array shape: (variants, samples, genotype probabilities).
    self.shape = (self.n_variants, len(self.sample_id), 3)
    self.dtype = dtype
    self.ndim = 3
def __getitem__(self, idx: Any) -> np.ndarray:
    """Read genotype probabilities for the selected variants/samples/genotypes.

    Parameters
    ----------
    idx : tuple
        One indexer (slice or int) per dimension:
        (variants, samples, genotypes). Integer indexers follow numpy
        basic-indexing semantics: the corresponding dimension is squeezed
        out of the result.

    Returns
    -------
    np.ndarray
        Probabilities with dtype ``self.dtype``.

    Raises
    ------
    IndexError
        If ``idx`` is not a tuple, has the wrong number of items, or
        contains anything other than slices and ints.
    """
    if not isinstance(idx, tuple):
        raise IndexError(f"Indexer must be tuple (received {type(idx)})")
    if len(idx) != self.ndim:
        raise IndexError(
            f"Indexer must have {self.ndim} items (received {len(idx)} slices)"
        )
    if not all(isinstance(i, (slice, int)) for i in idx):
        raise IndexError(
            f"Indexer must contain only slices or ints (received types {[type(i) for i in idx]})"
        )
    # Determine which dims should have unit size in result
    squeeze_dims = tuple(i for i in range(len(idx)) if isinstance(idx[i], int))
    # Convert all indexers to slices
    idx = tuple(slice(i, i + 1) if isinstance(i, int) else i for i in idx)
    if idx[0].start == idx[0].stop:
        # Empty variant selection: apply the sample/genotype indexers to
        # the declared shape so the result has the same trailing
        # dimensions (and int-squeezing) a non-empty selection would —
        # previously this always returned shape (0, 0, 0), which was
        # inconsistent with e.g. x[0:0, 5, :] under numpy semantics.
        n_samples = len(range(*idx[1].indices(self.shape[1])))
        n_genotypes = len(range(*idx[2].indices(self.shape[2])))
        empty = np.empty((0, n_samples, n_genotypes), dtype=self.dtype)
        return np.squeeze(empty, axis=squeeze_dims)
    # Determine start and end partitions that correspond to the
    # given variant dimension indexer.
    # NOTE(review): assumes concrete, forward slices (start/stop set,
    # start < stop) as produced by the array wrapper — confirm callers.
    start_partition = idx[0].start // self.partition_size
    start_partition_offset = idx[0].start % self.partition_size
    end_partition = (idx[0].stop - 1) // self.partition_size
    end_partition_offset = (idx[0].stop - 1) % self.partition_size
    # Create a list of all offsets into the underlying file at which
    # data for each variant begins
    all_vaddr = []
    with bgen_metafile(self.metafile_filepath) as mf:
        for i in range(start_partition, end_partition + 1):
            partition = mf.read_partition(i)
            start_offset = start_partition_offset if i == start_partition else 0
            end_offset = (
                end_partition_offset + 1
                if i == end_partition
                else self.partition_size
            )
            vaddr = partition["vaddr"].tolist()
            all_vaddr.extend(vaddr[start_offset:end_offset])
    # Read the probabilities for each variant, apply indexer for
    # samples dimension to give probabilities for all genotypes,
    # and then apply final genotype dimension indexer
    with bgen_file(self.path) as bgen:
        res = None
        for i, vaddr in enumerate(all_vaddr):
            probs = bgen.read_genotype(vaddr)["probs"][idx[1]]
            assert len(probs.shape) == 2 and probs.shape[1] == 3
            if res is None:
                # Allocate once the per-variant sample count is known.
                res = np.zeros((len(all_vaddr), len(probs), 3), dtype=self.dtype)
            res[i] = probs
        res = res[..., idx[2]]  # type: ignore[index]
        return np.squeeze(res, axis=squeeze_dims)
def _map_metadata(self, metafile_filepath):
    """Load per-variant metadata from the metafile into flat numpy arrays.

    Populates ``self._ids``, ``self._rsids``, ``self._vaddr``,
    ``self._chromosomes``, ``self._positions``, ``self._nalleles``,
    ``self._allele_ids``, ``self._ncombinations`` and ``self._phased``
    by reading every metafile partition, then opening each genotype
    record once to query its combination count and phasedness.
    """
    with log_in_place("metadata", logging.INFO) as updater:
        with bgen_metafile(Path(metafile_filepath)) as mf:
            nparts = mf.npartitions
            # Per-partition accumulators, concatenated below.
            (
                id_list,
                rsid_list,
                chrom_list,
                position_list,
                vaddr_list,
                nalleles_list,
                allele_ids_list,
                ncombinations_list,
                phased_list,
            ) = ([], [], [], [], [], [], [], [], [])
            for ipart2 in range(nparts):  # LATER multithread?
                # LATER in notebook this message doesn't appear on one line
                updater("step 2: part {0:,} of {1:,}".format(
                    ipart2, nparts))
                # Read one partition's worth of variant metadata.
                (
                    nvariants,
                    vid,
                    rsid,
                    chrom,
                    position,
                    nalleles,
                    allele_ids,
                    offset,
                ) = _inner_read_partition(mf, ipart2)
                id_list.append(vid)
                rsid_list.append(rsid)
                chrom_list.append(chrom)
                position_list.append(position)
                nalleles_list.append(nalleles)
                allele_ids_list.append(allele_ids)
                vaddr_list.append(offset)
        # LATER use concatenate(...out=) instead
        self._ids = np.array(np.concatenate(id_list), dtype="str")  # dtype needed to make unicode
        self._rsids = np.array(np.concatenate(rsid_list), dtype="str")
        self._vaddr = np.concatenate(vaddr_list)
        self._chromosomes = np.array(np.concatenate(chrom_list), dtype="str")
        self._positions = np.concatenate(position_list)
        self._nalleles = np.concatenate(nalleles_list)
        self._allele_ids = np.array(np.concatenate(allele_ids_list), dtype="str")
        # Step 3: open each genotype record to read its combination count
        # and phased flag via the low-level C API, closing each handle.
        for i, vaddr0 in enumerate(self._vaddr):
            if i % 1000 == 0:  # throttle progress updates
                updater("step 3: part {0:,} of {1:,}".format(
                    i, self.nvariants))
            genotype = lib.bgen_file_open_genotype(self._bgen._bgen_file, vaddr0)
            ncombinations_list.append(lib.bgen_genotype_ncombs(genotype))
            phased_list.append(lib.bgen_genotype_phased(genotype))
            lib.bgen_genotype_close(genotype)
        self._ncombinations = np.array(ncombinations_list, dtype="int")
        self._phased = np.array(phased_list, dtype="bool")