def test_cbgen_invalid_metafile():
    """Each listed example file must make ``bgen_metafile`` raise RuntimeError."""
    invalid_names = ("wrong.metadata", "haplotypes.bgen.metadata.corrupted")
    for name in invalid_names:
        invalid_path = example.get(name)
        with pytest.raises(RuntimeError):
            bgen_metafile(invalid_path)
def __init__(
    self,
    path: PathType,
    metafile_path: Optional[PathType] = None,
    dtype: DType = "float32",
) -> None:
    """Open a BGEN file, create its metafile if missing, and record its layout.

    Parameters
    ----------
    path
        Location of the BGEN file.
    metafile_path
        Location of the metafile. When falsy, the BGEN path with its suffix
        replaced by ``.metafile`` is used.
    dtype
        Data type of the arrays produced by this reader.
    """
    self.path = Path(path)
    if metafile_path:
        self.metafile_path = Path(metafile_path)
    else:
        self.metafile_path = self.path.with_suffix(".metafile")

    with bgen_file(self.path) as bgen:
        self.n_variants = bgen.nvariants
        self.n_samples = bgen.nsamples

        metafile_missing = not self.metafile_path.exists()
        if metafile_missing:
            start = time.time()
            logger.info(
                f"Generating BGEN metafile for '{self.path}' (this may take a while)"
            )
            bgen.create_metafile(self.metafile_path, verbose=False)
            stop = time.time()
            logger.info(
                f"BGEN metafile generation complete ({stop - start:.0f} seconds)"
            )

        with bgen_metafile(self.metafile_path) as mf:
            # The metafile must describe the same number of variants as the
            # BGEN file it is paired with.
            assert self.n_variants == mf.nvariants
            self.npartitions = mf.npartitions
            self.partition_size = mf.partition_size

    self.shape = (self.n_variants, self.n_samples, 3)
    self.dtype = np.dtype(dtype)
    # Request 64-bit reads only when the output dtype is at least 8 bytes wide.
    if self.dtype.itemsize >= 8:
        self.precision = 64
    else:
        self.precision = 32
    self.ndim = 3
def __getitem__(self, idx: Any) -> np.ndarray:
    """Read a block of probabilities selected by a 3-item indexer.

    Parameters
    ----------
    idx
        Tuple with one ``slice`` or ``int`` per dimension, in the order of
        ``self.shape`` (variants, samples, genotypes). On the variants
        dimension only ``start`` and ``stop`` are used; an omitted bound
        means the beginning / end of the file. Negative variant indexes are
        not resolved.

    Returns
    -------
    Array of ``self.dtype``. Dimensions indexed with an ``int`` are removed
    from the result. An empty variant range yields an array of shape
    ``(0, 0, 0)``.

    Raises
    ------
    IndexError
        If ``idx`` is not a tuple, has the wrong number of items, or
        contains anything other than slices and ints.
    """
    if not isinstance(idx, tuple):
        raise IndexError(f"Indexer must be tuple (received {type(idx)})")
    if len(idx) != self.ndim:
        raise IndexError(
            f"Indexer must have {self.ndim} items (received {len(idx)} slices)"
        )
    if not all(isinstance(i, (slice, int)) for i in idx):
        raise IndexError(
            f"Indexer must contain only slices or ints (received types {[type(i) for i in idx]})"
        )

    # Determine which dims should have unit size in result
    squeeze_dims = tuple(i for i, ix in enumerate(idx) if isinstance(ix, int))
    # Convert all indexers to slices
    idx = tuple(slice(i, i + 1) if isinstance(i, int) else i for i in idx)

    # An omitted slice bound is None. Resolve it to a concrete variant index
    # first; otherwise a full slice (None == None) would be mistaken for an
    # empty range and the partition arithmetic below would fail on None.
    variant_start = 0 if idx[0].start is None else idx[0].start
    variant_stop = self.n_variants if idx[0].stop is None else idx[0].stop

    if variant_start == variant_stop:
        return np.empty((0,) * self.ndim, dtype=self.dtype)

    # Determine start and end partitions that correspond to the
    # given variant dimension indexer
    start_partition = variant_start // self.partition_size
    start_partition_offset = variant_start % self.partition_size
    end_partition = (variant_stop - 1) // self.partition_size
    end_partition_offset = (variant_stop - 1) % self.partition_size

    # Create a list of all offsets into the underlying file at which
    # data for each variant begins
    all_vaddr = []
    with bgen_metafile(self.metafile_path) as mf:
        for i in range(start_partition, end_partition + 1):
            partition = mf.read_partition(i)
            start_offset = start_partition_offset if i == start_partition else 0
            end_offset = (
                end_partition_offset + 1
                if i == end_partition
                else self.partition_size
            )
            vaddr = partition.variants.offset
            all_vaddr.extend(vaddr[start_offset:end_offset].tolist())

    # Read the probabilities for each variant, apply indexer for
    # samples dimension to give probabilities for all genotypes,
    # and then apply final genotype dimension indexer
    with bgen_file(self.path) as bgen:
        res = None
        for i, vaddr in enumerate(all_vaddr):
            probs = bgen.read_probability(vaddr, precision=self.precision)[idx[1]]
            assert len(probs.shape) == 2 and probs.shape[1] == 3
            if res is None:
                # Allocate lazily: the sample count is only known after the
                # samples indexer has been applied to the first variant.
                res = np.zeros((len(all_vaddr), len(probs), 3), dtype=self.dtype)
            res[i] = probs
        res = res[..., idx[2]]  # type: ignore[index]
        return np.squeeze(res, axis=squeeze_dims)  # type: ignore[no-any-return]
def read_metafile(path: PathType) -> dd.DataFrame:
    """Read cbgen metafile containing partitioned variant info"""
    with bgen_metafile(path) as mf:
        # One division boundary per partition start, closed by the index of
        # the last variant.
        divisions = [mf.partition_size * i for i in range(mf.npartitions)]
        divisions.append(mf.nvariants - 1)
        # Each partition is loaded lazily by its own delayed task.
        load_partition = dask.delayed(_read_metafile_partition)
        dfs = [load_partition(path, i) for i in range(mf.npartitions)]
        meta = dd.utils.make_meta(METAFILE_DTYPE)
        return dd.from_delayed(dfs, meta=meta, divisions=divisions)
def _read_metafile_partition(path: Path, partition: int) -> pd.DataFrame:
    """Load one metafile partition as a DataFrame cast to ``METAFILE_DTYPE``."""
    with bgen_metafile(path) as mf:
        part = mf.read_partition(partition)

    variants = part.variants
    # Split each allele-id entry; columns 0 and 1 become "a1" and "a2".
    split_ids = np.array([_split_alleles(aid) for aid in variants.allele_ids])
    first_allele = split_ids[:, 0]
    second_allele = split_ids[:, 1]

    frame = pd.DataFrame(
        {
            "id": variants.id,
            "rsid": variants.rsid,
            "chrom": variants.chromosome,
            "pos": variants.position,
            "a1": first_allele,
            "a2": second_allele,
            "offset": variants.offset,
        }
    )
    return frame.astype(METAFILE_DTYPE)
def _read_partition(filepath: Path, partition: int) -> DataFrame:
    """Load one metafile partition as a DataFrame of variant metadata."""
    column_order = ["id", "rsid", "chrom", "pos", "nalleles", "allele_ids", "vaddr"]

    with bgen_metafile(filepath) as mf:
        part: Partition = mf.read_partition(partition)

    variants = part.variants
    frame = DataFrame(
        {
            "id": variants.id.astype(str),
            "rsid": variants.rsid.astype(str),
            "chrom": variants.chromosome.astype(str),
            "pos": variants.position.astype(int),
            "nalleles": variants.nalleles.astype(int),
            "allele_ids": variants.allele_ids.astype(str),
            "vaddr": variants.offset.astype(int),
        }
    )
    return frame[column_order]
def test_cbgen_large1(tmp_path):
    """Check header, metafile layout and one genotype of the large example file.

    Both the BGEN file and the metafile are opened through context managers
    so that the handles are released even when an assertion fails.
    """
    filepath = example.get("merged_487400x220000.bgen")
    mfilepath = tmp_path / f"{filepath.name}.metafile"

    with bgen_file(filepath) as bgen:
        assert bgen.nvariants == 220000
        assert bgen.nsamples == 487400
        assert not bgen.contain_samples
        with pytest.raises(RuntimeError):
            bgen.read_samples()

        bgen.create_metafile(mfilepath, verbose=True)

        with bgen_metafile(mfilepath) as mf:
            assert mf.filepath.name == mfilepath.name
            assert mf.npartitions == 469
            assert mf.nvariants == 220000
            assert mf.partition_size == 470

            part = mf.read_partition(5)
            assert len(part.variants) == 470
            assert part.variants.id[0] == b"sid_1_2350"
            assert part.variants.rsid[0] == b"sid_1_2350"
            assert part.variants.chrom[0] == b"1"
            assert part.variants.position[0] == 2351
            assert part.variants.nalleles[0] == 2
            assert part.variants.allele_ids[0] == b"A,C"

            # Read the genotype stored at the first variant's file offset.
            voff = part.variants.offset[0]
            gt = bgen.read_genotype(voff)
            assert_allclose(gt.probs.shape, (487400, 3))
            assert_allclose(nansum(gt.probs, 0), [475743.0, 0.0, 0.0])
            assert_allclose(isnan(gt.probs).sum(0), [11657, 11657, 11657])
            assert not gt.phased
            assert_allclose(gt.ploidy.sum(), 974800)
            assert_allclose(gt.missing.sum(), 11657)
def read_genotype(i: int):
    """Read the genotype of variant ``i`` through its metafile sub-partition.

    Returns a dict with keys ``probs``, ``phased``, ``ploidy`` and ``missing``.
    """
    with bgen_metafile(metafile_filepath) as mf:
        part_size = mf.partition_size
        # Partition that holds variant i, and the variant's position in it.
        part_index, index_in_part = divmod(i, part_size)
        partition = mf.read_partition(part_index)

        nsub_parts = _estimate_best_nsub_parts(nsamples, part_size)
        sub_size = max(1, part_size // nsub_parts)
        # Sub-partition that holds the variant, and its position in it.
        sub_index, index_in_sub = divmod(index_in_part, sub_size)

        first = sub_index * sub_size
        last = min(partition.variants.size, (sub_index + 1) * sub_size)
        vaddrs = tuple(partition.variants.offset[first:last].tolist())

    genotypes: List[Genotype] = read_genotype_partition(bgen_filepath, vaddrs)
    selected = genotypes[index_in_sub]
    return {
        "probs": selected.probability,
        "phased": selected.phased,
        "ploidy": selected.ploidy,
        "missing": selected.missing,
    }
def read_bgen(
    filepath: Union[str, Path],
    metafile_filepath: Optional[Union[str, Path]] = None,
    samples_filepath: Optional[Union[str, Path]] = None,
    verbose: bool = True,
):
    """
    Read a given BGEN file.

    The result is a ``dict`` with the keys ``variants``, ``samples`` and
    ``genotype``. The metafile is created when it does not exist and is
    recreated when the BGEN file is newer than it.

    Parameters
    ----------
    filepath
        Bgen file path.
    metafile_filepath
        File path to the corresponding metafile. A metafile can be created by calling
        :func:`bgen_reader.create_metafile`. If ``None``, a metafile will be
        automatically created. Defaults to ``None``.
    samples_filepath
        Path to a `sample format`_ file or ``None`` to read samples from the bgen file
        itself. Defaults to ``None``.
    verbose
        ``True`` to show progress; ``False`` otherwise. Defaults to ``True``.

    Returns
    -------
    variants : :class:`dask.dataframe.DataFrame`
        Variant position, chromosomes, rsids, etc.
    samples : :class:`pandas.Series`
        Sample identifications.
    genotype : list
        List of genotypes.

    Examples
    --------
    .. doctest::

        >>> from bgen_reader import example_filepath, read_bgen
        >>>
        >>> bgen = read_bgen(example_filepath("haplotypes.bgen"), verbose=False)
        >>> variants = bgen["variants"]
        >>> samples = bgen["samples"]
        >>>
        >>> v = variants.loc[0].compute()
        >>> g = bgen["genotype"][0].compute()
        >>> print(v)
             id rsid chrom  pos  nalleles allele_ids  vaddr
        0  SNP1  RS1     1    1         2        A,G    102
        >>> print(samples)
        0    sample_0
        1    sample_1
        2    sample_2
        3    sample_3
        Name: id, dtype: object
        >>> print(g["probs"][0])
        [1. 0. 1. 0.]

    .. _sample format: https://www.well.ox.ac.uk/~gav/qctool/documentation/sample_file_formats.html
    """
    filepath = Path(filepath)
    assert_file_exist(filepath)
    assert_file_readable(filepath)

    if metafile_filepath is None:
        metafile_filepath = infer_metafile_filepath(filepath)
    else:
        metafile_filepath = Path(metafile_filepath)
        assert_file_exist(metafile_filepath)
        # Validate the metafile itself; the BGEN file was already checked above.
        assert_file_readable(metafile_filepath)

    if not metafile_filepath.exists():
        if verbose:
            print(
                f"We will create the metafile `{metafile_filepath}`. This file will "
                "speed up further\nreads and only need to be created once. So, please, "
                "bear with me."
            )
        create_metafile(filepath, metafile_filepath, verbose)
    elif os.path.getmtime(metafile_filepath) < os.path.getmtime(filepath):
        # The BGEN file is newer than its metafile: clear both caches and
        # rebuild the metafile from scratch.
        from ._genotype import cache as bgencache
        from ._metafile import cache as metacache

        metacache.clear()
        bgencache.clear()

        if verbose:
            print(
                f"File `{filepath}` has been modified after the creation of `{metafile_filepath}`."
                "\nWe will therefore recreate the metadata file. So, please, bear with me."
            )
        os.unlink(metafile_filepath)
        create_metafile(filepath, metafile_filepath, verbose)

    with bgen_file(filepath) as bgen:
        samples = _get_samples(bgen, samples_filepath, verbose)

        with bgen_metafile(metafile_filepath) as mf:
            nvariants = mf.nvariants
            npartitions = mf.npartitions
            part_size = mf.partition_size

        variants = create_variants(metafile_filepath, nvariants, npartitions, part_size)
        genotype = create_genotypes(bgen, metafile_filepath, verbose)

    return dict(variants=variants, samples=samples, genotype=genotype)
def test_cbgen_phased_genotype(tmp_path):
    """Check header, samples, metafile and two genotypes of ``haplotypes.bgen``.

    Both the BGEN file and the metafile are opened through context managers
    so that the handles are released even when an assertion fails.
    """
    filepath = example.get("haplotypes.bgen")
    mfilepath = tmp_path / f"{filepath.name}.metafile"

    with bgen_file(filepath) as bgen:
        assert bgen.filepath.name == "haplotypes.bgen"
        assert bgen.nvariants == 4
        assert bgen.nsamples == 4
        assert bgen.contain_samples
        samples = bgen.read_samples()
        assert_array_equal(samples, [b"sample_0", b"sample_1", b"sample_2", b"sample_3"])

        bgen.create_metafile(mfilepath, verbose=False)

        with bgen_metafile(mfilepath) as mf:
            assert mf.filepath.name == mfilepath.name
            assert mf.npartitions == 1
            assert mf.nvariants == 4
            assert mf.partition_size == 4

            part = mf.read_partition(0)

            # First variant of the partition.
            assert part.variants.id[0] == b"SNP1"
            assert part.variants.rsid[0] == b"RS1"
            assert part.variants.chrom[0] == b"1"
            assert part.variants.position[0] == 1
            assert part.variants.nalleles[0] == 2
            assert part.variants.allele_ids[0] == b"A,G"

            voff = part.variants.offset[0]
            gt = bgen.read_genotype(voff)
            assert_allclose(
                gt.probs,
                [
                    [1.0, 0.0, 1.0, 0.0],
                    [0.0, 1.0, 1.0, 0.0],
                    [1.0, 0.0, 0.0, 1.0],
                    [0.0, 1.0, 0.0, 1.0],
                ],
            )
            assert gt.phased
            assert_allclose(gt.ploidy, [2, 2, 2, 2])
            assert_allclose(gt.missing, [False, False, False, False])

            # Last variant of the partition.
            assert part.variants.id[3] == b"SNP4"
            assert part.variants.rsid[3] == b"RS4"
            assert part.variants.chrom[3] == b"1"
            assert part.variants.position[3] == 4
            assert part.variants.nalleles[3] == 2
            assert part.variants.allele_ids[3] == b"A,G"

            voff = part.variants.offset[3]
            gt = bgen.read_genotype(voff)
            assert_allclose(
                gt.probs,
                [
                    [0.0, 1.0, 0.0, 1.0],
                    [1.0, 0.0, 1.0, 0.0],
                    [0.0, 1.0, 1.0, 0.0],
                    [1.0, 0.0, 0.0, 1.0],
                ],
            )
            assert gt.phased
            assert_allclose(gt.ploidy, [2, 2, 2, 2])
            assert_allclose(gt.missing, [False, False, False, False])
def test_cbgen_complex_unphased(tmp_path: Path):
    """Check header, metafile, genotypes and error paths of the complex example."""
    filepath = example.get("complex.23bits.no.samples.bgen")
    mfilepath = tmp_path / f"{filepath.name}.metafile"
    with bgen_file(filepath) as bgen:
        assert bgen.filepath.name == "complex.23bits.no.samples.bgen"
        assert bgen.nvariants == 10
        assert bgen.nsamples == 4
        assert not bgen.contain_samples
        # The file reports no embedded samples, so reading them must fail.
        with pytest.raises(RuntimeError):
            bgen.read_samples()
        bgen.create_metafile(mfilepath, verbose=False)
    with bgen_metafile(mfilepath) as mf:
        assert mf.filepath.name == mfilepath.name
        assert mf.npartitions == 1
        assert mf.nvariants == 10
        assert mf.partition_size == 10
        part = mf.read_partition(0)
        assert part.variants.id[0] == b""
        assert part.variants.rsid[0] == b"V1"
        assert part.variants.chrom[0] == b"01"
        assert part.variants.position[0] == 1
        assert part.variants.nalleles[0] == 2
        assert part.variants.allele_ids[0] == b"A,G"
        with bgen_file(filepath) as bgen:
            # Genotype of the first variant in the partition.
            voff = part.variants.offset[0]
            gt = bgen.read_genotype(voff)
            assert_allclose(
                gt.probs,
                [[1.0, 0.0, nan], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]],
            )
            assert not gt.phased
            assert_allclose(gt.ploidy, [1, 2, 2, 2])
            assert_allclose(gt.missing, [False, False, False, False])
            # Genotype of the last variant in the partition.
            voff = part.variants.offset[-1]
            gt = bgen.read_genotype(voff)
            assert_allclose(
                gt.probs,
                [
                    [1.0, 0.0, 0.0, 0.0, 0.0],
                    [0.0, 1.0, 0.0, 0.0, 0.0],
                    [0.0, 0.0, 1.0, 0.0, 0.0],
                    [0.0, 0.0, 0.0, 1.0, 0.0],
                ],
            )
            assert not gt.phased
            assert_allclose(gt.ploidy, [4, 4, 4, 4])
            assert_allclose(gt.missing, [False, False, False, False])
            # Every offset from 0 up to the largest variant offset that is not
            # itself a variant offset must be rejected.
            valid_offsets = set(list(part.variants.offset))
            all_offsets = set(list(range(0, int(max(valid_offsets)) + 1)))
            invalid_offsets = all_offsets - valid_offsets
            for offset in list(invalid_offsets):
                with pytest.raises(RuntimeError):
                    bgen.read_genotype(offset)
        # npartitions is 1 (asserted above), so index 1 is expected to fail.
        with pytest.raises(RuntimeError):
            part = mf.read_partition(1)
def time_read_partitions(self):
    """Benchmark reading every partition of the metafile in order."""
    with cbgen.bgen_metafile(self._mfilepath) as metafile:
        partition_count = metafile.npartitions
        for index in range(partition_count):
            metafile.read_partition(index)
def time_bgen_metafile(self):
    """Benchmark opening the metafile and closing it again right away."""
    metafile = cbgen.bgen_metafile(self._mfilepath)
    with metafile:
        pass