示例#1
0
def test_cbgen_invalid_metafile():
    """Opening a malformed metafile must raise ``RuntimeError``.

    Two example files are tried in turn: ``wrong.metadata`` and
    ``haplotypes.bgen.metadata.corrupted``.
    """
    bad_metafiles = ("wrong.metadata", "haplotypes.bgen.metadata.corrupted")
    for name in bad_metafiles:
        mfilepath = example.get(name)
        with pytest.raises(RuntimeError):
            bgen_metafile(mfilepath)
示例#2
0
    def __init__(
        self,
        path: PathType,
        metafile_path: Optional[PathType] = None,
        dtype: DType = "float32",
    ) -> None:
        """Open a BGEN file and record its dimensions and partition layout.

        Parameters
        ----------
        path
            Location of the BGEN file.
        metafile_path
            Location of the metafile. When falsy, ``path`` with its suffix
            replaced by ``.metafile`` is used. The metafile is generated if
            it does not exist yet.
        dtype
            Data type of the arrays produced by this reader. Probabilities
            are read with 64-bit precision when the item size is at least
            8 bytes, otherwise with 32-bit precision.
        """
        self.path = Path(path)
        if metafile_path:
            self.metafile_path = Path(metafile_path)
        else:
            self.metafile_path = self.path.with_suffix(".metafile")

        with bgen_file(self.path) as bgen:
            self.n_variants = bgen.nvariants
            self.n_samples = bgen.nsamples

            metafile_missing = not self.metafile_path.exists()
            if metafile_missing:
                # Time the generation so the duration can be logged.
                start = time.time()
                logger.info(
                    f"Generating BGEN metafile for '{self.path}' (this may take a while)"
                )
                bgen.create_metafile(self.metafile_path, verbose=False)
                stop = time.time()
                logger.info(
                    f"BGEN metafile generation complete ({stop - start:.0f} seconds)"
                )

            with bgen_metafile(self.metafile_path) as mf:
                # The metafile must describe the same number of variants
                # as the BGEN file it belongs to.
                assert self.n_variants == mf.nvariants
                self.npartitions = mf.npartitions
                self.partition_size = mf.partition_size

        # Array-like attributes: (variants, samples, 3).
        self.shape = (self.n_variants, self.n_samples, 3)
        self.dtype = np.dtype(dtype)
        self.precision = 32 if self.dtype.itemsize < 8 else 64
        self.ndim = 3
示例#3
0
    def __getitem__(self, idx: Any) -> np.ndarray:
        """Read the probabilities selected by a tuple of slices/ints.

        Parameters
        ----------
        idx
            Tuple with ``self.ndim`` entries, each a ``slice`` or an ``int``.
            The first entry selects variants; the second and third are
            applied to the per-variant probability array and to the last
            axis of the result, respectively.

        Returns
        -------
        Array of ``self.dtype``. Axes indexed with an ``int`` are squeezed
        out. An all-zero-size array is returned when the variant slice has
        equal ``start`` and ``stop``.

        Raises
        ------
        IndexError
            If ``idx`` is not a tuple, has the wrong length, or contains
            anything other than slices and ints.
        """
        if not isinstance(idx, tuple):
            raise IndexError(f"Indexer must be tuple (received {type(idx)})")
        if len(idx) != self.ndim:
            raise IndexError(
                f"Indexer must have {self.ndim} items (received {len(idx)} slices)"
            )
        if any(not isinstance(item, (slice, int)) for item in idx):
            raise IndexError(
                f"Indexer must contain only slices or ints (received types {[type(i) for i in idx]})"
            )

        # Axes addressed by a plain int are dropped from the result.
        squeeze_dims = tuple(
            axis for axis, item in enumerate(idx) if isinstance(item, int)
        )
        # From here on every indexer is a slice.
        idx = tuple(
            slice(item, item + 1) if isinstance(item, int) else item
            for item in idx
        )
        variant_slice = idx[0]

        # NOTE(review): only ``start``/``stop`` of the variant slice are
        # used below (its ``step`` is never read), and ``None`` bounds are
        # not normalised -- presumably callers always pass concrete bounds;
        # confirm.
        if variant_slice.start == variant_slice.stop:
            return np.empty((0, ) * self.ndim, dtype=self.dtype)

        # Locate the first and last requested variants as
        # (partition number, position inside that partition).
        start_partition, start_partition_offset = divmod(
            variant_slice.start, self.partition_size)
        end_partition, end_partition_offset = divmod(
            variant_slice.stop - 1, self.partition_size)

        # Gather the file offset of every requested variant, walking the
        # partitions that the variant range touches.
        all_vaddr = []
        with bgen_metafile(self.metafile_path) as mf:
            for part_no in range(start_partition, end_partition + 1):
                partition = mf.read_partition(part_no)
                if part_no == start_partition:
                    lo = start_partition_offset
                else:
                    lo = 0
                if part_no == end_partition:
                    hi = end_partition_offset + 1
                else:
                    hi = self.partition_size
                all_vaddr.extend(partition.variants.offset[lo:hi].tolist())

        # Read each variant, keep the requested samples, and stack the
        # rows into one array that is allocated once the number of
        # selected samples is known.
        res = None
        with bgen_file(self.path) as bgen:
            for row, vaddr in enumerate(all_vaddr):
                probs = bgen.read_probability(
                    vaddr, precision=self.precision)[idx[1]]
                assert probs.ndim == 2 and probs.shape[1] == 3
                if res is None:
                    res = np.zeros((len(all_vaddr), len(probs), 3),
                                   dtype=self.dtype)
                res[row] = probs

        # Finally apply the genotype-axis indexer and drop int-indexed axes.
        res = res[..., idx[2]]  # type: ignore[index]
        return np.squeeze(res,
                          axis=squeeze_dims)  # type: ignore[no-any-return]
示例#4
0
def read_metafile(path: PathType) -> dd.DataFrame:
    """Read cbgen metafile containing partitioned variant info.

    One delayed task is created per metafile partition; the resulting
    dask dataframe uses the first row index of each partition as its
    divisions, closed by the index of the last variant.
    """
    with bgen_metafile(path) as mf:
        # Row index at which each partition starts ...
        divisions = [mf.partition_size * k for k in range(mf.npartitions)]
        # ... followed by the index of the final variant.
        divisions.append(mf.nvariants - 1)

        lazy_partitions = []
        for k in range(mf.npartitions):
            lazy_partitions.append(
                dask.delayed(_read_metafile_partition)(path, k))

        meta = dd.utils.make_meta(METAFILE_DTYPE)
        return dd.from_delayed(lazy_partitions, meta=meta,
                               divisions=divisions)
示例#5
0
def _read_metafile_partition(path: Path, partition: int) -> pd.DataFrame:
    """Load one metafile partition as a dataframe cast to ``METAFILE_DTYPE``.

    The allele id of every variant is passed through ``_split_alleles``
    and its first two elements become the ``a1`` and ``a2`` columns.
    """
    with bgen_metafile(path) as mf:
        part = mf.read_partition(partition)
    variants = part.variants

    alleles = np.array([_split_alleles(aid) for aid in variants.allele_ids])
    first_allele = alleles[:, 0]
    second_allele = alleles[:, 1]

    frame = pd.DataFrame({
        "id": variants.id,
        "rsid": variants.rsid,
        "chrom": variants.chromosome,
        "pos": variants.position,
        "a1": first_allele,
        "a2": second_allele,
        "offset": variants.offset,
    })
    return frame.astype(METAFILE_DTYPE)
示例#6
0
def _read_partition(filepath: Path, partition: int) -> DataFrame:
    """Load one metafile partition as a dataframe of variant metadata.

    Text fields are converted to ``str`` and numeric fields to ``int``.
    The variant offset is exposed under the column name ``vaddr``.
    """
    with bgen_metafile(filepath) as mf:
        part: Partition = mf.read_partition(partition)
    variants = part.variants

    column_order = [
        "id", "rsid", "chrom", "pos", "nalleles", "allele_ids", "vaddr"
    ]
    frame = DataFrame({
        "id": variants.id.astype(str),
        "rsid": variants.rsid.astype(str),
        "chrom": variants.chromosome.astype(str),
        "pos": variants.position.astype(int),
        "nalleles": variants.nalleles.astype(int),
        "allele_ids": variants.allele_ids.astype(str),
        "vaddr": variants.offset.astype(int),
    })
    return frame[column_order]
示例#7
0
def test_cbgen_large1(tmp_path):
    """Exercise cbgen on the large ``merged_487400x220000.bgen`` example.

    Checks the header counts, that reading samples fails for a file
    without a sample block, the layout of the generated metafile, and the
    genotype of the first variant in partition 5.

    Both file handles are managed by ``with`` blocks so that they are
    closed even when an assertion fails part-way through the test.
    """
    filepath = example.get("merged_487400x220000.bgen")
    mfilepath = tmp_path / f"{filepath.name}.metafile"

    with bgen_file(filepath) as bgen:
        assert bgen.nvariants == 220000
        assert bgen.nsamples == 487400
        assert not bgen.contain_samples

        with pytest.raises(RuntimeError):
            bgen.read_samples()

        bgen.create_metafile(mfilepath, verbose=True)

        with bgen_metafile(mfilepath) as mf:
            assert mf.filepath.name == mfilepath.name
            assert mf.npartitions == 469
            assert mf.nvariants == 220000
            assert mf.partition_size == 470

            part = mf.read_partition(5)

            assert len(part.variants) == 470
            assert part.variants.id[0] == b"sid_1_2350"
            assert part.variants.rsid[0] == b"sid_1_2350"
            assert part.variants.chrom[0] == b"1"
            assert part.variants.position[0] == 2351
            assert part.variants.nalleles[0] == 2
            assert part.variants.allele_ids[0] == b"A,C"

            # The genotype is read while the metafile is still open, as in
            # the original manual open/close sequence.
            voff = part.variants.offset[0]
            gt = bgen.read_genotype(voff)
            assert_allclose(gt.probs.shape, (487400, 3))
            assert_allclose(nansum(gt.probs, 0), [475743.0, 0.0, 0.0])
            assert_allclose(isnan(gt.probs).sum(0), [11657, 11657, 11657])
            assert not gt.phased
            assert_allclose(gt.ploidy.sum(), 974800)
            assert_allclose(gt.missing.sum(), 11657)
示例#8
0
    def read_genotype(i: int):
        """Return the genotype of variant ``i`` as a plain dict.

        The variant is located inside its metafile partition, the
        partition is divided into sub-partitions, and every genotype of
        the sub-partition holding the variant is read in one call. The
        returned dict has the keys ``probs``, ``phased``, ``ploidy`` and
        ``missing``.
        """
        with bgen_metafile(metafile_filepath) as mf:
            part_size = mf.partition_size
            # Partition number and position of the variant inside it.
            part_no, pos_in_part = divmod(i, part_size)
            partition = mf.read_partition(part_no)

            nsub_parts = _estimate_best_nsub_parts(nsamples, part_size)
            spart_size = max(1, part_size // nsub_parts)
            # Sub-partition number and position of the variant inside it.
            sub_no, pos_in_sub = divmod(pos_in_part, spart_size)

            variants = partition.variants
            first = sub_no * spart_size
            # Clamp the sub-partition end to the number of variants
            # actually present in this partition.
            last = min(variants.size, (sub_no + 1) * spart_size)
            vaddrs = tuple(variants.offset[first:last].tolist())

            genotypes: List[Genotype] = read_genotype_partition(
                bgen_filepath, vaddrs)
            chosen = genotypes[pos_in_sub]
            return dict(
                probs=chosen.probability,
                phased=chosen.phased,
                ploidy=chosen.ploidy,
                missing=chosen.missing,
            )
示例#9
0
def read_bgen(
    filepath: Union[str, Path],
    metafile_filepath: Optional[Union[str, Path]] = None,
    samples_filepath: Optional[Union[str, Path]] = None,
    verbose: bool = True,
):
    """
    Read a given BGEN file.

    Parameters
    ----------
    filepath
        Bgen file path.
    metafile_filepath
        File path to the corresponding metafile. A metafile can be created by calling
        :func:`bgen_reader.create_metafile`. If ``None``, the metafile path is inferred
        from ``filepath`` and the metafile is created automatically when missing.
        An explicitly given metafile must already exist and be readable.
        Defaults to ``None``.
    samples_filepath
        Path to a `sample format`_ file or ``None`` to read samples from the bgen file itself.
        Defaults to ``None``.
    verbose
        ``True`` to show progress; ``False`` otherwise. Defaults to ``True``.

    Returns
    -------
    dict
        Dictionary with the following entries:

        ``"variants"`` : :class:`dask.dataframe.DataFrame`
            Variant position, chromosomes, rsids, etc.
        ``"samples"`` : :class:`pandas.Series`
            Sample identifications.
        ``"genotype"`` : list
            List of genotypes.

    Examples
    --------
    .. doctest::

        >>> from bgen_reader import example_filepath, read_bgen
        >>>
        >>> bgen = read_bgen(example_filepath("haplotypes.bgen"), verbose=False)
        >>> variants = bgen["variants"]
        >>> samples = bgen["samples"]
        >>>
        >>> v = variants.loc[0].compute()
        >>> g = bgen["genotype"][0].compute()
        >>> print(v)
             id rsid chrom  pos  nalleles allele_ids  vaddr
        0  SNP1  RS1     1    1         2        A,G    102
        >>> print(samples)
        0    sample_0
        1    sample_1
        2    sample_2
        3    sample_3
        Name: id, dtype: object
        >>> print(g["probs"][0])
        [1. 0. 1. 0.]

    .. _sample format: https://www.well.ox.ac.uk/~gav/qctool/documentation/sample_file_formats.html
    """

    filepath = Path(filepath)
    assert_file_exist(filepath)
    assert_file_readable(filepath)

    if metafile_filepath is None:
        metafile_filepath = infer_metafile_filepath(filepath)
    else:
        metafile_filepath = Path(metafile_filepath)
        assert_file_exist(metafile_filepath)
        # Check the metafile itself; the bgen file was already checked above.
        assert_file_readable(metafile_filepath)

    if not metafile_filepath.exists():
        if verbose:
            print(
                f"We will create the metafile `{metafile_filepath}`. This file will "
                "speed up further\nreads and only need to be created once. So, please, "
                "bear with me.")
        create_metafile(filepath, metafile_filepath, verbose)
    elif os.path.getmtime(metafile_filepath) < os.path.getmtime(filepath):
        # The bgen file is newer than its metafile: drop the cached
        # results and rebuild the metafile from scratch.
        from ._genotype import cache as bgencache
        from ._metafile import cache as metacache

        metacache.clear()
        bgencache.clear()

        if verbose:
            print(
                f"File `{filepath}` has been modified after the creation of `{metafile_filepath}`."
                "\nWe will therefore recreate the metadata file. So, please, bear with me."
            )
        os.unlink(metafile_filepath)
        create_metafile(filepath, metafile_filepath, verbose)

    with bgen_file(filepath) as bgen:
        samples = _get_samples(bgen, samples_filepath, verbose)

        with bgen_metafile(metafile_filepath) as mf:
            nvariants = mf.nvariants
            npartitions = mf.npartitions
            part_size = mf.partition_size
            variants = create_variants(metafile_filepath, nvariants,
                                       npartitions, part_size)

        genotype = create_genotypes(bgen, metafile_filepath, verbose)

    return dict(variants=variants, samples=samples, genotype=genotype)
示例#10
0
def test_cbgen_phased_genotype(tmp_path):
    """Check header, metafile layout and phased genotypes of ``haplotypes.bgen``.

    The first and last variants of the single metafile partition are
    inspected, together with the genotype stored at their offsets.

    Both file handles are managed by ``with`` blocks so that they are
    closed even when an assertion fails part-way through the test.
    """
    filepath = example.get("haplotypes.bgen")
    mfilepath = tmp_path / f"{filepath.name}.metafile"

    with bgen_file(filepath) as bgen:
        assert bgen.filepath.name == "haplotypes.bgen"
        assert bgen.nvariants == 4
        assert bgen.nsamples == 4
        assert bgen.contain_samples
        samples = bgen.read_samples()
        assert_array_equal(
            samples, [b"sample_0", b"sample_1", b"sample_2", b"sample_3"]
        )

        bgen.create_metafile(mfilepath, verbose=False)

        with bgen_metafile(mfilepath) as mf:
            assert mf.filepath.name == mfilepath.name
            assert mf.npartitions == 1
            assert mf.nvariants == 4
            assert mf.partition_size == 4

            part = mf.read_partition(0)

            # First variant of the partition.
            assert part.variants.id[0] == b"SNP1"
            assert part.variants.rsid[0] == b"RS1"
            assert part.variants.chrom[0] == b"1"
            assert part.variants.position[0] == 1
            assert part.variants.nalleles[0] == 2
            assert part.variants.allele_ids[0] == b"A,G"
            voff = part.variants.offset[0]
            gt = bgen.read_genotype(voff)
            assert_allclose(
                gt.probs,
                [
                    [1.0, 0.0, 1.0, 0.0],
                    [0.0, 1.0, 1.0, 0.0],
                    [1.0, 0.0, 0.0, 1.0],
                    [0.0, 1.0, 0.0, 1.0],
                ],
            )
            assert gt.phased
            assert_allclose(gt.ploidy, [2, 2, 2, 2])
            assert_allclose(gt.missing, [False, False, False, False])

            # Last variant of the partition.
            assert part.variants.id[3] == b"SNP4"
            assert part.variants.rsid[3] == b"RS4"
            assert part.variants.chrom[3] == b"1"
            assert part.variants.position[3] == 4
            assert part.variants.nalleles[3] == 2
            assert part.variants.allele_ids[3] == b"A,G"
            voff = part.variants.offset[3]
            gt = bgen.read_genotype(voff)
            assert_allclose(
                gt.probs,
                [
                    [0.0, 1.0, 0.0, 1.0],
                    [1.0, 0.0, 1.0, 0.0],
                    [0.0, 1.0, 1.0, 0.0],
                    [1.0, 0.0, 0.0, 1.0],
                ],
            )
            assert gt.phased
            assert_allclose(gt.ploidy, [2, 2, 2, 2])
            assert_allclose(gt.missing, [False, False, False, False])
示例#11
0
def test_cbgen_complex_unphased(tmp_path: Path):
    """Check ``complex.23bits.no.samples.bgen``: header, metafile, genotypes.

    Besides the first and last variants, every integer offset up to the
    largest valid one that is not itself a variant offset must be
    rejected by ``read_genotype``, and reading a partition past the end
    of the metafile must fail.
    """
    bgen_name = "complex.23bits.no.samples.bgen"
    filepath = example.get(bgen_name)
    mfilepath = tmp_path / f"{filepath.name}.metafile"
    none_missing = [False, False, False, False]

    with bgen_file(filepath) as bgen:
        assert bgen.filepath.name == "complex.23bits.no.samples.bgen"
        assert bgen.nvariants == 10
        assert bgen.nsamples == 4
        assert not bgen.contain_samples
        with pytest.raises(RuntimeError):
            bgen.read_samples()
        bgen.create_metafile(mfilepath, verbose=False)

    with bgen_metafile(mfilepath) as mf:
        assert mf.filepath.name == mfilepath.name
        assert mf.npartitions == 1
        assert mf.nvariants == 10
        assert mf.partition_size == 10

        part = mf.read_partition(0)
        variants = part.variants

        assert variants.id[0] == b""
        assert variants.rsid[0] == b"V1"
        assert variants.chrom[0] == b"01"
        assert variants.position[0] == 1
        assert variants.nalleles[0] == 2
        assert variants.allele_ids[0] == b"A,G"

        with bgen_file(filepath) as bgen:
            # Genotype of the first variant.
            first = bgen.read_genotype(variants.offset[0])
            assert_allclose(
                first.probs,
                [[1.0, 0.0, nan], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]],
            )
            assert not first.phased
            assert_allclose(first.ploidy, [1, 2, 2, 2])
            assert_allclose(first.missing, none_missing)

            # Genotype of the last variant.
            last = bgen.read_genotype(variants.offset[-1])
            assert_allclose(
                last.probs,
                [
                    [1.0, 0.0, 0.0, 0.0, 0.0],
                    [0.0, 1.0, 0.0, 0.0, 0.0],
                    [0.0, 0.0, 1.0, 0.0, 0.0],
                    [0.0, 0.0, 0.0, 1.0, 0.0],
                ],
            )
            assert not last.phased
            assert_allclose(last.ploidy, [4, 4, 4, 4])
            assert_allclose(last.missing, none_missing)

            # Every offset in [0, max valid offset] that is not a variant
            # offset must be refused.
            valid_offsets = set(variants.offset)
            candidate_offsets = set(range(int(max(valid_offsets)) + 1))
            invalid_offsets = candidate_offsets - valid_offsets

            for offset in invalid_offsets:
                with pytest.raises(RuntimeError):
                    bgen.read_genotype(offset)

        # There is only one partition, so index 1 is out of range.
        with pytest.raises(RuntimeError):
            mf.read_partition(1)
示例#12
0
文件: bench.py 项目: limix/cbgen
 def time_read_partitions(self):
     """Read every partition of the metafile, one after another."""
     with cbgen.bgen_metafile(self._mfilepath) as metafile:
         partition_count = metafile.npartitions
         for partition_index in range(partition_count):
             metafile.read_partition(partition_index)
示例#13
0
文件: bench.py 项目: limix/cbgen
 def time_bgen_metafile(self):
     """Enter and leave the metafile context without reading anything."""
     metafile = cbgen.bgen_metafile(self._mfilepath)
     with metafile:
         # Intentionally empty: nothing is read from the metafile.
         pass