# Example no. 1
class BedReader(object):
    """Array-like, out-of-core reader for PLINK ``.bed`` genotype data.

    Presents the data as a 3D array of shape ``(n_variants, n_samples, 2)``,
    where the trailing (ploidy) axis encodes allele calls:
    dosage 0 -> (0, 0), 1 -> (1, 0), 2 -> (1, 1), missing -> (-1, -1).
    """

    def __init__(self, path, shape, dtype=np.int8, count_A1=True):
        # Imported lazily so the module is usable without pysnptools
        # installed until a Bed file is actually opened.
        from pysnptools.snpreader import Bed
        # n variants (sid = SNP id), n samples (iid = Individual id)
        n_sid, n_iid = shape
        # Initialize Bed with empty arrays for axis data, otherwise it will
        # load the bim/map/fam files entirely into memory (it does not do
        # out-of-core for those)
        self.bed = Bed(
            str(path),
            count_A1=count_A1,
            # Array (n_sample, 2) w/ FID and IID
            iid=np.empty((n_iid, 2), dtype="str"),
            # SNP id array (n_variants)
            sid=np.empty((n_sid, ), dtype="str"),
            # Contig and positions array (n_variants, 3)
            pos=np.empty((n_sid, 3), dtype="int"),
        )
        self.shape = (n_sid, n_iid, 2)
        self.dtype = dtype
        self.ndim = 3

    def __getitem__(self, idx):
        """Read a slab of genotype calls.

        Parameters
        ----------
        idx : tuple of 3 slices
            (variant slice, sample slice, ploidy slice).

        Returns
        -------
        numpy.ndarray of ``self.dtype`` with a trailing ploidy axis.

        Raises
        ------
        IndexError
            If ``idx`` is not a tuple of length ``self.ndim``.
        """
        if not isinstance(idx, tuple):
            raise IndexError(  # pragma: no cover
                f"Indexer must be tuple (received {type(idx)})")
        if len(idx) != self.ndim:
            # BUG FIX: the message previously said "two-item" even though
            # this reader is 3D; report the actual expected arity.
            raise IndexError(  # pragma: no cover
                f"Indexer must be {self.ndim}-item tuple "
                f"(received {len(idx)} slices)")
        # Slice using reversal of first two slices --
        # pysnptools uses sample x variant orientation
        arr = self.bed[idx[1::-1]].read(dtype=np.float32, view_ok=False).val.T
        # Convert missing calls (encoded as nan by pysnptools) to -1
        arr = np.nan_to_num(arr, nan=-1.0)
        arr = arr.astype(self.dtype)
        # Add a ploidy dimension, so allele counts of 0, 1, 2 correspond
        # to (0, 0), (1, 0), (1, 1); missing (-1) stays (-1, -1)
        arr = np.stack(
            [
                np.where(arr < 0, -1, np.where(arr == 0, 0, 1)),
                np.where(arr < 0, -1, np.where(arr == 2, 1, 0)),
            ],
            axis=-1,
        )

        # Apply final slice to 3D result
        return arr[:, :, idx[-1]]

    def close(self):
        # This is not actually crucial since a Bed instance with no
        # in-memory bim/map/fam data is essentially just a file pointer,
        # but leaving it open is still problematic if an array is created
        # from the same PLINK dataset many times
        self.bed._close_bed()  # pragma: no cover
class BedReader(object):
    """2D array-like interface over a PLINK ``.bed`` file for out-of-core reads.

    Exposes shape ``(n_variants, n_samples)``; missing genotype calls come
    back as masked entries of a ``numpy.ma`` array.
    """

    def __init__(self, path, shape, dtype=np.int8, count_A1=True):
        from pysnptools.snpreader import Bed
        # shape is (variant count, sample count); pysnptools calls these
        # sid (SNP id) and iid (individual id) respectively
        num_variants, num_samples = shape
        # Handing Bed pre-built (empty) axis arrays stops it from reading
        # the bim/map/fam metadata files fully into memory (pysnptools has
        # no out-of-core path for those)
        placeholder_iid = np.empty((num_samples, 2), dtype='str')   # FID/IID pairs
        placeholder_sid = np.empty((num_variants, ), dtype='str')   # SNP ids
        placeholder_pos = np.empty((num_variants, 3), dtype='int')  # contig + positions
        self.bed = Bed(
            str(path),
            count_A1=count_A1,
            iid=placeholder_iid,
            sid=placeholder_sid,
            pos=placeholder_pos)
        self.shape = (num_variants, num_samples)
        self.dtype = dtype
        self.ndim = 2

    @staticmethod
    def _is_empty_slice(s):
        # A slice selects nothing when its endpoints coincide
        return s.start == s.stop

    def __getitem__(self, idx):
        """Return the genotype block selected by a (variant, sample) slice pair."""
        if not isinstance(idx, tuple):
            raise IndexError(f'Indexer must be tuple (received {type(idx)})')
        if len(idx) != self.ndim:
            raise IndexError(
                f'Indexer must be two-item tuple (received {len(idx)} slices)')

        # NOTE: dask probes with empty slices before reading any real chunk;
        # should pysnptools prove slow on those, they could be short-circuited
        # here via _is_empty_slice.

        # pysnptools stores data sample x variant, so reverse the slices
        # going in and transpose the result coming back out
        reversed_idx = idx[::-1]
        snp_data = self.bed[reversed_idx].read(dtype=np.float32, view_ok=False)
        values = snp_data.val.T
        # Missing calls arrive as nan; surface them as masked entries
        masked = np.ma.masked_invalid(values)
        return masked.astype(self.dtype)

    def close(self):
        # Not strictly required -- a Bed with no in-memory bim/map/fam data
        # is little more than a file handle -- but it avoids piling up open
        # handles when many arrays are built from the same PLINK dataset
        self.bed._close_bed()