class BedReader(object):
    """Out-of-core reader over a PLINK .bed file with an explicit ploidy axis.

    Presents the dataset as a 3D array of shape (n_variants, n_samples, 2),
    where the trailing axis encodes the two allele calls per genotype:
    allele counts 0/1/2 map to (0, 0), (1, 0), (1, 1) and missing calls
    map to (-1, -1).
    """

    def __init__(self, path, shape, dtype=np.int8, count_A1=True):
        """
        Parameters
        ----------
        path : path-like
            Path to the PLINK .bed file; passed to pysnptools as ``str(path)``.
        shape : tuple
            ``(n_variants, n_samples)`` of the dataset.
        dtype : numpy dtype, optional
            Integer dtype of the returned genotype calls (default ``np.int8``).
        count_A1 : bool, optional
            Forwarded to pysnptools ``Bed``; controls which allele is counted.
        """
        # Imported locally for consistency with the sibling 2D reader and so
        # pysnptools is only required when a reader is actually constructed.
        from pysnptools.snpreader import Bed

        # n variants (sid = SNP id), n samples (iid = Individual id)
        n_sid, n_iid = shape
        # Initialize Bed with empty arrays for axis data, otherwise it will
        # load the bim/map/fam files entirely into memory (it does not do
        # out-of-core for those)
        self.bed = Bed(
            str(path),
            count_A1=count_A1,
            # Array (n_sample, 2) w/ FID and IID
            iid=np.empty((n_iid, 2), dtype="str"),
            # SNP id array (n_variants)
            sid=np.empty((n_sid, ), dtype="str"),
            # Contig and positions array (n_variants, 3)
            pos=np.empty((n_sid, 3), dtype="int"),
        )
        self.shape = (n_sid, n_iid, 2)
        self.dtype = dtype
        self.ndim = 3

    def __getitem__(self, idx):
        """Read a (variant, sample, ploidy) selection as an integer array.

        ``idx`` must be a 3-tuple of slices; missing genotypes come back
        as -1 on both alleles.
        """
        if not isinstance(idx, tuple):
            raise IndexError(  # pragma: no cover
                f"Indexer must be tuple (received {type(idx)})")
        if len(idx) != self.ndim:
            # ndim is 3 for this reader, so the message must say "three-item"
            # (the previous text incorrectly said "two-item").
            raise IndexError(  # pragma: no cover
                f"Indexer must be three-item tuple (received {len(idx)} slices)")
        # Slice using reversal of first two slices --
        # pysnptools uses sample x variant orientation
        arr = self.bed[idx[1::-1]].read(dtype=np.float32, view_ok=False).val.T
        # Convert missing calls (NaN) to -1 before the integer cast
        arr = np.nan_to_num(arr, nan=-1.0)
        arr = arr.astype(self.dtype)
        # Add a ploidy dimension, so allele counts of 0, 1, 2 correspond
        # to 00, 10, 11 (missing stays (-1, -1))
        arr = np.stack(
            [
                np.where(arr < 0, -1, np.where(arr == 0, 0, 1)),
                np.where(arr < 0, -1, np.where(arr == 2, 1, 0)),
            ],
            axis=-1,
        )
        # Apply final slice to 3D result
        return arr[:, :, idx[-1]]

    def close(self):
        """Release the underlying .bed file handle.

        Not strictly crucial: a Bed instance with no in-memory bim/map/fam
        data is essentially just a file pointer, but leaving it open is
        still problematic if an array is created from the same PLINK
        dataset many times.
        """
        self.bed._close_bed()  # pragma: no cover
class BedReader(object):
    """Two-dimensional (variant x sample) out-of-core reader for PLINK .bed data.

    Returns allele counts as a masked integer array, with missing genotype
    calls masked rather than filled with a sentinel.
    """

    def __init__(self, path, shape, dtype=np.int8, count_A1=True):
        from pysnptools.snpreader import Bed
        # Unpack as (variant count, sample count); sid = SNP id,
        # iid = individual id in pysnptools terminology.
        n_variants, n_samples = shape
        # Supplying pre-built (empty) axis arrays stops Bed from eagerly
        # reading the bim/map/fam metadata files fully into memory -- it has
        # no out-of-core support for that metadata.
        self.bed = Bed(
            str(path),
            count_A1=count_A1,
            # (n_samples, 2) placeholder for FID/IID pairs
            iid=np.empty((n_samples, 2), dtype='str'),
            # (n_variants,) placeholder for SNP ids
            sid=np.empty((n_variants, ), dtype='str'),
            # (n_variants, 3) placeholder for contig and positions
            pos=np.empty((n_variants, 3), dtype='int'))
        self.shape = (n_variants, n_samples)
        self.dtype = dtype
        self.ndim = 2

    @staticmethod
    def _is_empty_slice(s):
        # A slice selects nothing when its endpoints coincide.
        return s.start == s.stop

    def __getitem__(self, idx):
        if not isinstance(idx, tuple):
            raise IndexError(f'Indexer must be tuple (received {type(idx)})')
        if len(idx) != self.ndim:
            raise IndexError(
                f'Indexer must be two-item tuple (received {len(idx)} slices)')
        # NOTE: dask probes with empty slices before reading any real chunks;
        # should pysnptools prove slow on those, short-circuit here via
        # _is_empty_slice and return an empty (0, 0) array of self.dtype.
        # pysnptools stores data sample-major, so flip the indexer for the
        # read and transpose back to (variant, sample) orientation.
        reversed_idx = idx[::-1]
        raw = self.bed[reversed_idx].read(dtype=np.float32, view_ok=False).val
        calls = np.ma.masked_invalid(raw.T)
        return calls.astype(self.dtype)

    def close(self):
        # Not strictly crucial: with no in-memory bim/map/fam data a Bed
        # instance is essentially just a file pointer. Still worthwhile when
        # many arrays are created from the same PLINK dataset.
        self.bed._close_bed()