def test_respect_inputs_DistData(self):
    """Check that ``as_snp(...).read(order=..., dtype=...)`` honors the requested
    memory order and dtype for every combination of source dtype/order,
    sid count, column subsetting, and ``max_weight``.
    """
    np.random.seed(0)
    for dtype_start, decimal_start in [(np.float32, 5), (np.float64, 10)]:
        for order_start in ['F', 'C', 'A']:
            for sid_count in [20, 2]:
                # Random 3-iid x sid_count x 3-state probability array in the source dtype/order.
                val = np.array(np.random.random(size=[3, sid_count, 3]), dtype=dtype_start, order=order_start)
                val /= val.sum(axis=2, keepdims=True)  # make probabilities sum to 1
                distdataX = DistData(iid=[["0", "0"], ["1", "1"], ["2", "2"]], sid=[str(i) for i in range(sid_count)], val=val)
                for max_weight in [1.0, 2.0]:
                    # Reference SNP value = probability-weighted sum over the 3 states with weights [0, .5, 1]*max_weight.
                    weights = np.array([0, .5, 1]) * max_weight
                    # Test both the full data and a column-subsetted view of it.
                    for distreader0 in [distdataX, distdataX[:, 1:]]:
                        distreader1 = distreader0[1:, :]  # NOTE(review): appears unused below — confirm before removing
                        refdata0 = distreader0.read()
                        refval0 = (refdata0.val * weights).sum(axis=-1)
                        for dtype_goal, decimal_goal in [(np.float32, 5), (np.float64, 10)]:
                            for order_goal in ['F', 'C', 'A']:
                                k = distreader0.as_snp(max_weight=max_weight, block_size=1).read(order=order_goal, dtype=dtype_goal)
                                # The returned array must have the requested order and dtype...
                                DistData._array_properties_are_ok(k.val, order_goal, dtype_goal)
                                # ...and agree with the reference to the looser of the two precisions.
                                np.testing.assert_array_almost_equal(refval0, k.val, decimal=min(decimal_start, decimal_goal))
def test_block_size_DistData(self):
    """Verify ``as_snp`` produces identical SNP values whether the conversion
    is done one column at a time (``block_size=1``) or all at once
    (``block_size=None``).
    """
    np.random.seed(0)
    snp_count = 20
    probs = np.array(
        np.random.random(size=[3, snp_count, 3]), dtype=np.float64, order='F'
    )
    probs /= probs.sum(axis=2, keepdims=True)  # normalize each distribution to sum to 1
    reader = DistData(
        iid=[["0", "0"], ["1", "1"], ["2", "2"]],
        sid=[str(i) for i in range(snp_count)],
        val=probs,
    )
    blocked = reader.as_snp(max_weight=100, block_size=1).read()
    unblocked = reader.as_snp(max_weight=100, block_size=None).read()
    np.testing.assert_array_almost_equal(blocked.val, unblocked.val, decimal=10)
def _run_once(self):
    # Lazily finish construction: returns immediately if self._ran_once is
    # already set; otherwise loads row/col/val (and properties) via
    # _run_once_inner and initializes the DistData base state.
    if (self._ran_once):
        return
    # NOTE(review): row_property is returned by _run_once_inner but not used here.
    row_ascii, col_ascii, val, row_property, col_property = self._run_once_inner()
    row = np.array(row_ascii, dtype='str')  #!!!avoid this copy when not needed
    col = np.array(col_ascii, dtype='str')  #!!!avoid this copy when not needed
    # Initialize the base class directly; the name records the backing memmap file.
    DistData.__init__(self, iid=row, sid=col, val=val, pos=col_property, name="np.memmap('{0}')".format(self._filename))
def read(self, order='F', dtype=np.float64, force_python_only=False, view_ok=False, num_threads=None):
    """Reads the SNP values and returns a :class:`.DistData` (with :attr:`DistData.val`
    property containing a new 3D ndarray of the SNP distribution values).

    :param order: {'F' (default), 'C', 'A'}, optional -- Specify the order of the ndarray. If order is 'F' (default),
        then the array will be in F-contiguous order (iid-index varies the fastest). If order is 'C', then the
        returned array will be in C-contiguous order (sid-index varies the fastest). If order is 'A', then the
        :attr:`DistData.val` ndarray may be in any order (either C-, Fortran-contiguous).
    :type order: string or None

    :param dtype: {numpy.float64 (default), numpy.float32}, optional -- The data-type for the
        :attr:`DistData.val` ndarray.
    :type dtype: data-type

    :param force_python_only: optional -- If False (default), may use outside library code. If True,
        requests that the read be done without outside library code.
    :type force_python_only: bool

    :param view_ok: optional -- If False (default), allocates new memory for the :attr:`DistData.val`'s ndarray. If True,
        if practical and reading from a :class:`DistData`, will return a new :class:`DistData` with an ndarray that
        shares memory with the original :class:`DistData`. Typically, you'll also wish to use "order='A'" to increase
        the chance that sharing will be possible. Use these parameters with care because any change to either ndarray
        will affect the others. Also keep in mind that :meth:`read` relies on ndarray's mechanisms to decide whether
        to actually share memory and so it may ignore your suggestion and allocate a new ndarray anyway.
    :type view_ok: bool

    :param num_threads: optional -- The number of threads with which to read data. Defaults to all available
        processors. Can also be set with these environment variables (listed in priority order):
        'PST_NUM_THREADS', 'NUM_THREADS', 'MKL_NUM_THREADS'.
    :type num_threads: None or int

    :rtype: :class:`.DistData`

    Calling the method again causes the SNP distribution values to be re-read and creates a new in-memory
    :class:`.DistData` with a new ndarray of SNP values.

    If you request the values for only a subset of the sids or iids, (to the degree practical) only that subset
    will be read from disk.

    :Example:

    >>> from pysnptools.distreader import Bgen
    >>> from pysnptools.util import example_file # Download and return local file name
    >>> bgen_file = example_file("pysnptools/examples/2500x100.bgen")
    >>> dist_on_disk = Bgen(bgen_file) # Specify SNP data on disk
    >>> distdata1 = dist_on_disk.read() # Read all the SNP data returning a DistData instance
    >>> print(type(distdata1.val).__name__) # The DistData instance contains a ndarray of the data.
    ndarray
    >>> subset_distdata = dist_on_disk[:,::2].read() # From the disk, read SNP values for every other sid
    >>> print(subset_distdata.val[0,0]) # Print the first SNP value in the subset
    [0.466804 0.38812848 0.14506752]
    >>> subsub_distdata = subset_distdata[:10,:].read(order='A',view_ok=True) # Create an in-memory subset of the subset with SNP values for the first ten iids. Share memory if practical.
    >>> import numpy as np
    >>> # print np.may_share_memory(subset_distdata.val, subsub_distdata.val) # Do the two ndarray's share memory? They could. Currently they won't.
    """
    # Normalize dtype (e.g. the type object np.float64 -> dtype('float64')).
    dtype = np.dtype(dtype)
    # Subclasses implement _read; None/None selects all iids and all sids.
    val = self._read(None, None, order, dtype, force_python_only, view_ok, num_threads)
    from pysnptools.distreader import DistData
    ret = DistData(self.iid, self.sid, val, pos=self.pos, name=str(self))
    return ret
def test_3d(self):
    """Smoke test: construct a DistData directly from a 3-D probability array."""
    from pysnptools.distreader import DistData

    np.random.seed(0)
    n_rows = 4
    n_cols = 5
    n_states = 3
    probs = np.random.random((n_rows, n_cols, n_states))
    probs /= probs.sum(axis=2, keepdims=True)  # each per-SNP distribution sums to 1
    distdata = DistData(
        val=probs,
        iid=[['iid{0}'.format(i)] * 2 for i in range(n_rows)],
        sid=['sid{0}'.format(s) for s in range(n_cols)],
    )
def test2(self):
    """Round-trip ``example.bgen`` through :meth:`DistMemMap.write` and check
    that the memory-mapped copy holds the same distribution values.

    Fix: restore the original working directory with ``try/finally`` so a
    failing assert or read cannot leak a changed cwd into later tests.
    """
    from pysnptools.distreader import Bgen

    old_dir = os.getcwd()
    # Paths below are relative to this test file's directory.
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    try:
        bgen = Bgen('../examples/example.bgen')
        distmemmap = DistMemMap.write("tempdir/bgentomemmap.dist.memamp", bgen)
        assert DistData.allclose(bgen.read(), distmemmap.read(), equal_nan=True)
    finally:
        os.chdir(old_dir)
if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)

    # Dead scratch code (disabled with `if False:`): builds a tiny in-memory
    # DistData, writes it as a .bgen file, and reads it back.
    if False:
        os.chdir(r'D:\OneDrive\programs\pstsgkit\doc\ipynb')

        from pysnptools.distreader import DistData
        iid = [('0', 'iid0'), ('0', 'iid1')]
        sid = ['snp0', 'snp1', 'snp2']
        pos = [[1, 0, 1], [1, 0, 2], [1, 0, 3]]  # chromosome, genetic distance, basepair distance
        val = np.array([[[0, 0, 1], [.5, .25, .25], [.95, .05, 0]], [[1, 0, 0], [44, 44, 44], [np.nan, np.nan, np.nan]]], dtype='float32')
        distdata = DistData(iid=iid, sid=sid, pos=pos, val=val, name='in-memory sample')
        distdata.val /= distdata.val.sum(axis=2, keepdims=True)
        # if you ask try to read a file that isn't there, do you get a sensible error?
        bgen = Bgen.write('2x3sample13.bgen', distdata, bits=23)  # write it
        bgen.read(dtype='float32').val  # Read the data from disk

    # More disabled scratch code: memory-profiling setup.
    if False:
        import tracemalloc
        import logging
        import time
        logging.basicConfig(level=logging.INFO)
        tracemalloc.start()
def test_writes(self):
    """For every supported on-disk format (npz, bgen, hdf5, memmap), write a
    DistData of varying shapes, read it back (both whole and subsetted), and
    verify that values and metadata round-trip, modulo each format's known
    limitations (recorded in the capability sets below).
    """
    from pysnptools.distreader import DistData, DistHdf5, DistNpz, DistMemMap, Bgen
    from pysnptools.kernelreader.test import _fortesting_JustCheckExists

    # (class, suffix, constructor-override, writer-override); None means use the default lambda below.
    the_class_and_suffix_list = [
        (DistNpz, "npz", None, None),
        (Bgen, "bgen", None, lambda filename, distdata: Bgen.write(filename, distdata, bits=32)),
        (DistHdf5, "hdf5", None, None),
        (DistMemMap, "memmap", None, None),
    ]
    # Per-format capability/limitation sets, keyed by suffix.
    cant_do_col_prop_none_set = {'bgen'}
    cant_do_col_len_0_set = {'bgen'}
    cant_do_row_count_zero_set = {'bgen'}
    # NOTE(review): the four literals below are empty *dicts*, not sets; `suffix in {}`
    # is always False, so behavior is the same as an empty set — confirm intent.
    can_swap_0_2_set = {}
    can_change_col_names_set = {}
    ignore_fam_id_set = {}
    ignore_pos1_set = {'bgen'}
    ignore_pos_set = {}
    erase_any_write_dir = {}

    #===================================
    # Starting main function
    #===================================
    logging.info("starting 'test_writes'")
    np.random.seed(0)
    output_template = "tempdir/distreader/writes.{0}.{1}"
    create_directory_if_necessary(output_template.format(0, "npz"))
    i = 0
    # Sweep row/col counts, including the empty (0) edge cases.
    for row_count in [0, 5, 2, 1]:
        for col_count in [4, 2, 1, 0]:
            val = np.random.random(size=[row_count, col_count, 3])
            val /= val.sum(axis=2, keepdims=True)  # make probabilities sum to 1
            # NOTE(review): values are normalized probabilities, so val==3 looks
            # unreachable and this NaN injection appears to be a no-op — confirm.
            val[val == 3] = np.NaN
            row = [('0','0'),('1','1'),('2','2'),('3','3'),('4','4')][:row_count]
            col = ['s0','s1','s2','s3','s4'][:col_count]
            # Try both with and without column properties (pos).
            for is_none in [True, False]:
                row_prop = None  # NOTE(review): assigned but never used below — confirm before removing
                col_prop = None if is_none else [(x, x, x) for x in range(5)][:col_count]
                distdata = DistData(iid=row, sid=col, val=val, pos=col_prop, name=str(i))
                for the_class, suffix, constructor, writer in the_class_and_suffix_list:
                    constructor = constructor or (lambda filename: the_class(filename))
                    writer = writer or (lambda filename, distdata: the_class.write(filename, distdata))
                    # Skip shape/property combinations a format can't represent.
                    if col_count == 0 and suffix in cant_do_col_len_0_set:
                        continue
                    if col_prop is None and suffix in cant_do_col_prop_none_set:
                        continue
                    if row_count == 0 and suffix in cant_do_row_count_zero_set:
                        continue
                    filename = output_template.format(i, suffix)
                    logging.info(filename)
                    i += 1
                    if suffix in erase_any_write_dir and os.path.exists(filename):
                        shutil.rmtree(filename)
                    ret = writer(filename, distdata)
                    assert ret is not None
                    # Read back both the full data and a strided subset.
                    for subsetter in [None, np.s_[::2, ::3]]:
                        reader = constructor(filename)
                        _fortesting_JustCheckExists().input(reader)
                        subreader = reader if subsetter is None else reader[subsetter[0], subsetter[1]]
                        readdata = subreader.read(order='C')
                        expected = distdata if subsetter is None else distdata[subsetter[0], subsetter[1]].read()
                        # Values must match (exactly, or allowing the 0<->2 allele swap for formats that permit it).
                        if not suffix in can_swap_0_2_set:
                            assert np.allclose(readdata.val, expected.val, equal_nan=True)
                        else:
                            for col_index in range(readdata.col_count):
                                assert (np.allclose(readdata.val[:, col_index], expected.val[:, col_index], equal_nan=True)
                                        or np.allclose(readdata.val[:, col_index] * -1 + 2, expected.val[:, col_index], equal_nan=True))
                        # Row (iid) metadata must round-trip, possibly ignoring the family id column.
                        if not suffix in ignore_fam_id_set:
                            assert np.array_equal(readdata.row, expected.row)
                        else:
                            assert np.array_equal(readdata.row[:, 1], expected.row[:, 1])
                        # Column (sid) names must round-trip, or at least the count must.
                        if not suffix in can_change_col_names_set:
                            assert np.array_equal(readdata.col, expected.col)
                        else:
                            assert readdata.col_count == expected.col_count
                        assert np.array_equal(readdata.row_property, expected.row_property) or (readdata.row_property.shape[1] == 0 and expected.row_property.shape[1] == 0)
                        # Column properties: bgen ignores the middle (genetic-distance) column.
                        if suffix in ignore_pos1_set:
                            assert np.allclose(readdata.col_property[:, [0, 2]], expected.col_property[:, [0, 2]], equal_nan=True) or (readdata.col_property.shape[1] == 0 and expected.col_property.shape[1] == 0)
                        elif not suffix in ignore_pos_set:
                            assert np.allclose(readdata.col_property, expected.col_property, equal_nan=True) or (readdata.col_property.shape[1] == 0 and expected.col_property.shape[1] == 0)
                        else:
                            assert len(readdata.col_property) == len(expected.col_property)
                    # Best-effort cleanup; some formats may hold the file open.
                    try:
                        os.remove(filename)
                    except:
                        pass
    logging.info("done with 'test_writes'")
    # NOTE(review): this 'return' is the tail of a test-suite factory function
    # (presumably getTestSuite) whose definition starts before this chunk.
    return test_suite


if __name__ == '__main__':
    logging.basicConfig(level=logging.WARN)

    # Dead scratch code (disabled with `if False:`): regenerates the example
    # data files (toydata.dist.npz and friends) from toydata.5chrom.bed.
    if False:
        from pysnptools.snpreader import Bed
        from pysnptools.distreader import DistData, DistNpz

        # Create toydata.dist.npz
        currentFolder = os.path.dirname(os.path.realpath(__file__))
        if True:
            snpreader = Bed(currentFolder + "/../examples/toydata.5chrom.bed", count_A1=True)[:25, :]
            np.random.seed(392)
            val = np.random.random((snpreader.iid_count, snpreader.sid_count, 3))
            val /= val.sum(axis=2, keepdims=True)  # make probabilities sum to 1
            distdata = DistData(iid=snpreader.iid, sid=snpreader.sid, pos=snpreader.pos, val=val)
            DistNpz.write(currentFolder + "/../examples/toydata.dist.npz", distdata)
        if True:
            distdata = DistNpz(currentFolder + "/../examples/toydata.dist.npz").read()
            for sid_major, name_bit in [(False, 'iidmajor'), (True, 'snpmajor')]:
                DistHdf5.write(currentFolder + "/../examples/toydata.{0}.dist.hdf5".format(name_bit), distdata, sid_major=sid_major)
        if True:
            distdata = DistNpz(currentFolder + "/../examples/toydata.dist.npz")[:, :10].read()
            DistNpz.write(currentFolder + "/../examples/toydata10.dist.npz", distdata)
        if True:
            distdata = DistNpz(currentFolder + "/../examples/toydata.dist.npz")[:, :10].read()
            DistMemMap.write(currentFolder + "/../examples/tiny.dist.memmap", distdata)
        print('done')

    # Build and run the unit-test suite.
    suites = getTestSuite()
    r = unittest.TextTestRunner(failfast=False)
    # NOTE(review): the call that actually runs 'suites' appears to continue
    # beyond this chunk.