Пример #1
0
 def test_respect_inputs_DistData(self):
     """Check that ``as_snp(...).read`` honors the requested order and dtype.

     For every combination of starting dtype/order, sid count, ``max_weight``
     and (optionally column-sliced) reader, the values read through
     ``as_snp`` are compared against a reference computed directly with
     NumPy: the distribution values times ``[0, .5, 1] * max_weight``,
     summed over the last axis.  The comparison precision is the looser of
     the starting and requested dtype's precision.
     """
     np.random.seed(0)
     dtype_and_decimal = [(np.float32, 5), (np.float64, 10)]
     order_choices = ['F', 'C', 'A']

     for dtype_start, decimal_start in dtype_and_decimal:
         for order_start in order_choices:
             for sid_count in [20, 2]:
                 raw = np.random.random(size=[3, sid_count, 3])
                 val = np.array(raw, dtype=dtype_start, order=order_start)
                 # Make the probabilities along the last axis sum to 1.
                 val /= val.sum(axis=2, keepdims=True)
                 full_data = DistData(
                     iid=[["0", "0"], ["1", "1"], ["2", "2"]],
                     sid=[str(i) for i in range(sid_count)],
                     val=val)

                 for max_weight in [1.0, 2.0]:
                     weights = np.array([0, .5, 1]) * max_weight
                     for source in [full_data, full_data[:, 1:]]:
                         # Row-sliced reader; built here but not used below.
                         row_subset = source[1:, :]
                         reference = source.read()
                         expected = (reference.val * weights).sum(axis=-1)

                         for dtype_goal, decimal_goal in dtype_and_decimal:
                             decimal = min(decimal_start, decimal_goal)
                             for order_goal in order_choices:
                                 snp_reader = source.as_snp(
                                     max_weight=max_weight, block_size=1)
                                 snpdata = snp_reader.read(order=order_goal,
                                                           dtype=dtype_goal)
                                 DistData._array_properties_are_ok(
                                     snpdata.val, order_goal, dtype_goal)
                                 np.testing.assert_array_almost_equal(
                                     expected, snpdata.val, decimal=decimal)
Пример #2
0
 def test_block_size_DistData(self):
     """Check that ``as_snp`` yields the same values for ``block_size=1`` and ``block_size=None``."""
     np.random.seed(0)
     sid_count = 20

     raw = np.random.random(size=[3, sid_count, 3])
     val = np.array(raw, dtype=np.float64, order='F')
     # Make the probabilities along the last axis sum to 1.
     val /= val.sum(axis=2, keepdims=True)

     iid_list = [[str(i), str(i)] for i in range(3)]
     sid_list = [str(i) for i in range(sid_count)]
     distreader = DistData(iid=iid_list, sid=sid_list, val=val)

     one_at_a_time = distreader.as_snp(max_weight=100, block_size=1).read()
     all_at_once = distreader.as_snp(max_weight=100, block_size=None).read()
     np.testing.assert_array_almost_equal(one_at_a_time.val,
                                          all_at_once.val,
                                          decimal=10)
Пример #3
0
    def _run_once(self):
        """Initialize this object as a :class:`DistData` on first use.

        Does nothing when ``self._ran_once`` is already truthy.  Otherwise it
        takes the five values returned by ``self._run_once_inner()`` and
        passes them to ``DistData.__init__``.

        NOTE(review): this method never sets ``self._ran_once`` itself;
        presumably ``_run_once_inner`` does -- confirm.
        """
        if self._ran_once:
            return

        # row_property is returned by the inner call but is not passed on.
        (row_ascii, col_ascii, val, row_property,
         col_property) = self._run_once_inner()

        #!!!avoid these copies when not needed
        iid = np.array(row_ascii, dtype='str')
        sid = np.array(col_ascii, dtype='str')

        memmap_name = "np.memmap('{0}')".format(self._filename)
        DistData.__init__(self,
                          iid=iid,
                          sid=sid,
                          val=val,
                          pos=col_property,
                          name=memmap_name)
Пример #4
0
    def read(self, order='F', dtype=np.float64, force_python_only=False, view_ok=False, num_threads=None):
        """Reads the SNP values and returns a :class:`.DistData` (with :attr:`DistData.val` property containing a new 3D ndarray of the SNP distribution values).

        :param order: {'F' (default), 'C', 'A'}, optional -- Specify the order of the ndarray. If order is 'F' (default),
            then the array will be in F-contiguous order (iid-index varies the fastest).
            If order is 'C', then the returned array will be in C-contiguous order (sid-index varies the fastest).
            If order is 'A', then the :attr:`DistData.val`
            ndarray may be in any order (either C- or Fortran-contiguous).
        :type order: string or None

        :param dtype: {numpy.float64 (default), numpy.float32}, optional -- The data-type for the :attr:`DistData.val` ndarray.
        :type dtype: data-type

        :param force_python_only: optional -- If False (default), may use outside library code. If True, requests that the read
            be done without outside library code.
        :type force_python_only: bool

        :param view_ok: optional -- If False (default), allocates new memory for the :attr:`DistData.val`'s ndarray. If True,
            if practical and reading from a :class:`DistData`, will return a new
            :class:`DistData` with a ndarray that shares memory with the original :class:`DistData`.
            Typically, you'll also wish to use "order='A'" to increase the chance that sharing will be possible.
            Use these parameters with care because any change to either ndarray will affect
            the others. Also keep in mind that :meth:`read` relies on ndarray's mechanisms to decide whether to actually
            share memory and so it may ignore your suggestion and allocate a new ndarray anyway.
        :type view_ok: bool

        :param num_threads: optional -- The number of threads with which to read data. Defaults to all available
            processors. Can also be set with these environment variables (listed in priority order):
            'PST_NUM_THREADS', 'NUM_THREADS', 'MKL_NUM_THREADS'.
        :type num_threads: None or int

        :rtype: :class:`.DistData`

        Calling the method again causes the SNP distribution values to be re-read and creates a new in-memory :class:`.DistData` with a new ndarray of SNP values.

        If you request the values for only a subset of the sids or iids, (to the degree practical) only that subset will be read from disk.

        :Example:

        >>> from pysnptools.distreader import Bgen
        >>> from pysnptools.util import example_file # Download and return local file name
        >>> bgen_file = example_file("pysnptools/examples/2500x100.bgen")
        >>> dist_on_disk = Bgen(bgen_file) # Specify SNP data on disk
        >>> distdata1 = dist_on_disk.read() # Read all the SNP data returning a DistData instance
        >>> print(type(distdata1.val).__name__) # The DistData instance contains a ndarray of the data.
        ndarray
        >>> subset_distdata = dist_on_disk[:,::2].read() # From the disk, read SNP values for every other sid
        >>> print(subset_distdata.val[0,0]) # Print the first SNP value in the subset
        [0.466804   0.38812848 0.14506752]
        >>> subsub_distdata = subset_distdata[:10,:].read(order='A',view_ok=True) # Create an in-memory subset of the subset with SNP values for the first ten iids. Share memory if practical.
        >>> import numpy as np
        >>> # print np.may_share_memory(subset_distdata.val, subsub_distdata.val) # Do the two ndarray's share memory? They could. Currently they won't.
        """
        # Normalize whatever the caller passed (type object, string, ...) to a numpy dtype.
        requested_dtype = np.dtype(dtype)
        # The two leading None arguments are passed through to _read unchanged.
        values = self._read(None, None, order, requested_dtype,
                            force_python_only, view_ok, num_threads)
        # Imported at call time rather than at module import time.
        from pysnptools.distreader import DistData
        return DistData(self.iid, self.sid, values, pos=self.pos, name=str(self))
Пример #5
0
 def test_3d(self):
     """Construct a :class:`DistData` from a random, normalized 4 x 5 x 3 array."""
     from pysnptools.distreader import DistData
     np.random.seed(0)
     row_count, col_count, val_shape = 4, 5, 3

     val = np.random.random((row_count, col_count, val_shape))
     # Make the probabilities along the last axis sum to 1.
     val /= val.sum(axis=2, keepdims=True)

     iid = [['iid{0}'.format(i)] * 2 for i in range(row_count)]
     sid = ['sid{0}'.format(s) for s in range(col_count)]
     distdata = DistData(val=val, iid=iid, sid=sid)
Пример #6
0
    def test2(self):
        """Write a Bgen file out as a DistMemMap and check both read back the same.

        The test temporarily changes the working directory to the directory
        containing this file so that the relative paths below resolve.  The
        original working directory is restored in a ``finally`` clause so
        that a failing read or assertion cannot leave later tests running in
        the wrong directory.
        """
        from pysnptools.distreader import Bgen

        old_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__)))
        try:
            bgen = Bgen('../examples/example.bgen')
            distmemmap = DistMemMap.write("tempdir/bgentomemmap.dist.memamp", bgen)
            assert DistData.allclose(bgen.read(),
                                     distmemmap.read(),
                                     equal_nan=True)
        finally:
            # Always undo the chdir, even when the comparison above fails.
            os.chdir(old_dir)
Пример #7
0
# Script entry point: only runs when this file is executed directly.
if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)

    # Disabled scratch code (guarded by `if False`): builds a tiny in-memory
    # DistData, writes it as a Bgen file and reads it back.
    # NOTE(review): `Bgen` is not imported in this block; presumably it is
    # imported at module level -- confirm.
    if False:
        os.chdir(r'D:\OneDrive\programs\pstsgkit\doc\ipynb')
        from pysnptools.distreader import DistData
        iid = [('0', 'iid0'), ('0', 'iid1')]
        sid = ['snp0', 'snp1', 'snp2']
        pos = [[1, 0, 1], [1, 0, 2],
               [1, 0, 3]]  # chromosome, genetic distance, basepair distance
        val = np.array([[[0, 0, 1], [.5, .25, .25], [.95, .05, 0]],
                        [[1, 0, 0], [44, 44, 44], [np.nan, np.nan, np.nan]]],
                       dtype='float32')
        distdata = DistData(iid=iid,
                            sid=sid,
                            pos=pos,
                            val=val,
                            name='in-memory sample')
        # Rescale each distribution so it sums to 1 (e.g. [44, 44, 44] becomes thirds).
        distdata.val /= distdata.val.sum(axis=2, keepdims=True)
        # If you try to read a file that isn't there, do you get a sensible error?
        bgen = Bgen.write('2x3sample13.bgen', distdata, bits=23)  # Write it
        bgen.read(dtype='float32').val  # Read the data from disk

    # Disabled scratch code (guarded by `if False`): sets up logging and
    # starts tracemalloc.
    if False:

        import tracemalloc
        import logging
        import time

        logging.basicConfig(level=logging.INFO)
        tracemalloc.start()
Пример #8
0
    def test_writes(self):
        """Round-trip random DistData through every writable format.

        For each combination of row count, column count and presence or
        absence of column properties, a :class:`DistData` is written with
        each format's writer, read back (whole and as a ``[::2, ::3]``
        subset), and compared with the in-memory original.  The ``*_set``
        collections below name the formats (by suffix) that are exempt from,
        or need a relaxed version of, a particular check.
        """
        from pysnptools.distreader import DistData, DistHdf5, DistNpz, DistMemMap, Bgen
        from pysnptools.kernelreader.test import _fortesting_JustCheckExists

        # Each entry: (class, file suffix, optional constructor, optional writer).
        the_class_and_suffix_list = [(DistNpz,"npz",None,None),
                                     (Bgen,"bgen",None,lambda filename,distdata: Bgen.write(filename,distdata,bits=32)),
                                     (DistHdf5,"hdf5",None,None),
                                     (DistMemMap,"memmap",None,None)]
        cant_do_col_prop_none_set = {'bgen'}
        cant_do_col_len_0_set = {'bgen'}
        cant_do_row_count_zero_set = {'bgen'}
        # Empty collections are real sets (``{}`` would be an empty dict).
        can_swap_0_2_set = set()
        can_change_col_names_set = set()
        ignore_fam_id_set = set()
        ignore_pos1_set = {'bgen'}
        ignore_pos_set = set()
        erase_any_write_dir = set()

        #===================================
        #    Starting main function
        #===================================
        logging.info("starting 'test_writes'")
        np.random.seed(0)
        output_template = "tempdir/distreader/writes.{0}.{1}"
        create_directory_if_necessary(output_template.format(0,"npz"))
        i = 0
        for row_count in [0,5,2,1]:
            for col_count in [4,2,1,0]:
                val = np.random.random(size=[row_count,col_count,3])
                val /= val.sum(axis=2,keepdims=True)  #make probabilities sum to 1

                # ``np.nan`` instead of ``np.NaN``: the latter alias was removed in NumPy 2.0.
                # NOTE(review): after normalization no value should equal 3, so this
                # mask looks like it never matches -- confirm intent.
                val[val==3] = np.nan
                row = [('0','0'),('1','1'),('2','2'),('3','3'),('4','4')][:row_count]
                col = ['s0','s1','s2','s3','s4'][:col_count]
                for is_none in [True,False]:
                    col_prop = None if is_none else [(x,x,x) for x in range(5)][:col_count]
                    distdata = DistData(iid=row,sid=col,val=val,pos=col_prop,name=str(i))
                    for the_class,suffix,constructor,writer in the_class_and_suffix_list:
                        # Default constructor/writer; the lambdas are only called within this iteration.
                        constructor = constructor or (lambda filename: the_class(filename))
                        writer = writer or (lambda filename,distdata: the_class.write(filename,distdata))
                        if col_count == 0 and suffix in cant_do_col_len_0_set:
                            continue
                        if col_prop is None and suffix in cant_do_col_prop_none_set:
                            continue
                        if row_count == 0 and suffix in cant_do_row_count_zero_set:
                            continue
                        filename = output_template.format(i,suffix)
                        logging.info(filename)
                        i += 1
                        if suffix in erase_any_write_dir and os.path.exists(filename):
                            shutil.rmtree(filename)
                        ret = writer(filename,distdata)
                        assert ret is not None
                        for subsetter in [None, np.s_[::2,::3]]:
                            reader = constructor(filename)
                            _fortesting_JustCheckExists().input(reader)
                            subreader = reader if subsetter is None else reader[subsetter[0],subsetter[1]]
                            readdata = subreader.read(order='C')
                            expected = distdata if subsetter is None else distdata[subsetter[0],subsetter[1]].read()
                            if suffix not in can_swap_0_2_set:
                                assert np.allclose(readdata.val,expected.val,equal_nan=True)
                            else:
                                # Accept each column either as-is or transformed by x -> 2 - x.
                                for col_index in range(readdata.col_count):
                                    assert (np.allclose(readdata.val[:,col_index],expected.val[:,col_index],equal_nan=True) or
                                            np.allclose(readdata.val[:,col_index]*-1+2,expected.val[:,col_index],equal_nan=True))
                            if suffix not in ignore_fam_id_set:
                                assert np.array_equal(readdata.row,expected.row)
                            else:
                                assert np.array_equal(readdata.row[:,1],expected.row[:,1])
                            if suffix not in can_change_col_names_set:
                                assert np.array_equal(readdata.col,expected.col)
                            else:
                                assert readdata.col_count==expected.col_count
                            assert np.array_equal(readdata.row_property,expected.row_property) or (readdata.row_property.shape[1]==0 and expected.row_property.shape[1]==0)

                            if suffix in ignore_pos1_set:
                                # Compare only columns 0 and 2 of the column properties.
                                assert np.allclose(readdata.col_property[:,[0,2]],expected.col_property[:,[0,2]],equal_nan=True) or (readdata.col_property.shape[1]==0 and expected.col_property.shape[1]==0)
                            elif suffix not in ignore_pos_set:
                                assert np.allclose(readdata.col_property,expected.col_property,equal_nan=True) or (readdata.col_property.shape[1]==0 and expected.col_property.shape[1]==0)
                            else:
                                assert len(readdata.col_property)==len(expected.col_property)
                        # Best-effort cleanup: ignore file-system errors only, so that
                        # KeyboardInterrupt and programming errors are not swallowed.
                        try:
                            os.remove(filename)
                        except OSError:
                            pass
        logging.info("done with 'test_writes'")
Пример #9
0
    return test_suite

# Script entry point: only runs when this file is executed directly.
if __name__ == '__main__':
    logging.basicConfig(level=logging.WARN)

    # Disabled helper code (guarded by `if False`) that regenerates the example
    # data files under ../examples relative to this file.
    # NOTE(review): `DistHdf5` and `DistMemMap` are used below but are not in the
    # local import; presumably they are imported at module level -- confirm.
    if False:
        from pysnptools.snpreader import Bed
        from pysnptools.distreader import DistData, DistNpz
        # Create toydata.dist.npz
        currentFolder = os.path.dirname(os.path.realpath(__file__))
        if True:
            # Random, normalized distributions using the iid/sid/pos of the first 25 rows of the Bed file.
            snpreader = Bed(currentFolder + "/../examples/toydata.5chrom.bed",count_A1=True)[:25,:]
            np.random.seed(392)
            val = np.random.random((snpreader.iid_count,snpreader.sid_count,3))
            val /= val.sum(axis=2,keepdims=True)  #make probabilities sum to 1
            distdata = DistData(iid=snpreader.iid,sid=snpreader.sid,pos=snpreader.pos,val=val)
            DistNpz.write(currentFolder + "/../examples/toydata.dist.npz",distdata)
        if True:
            # Write the same data as HDF5 twice, once per sid_major setting.
            distdata = DistNpz(currentFolder + "/../examples/toydata.dist.npz").read()
            for sid_major,name_bit in [(False,'iidmajor'),(True,'snpmajor')]:
                DistHdf5.write(currentFolder + "/../examples/toydata.{0}.dist.hdf5".format(name_bit),distdata,sid_major=sid_major)
        if True:
            # First ten columns only, written as npz.
            distdata = DistNpz(currentFolder + "/../examples/toydata.dist.npz")[:,:10].read()
            DistNpz.write(currentFolder + "/../examples/toydata10.dist.npz",distdata)
        if True:
            # First ten columns only, written as a memory-mapped file.
            distdata = DistNpz(currentFolder + "/../examples/toydata.dist.npz")[:,:10].read()
            DistMemMap.write(currentFolder + "/../examples/tiny.dist.memmap",distdata)
        print('done')

    # Build the test suite and a runner that does not stop at the first failure.
    suites = getTestSuite()
    r = unittest.TextTestRunner(failfast=False)