Пример #1
0
    def test_some_std(self):
        """Smoke-test kernel reading, SnpData standardization and the standardizers.

        Asserts that the unit-standardized kernel of ``self.snpdata`` is
        reproducible and that it matches the kernel read block-wise from the
        ``examples/toydata`` Bed file. The remaining statements only verify
        that ``str(...)``, ``standardize()`` and the array set-up do not raise.
        """
        k0 = self.snpdata.read_kernel(standardizer=Unit()).val
        from pysnptools.kernelreader import SnpKernel
        k1 = self.snpdata.read_kernel(standardizer=Unit())
        np.testing.assert_array_almost_equal(k0, k1.val, decimal=10)

        # Build an independent in-memory copy and check that it can be
        # printed both before and after in-place standardization.
        from pysnptools.snpreader import SnpData
        snpdata2 = SnpData(iid=self.snpdata.iid,
                           sid=self.snpdata.sid,
                           pos=self.snpdata.pos,
                           val=np.array(self.snpdata.val))
        s = str(snpdata2)
        snpdata2.standardize()
        s = str(snpdata2)

        # The same kernel computed from disk in blocks must agree with k0.
        snpreader = Bed(self.currentFolder + "/examples/toydata",
                        count_A1=False)
        k2 = snpreader.read_kernel(standardizer=Unit(), block_size=500).val
        np.testing.assert_array_almost_equal(k0, k2, decimal=10)

        from pysnptools.standardizer.identity import Identity
        from pysnptools.standardizer.diag_K_to_N import DiagKtoN
        # Use NumPy's dtypes directly: the `sp.float64`/`sp.float32` names
        # previously used here are deprecated root-namespace aliases
        # (assuming `sp` is SciPy) of exactly these NumPy types.
        for dtype in [np.float64, np.float32]:
            for std in [Unit(), Beta(1, 25), Identity(), DiagKtoN()]:
                s = str(std)
                np.random.seed(0)
                x = np.array(np.random.randint(3, size=[60, 100]), dtype=dtype)
                x2 = x[:, ::2]  # strided view of every other column
                x2b = np.array(x2)  # freshly allocated copy of that view
                #LATER what's this about? It doesn't do non-contiguous?
                #assert not x2.flags['C_CONTIGUOUS'] and not x2.flags['F_CONTIGUOUS'] #set up to test non contiguous
                #assert x2b.flags['C_CONTIGUOUS'] or x2b.flags['F_CONTIGUOUS'] #set up to test non contiguous
                #a,b = std.standardize(x2b),std.standardize(x2)
                #np.testing.assert_array_almost_equal(a,b)
        logging.info("done")
Пример #2
0
 def _read_pstdata(self):
     """Read the whole dat-format file set into an in-memory SnpData.

     The iids come from the fam file and the sids/positions from the map
     file (both located from ``self.filename`` by replacing the "dat"
     suffix). The tab-delimited dat file itself supplies the values: its
     first column must equal the sid list, its first three columns are
     dropped, and the remaining columns (one per iid) are transposed into
     an iid-by-sid array.

     :rtype: :class:`SnpData`
     """
     iid = SnpReader._read_fam(self.filename, remove_suffix="dat")
     sid, pos = SnpReader._read_map_or_bim(self.filename,
                                           remove_suffix="dat",
                                           add_suffix="map")
     iid_count = len(iid)
     sid_count = len(sid)

     # Nothing to parse when either axis is empty; return an empty shell.
     if iid_count == 0 or sid_count == 0:
         empty_val = np.empty([iid_count, sid_count])
         return SnpData(iid=iid, sid=sid, pos=pos, val=empty_val)

     table = pd.read_csv(self.filename,
                         delimiter='\t',
                         header=None,
                         index_col=False,
                         skiprows=self.skiprows)
     sids_agree = np.array_equal(table[0], sid)
     if not sids_agree:
         raise Exception(
             "Expect snp list in map file to exactly match snp list in dat file"
         )

     # Drop the three leading non-value columns (column 0 is the sid).
     for leading_column in (0, 1, 2):
         del table[leading_column]

     assert iid_count == table.shape[
         1], "Expect # iids in fam file to match dat file"
     return SnpData(iid=iid, sid=sid, pos=pos, val=table.values.T)
Пример #3
0
    def test_some_std(self):
        """Exercise kernel reading and every standardizer without crashing.

        Verifies that reading the unit-standardized kernel twice from
        ``self.snpdata`` gives the same values and that the kernel read in
        blocks from the ``examples/toydata`` Bed file agrees with it. The
        other statements are smoke checks (``str``, ``standardize``, array
        construction) with no assertions of their own.
        """
        k0 = self.snpdata.read_kernel(standardizer=Unit()).val
        from pysnptools.kernelreader import SnpKernel
        k1 = self.snpdata.read_kernel(standardizer=Unit())
        np.testing.assert_array_almost_equal(k0, k1.val, decimal=10)

        # A copy of the data must be printable before and after standardizing.
        from pysnptools.snpreader import SnpData
        snpdata2 = SnpData(iid=self.snpdata.iid,
                           sid=self.snpdata.sid,
                           pos=self.snpdata.pos,
                           val=np.array(self.snpdata.val))
        s = str(snpdata2)
        snpdata2.standardize()
        s = str(snpdata2)

        # Block-wise kernel computation from disk must reproduce k0.
        snpreader = Bed(self.currentFolder + "/examples/toydata", count_A1=False)
        k2 = snpreader.read_kernel(standardizer=Unit(), block_size=500).val
        np.testing.assert_array_almost_equal(k0, k2, decimal=10)

        from pysnptools.standardizer.identity import Identity
        from pysnptools.standardizer.diag_K_to_N import DiagKtoN
        # NumPy dtypes are referenced through `np`; the former
        # `sp.float64`/`sp.float32` spellings are deprecated aliases of the
        # same types (assuming `sp` is SciPy).
        for dtype in [np.float64, np.float32]:
            for std in [Unit(), Beta(1, 25), Identity(), DiagKtoN()]:
                s = str(std)
                np.random.seed(0)
                x = np.array(np.random.randint(3, size=[60, 100]), dtype=dtype)
                x2 = x[:, ::2]  # every other column, as a strided view
                x2b = np.array(x2)  # newly allocated copy of the view
                #LATER what's this about? It doesn't do non-contiguous?
                #assert not x2.flags['C_CONTIGUOUS'] and not x2.flags['F_CONTIGUOUS'] #set up to test non contiguous
                #assert x2b.flags['C_CONTIGUOUS'] or x2b.flags['F_CONTIGUOUS'] #set up to test non contiguous
                #a,b = std.standardize(x2b),std.standardize(x2)
                #np.testing.assert_array_almost_equal(a,b)
        logging.info("done")
Пример #4
0
    def predict(self,
                X=None,
                K0_whole_test=None,
                K1_whole_test=None,
                iid_if_none=None):
        """
        Predict phenotype means and covariance with a fitted predictor.

        The covariates are standardized with the standardizer learned during
        training, a constant column of ones is appended, and the means are the
        product of that matrix with the fitted ``beta``. The covariance is a
        scaled identity matrix (``ssres / iid_count`` on the diagonal).

        :param X: testing covariate information, optional:
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type X: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

        :param K0_whole_test: Must be None or a :class:`KernelIdentity`; anything else fails an assertion.
        :type K0_whole_test: None or :class:`KernelIdentity`

        :param K1_whole_test: Must be None or a :class:`KernelIdentity`; anything else fails an assertion.
        :type K1_whole_test: None or :class:`KernelIdentity`

        :param iid_if_none: Examples to predict for if no X is provided.
        :type iid_if_none: an ndarray of two strings

        :rtype: A :class:`SnpData` of the means and a :class:`KernelData` of the covariance
        """
        from pysnptools.kernelreader import KernelData

        assert self.is_fitted, "Can only predict after predictor has been fitted"
        # Only the identity similarity is supported (could also accept no snps).
        for kernel_whole_test in (K0_whole_test, K1_whole_test):
            assert kernel_whole_test is None or isinstance(
                kernel_whole_test, KernelIdentity)

        covar = _pheno_fixup(X, iid_if_none=iid_if_none)
        covar = covar.read().standardize(self.covar_unit_trained)

        # add a column of 1's to cov to increase DOF of model (and accuracy) by allowing a constant offset
        bias_column = np.ones((covar.iid_count, 1))
        covar = SnpData(iid=covar.iid,
                        sid=FastLMM._new_snp_name(covar),
                        val=np.c_[covar.read().val, bias_column])
        sids_match = np.array_equal(covar.sid, self.covar_sid)
        assert sids_match, "Expect covar sids to be the same in train and test."

        # Mean prediction: one column, one row per test iid.
        pheno_predicted = covar.val.dot(self.beta).reshape(-1, 1)
        mean_snpdata = SnpData(iid=covar.iid,
                               sid=self.pheno_sid,
                               val=pheno_predicted,
                               pos=np.full((1, 3), np.nan),
                               name="linear regression Prediction")

        # Covariance: identity scaled by the residual sum of squares per training iid.
        covariance = np.eye(covar.iid_count) * self.ssres / self.iid_count
        covariance_kerneldata = KernelData(iid=covar.iid, val=covariance)
        return mean_snpdata, covariance_kerneldata
Пример #5
0
 def test_block_size_Snp2Dist(self):
     """Check that ``as_dist`` gives the same values with and without blocking.

     A small random SnpData (3 iids, 20 sids, values 0..2) is converted to
     a distribution reader once with ``block_size=1`` and once with
     ``block_size=None``; the two results must agree to 10 decimals.
     """
     from pysnptools.snpreader import SnpData
     from pysnptools.distreader._snp2dist import _Snp2Dist
     np.random.seed(0)
     sid_count = 20
     raw_counts = np.random.randint(0, 3, size=[3, sid_count])
     val = np.array(raw_counts, dtype=np.float64, order='F')
     iid = [[str(i), str(i)] for i in range(3)]
     sid = [str(i) for i in range(sid_count)]
     snpreader = SnpData(iid=iid, sid=sid, val=val)

     # Read once per block size, in the same order as before (1, then None).
     blocked, unblocked = [
         snpreader.as_dist(max_weight=2, block_size=block_size).read()
         for block_size in (1, None)
     ]
     np.testing.assert_array_almost_equal(blocked.val,
                                          unblocked.val,
                                          decimal=10)
Пример #6
0
def snpsA(seed, iid_count, sid_count, use_distributed):
    """Return cached synthetic SNPs plus a random phenotype and covariates.

    The SNP data is generated with :class:`SnpGen` (10 chromosomes) the first
    time a given parameter combination is requested and written under the
    module-level ``cache_top`` location; later calls re-open the cached copy.

    :param seed: seed for both the SNP generator and the NumPy random state
        used for the phenotype and covariates.
    :param iid_count: number of individuals to generate.
    :param sid_count: number of SNPs to generate.
    :param use_distributed: if true, store/open the data as a
        :class:`DistributedBed` (path ending in ``_db``); otherwise as a
        :class:`Bed` file (path ending in ``.bed``, ``count_A1=False``).
    :returns: ``(test_snps, pheno, covar)`` where ``pheno`` is a one-column
        and ``covar`` a two-column :class:`SnpData` over the same iids.
    """
    import numpy as np
    from pysnptools.snpreader import Bed
    from pysnptools.snpreader import DistributedBed
    from pysnptools.snpreader import SnpData
    from pysnptools.snpreader import SnpGen

    chrom_count = 10
    count_A1 = False

    # `cache_top` is only read here, so no `global` declaration is needed
    # (the previous `global top_cache` named a different, unused variable).
    suffix = "_db" if use_distributed else ".bed"
    test_snp_path = (
        cache_top /
        f"snpsA_{seed}_{chrom_count}_{iid_count}_{sid_count}{suffix}")

    if not test_snp_path.exists():
        # First request for these parameters: generate and write the cache.
        snpgen = SnpGen(
            seed=seed,
            iid_count=iid_count,
            sid_count=sid_count,
            chrom_count=chrom_count,
            block_size=1000,
        )
        if use_distributed:
            test_snps = DistributedBed.write(str(test_snp_path), snpgen)
        else:
            test_snps = Bed.write(str(test_snp_path),
                                  snpgen.read(dtype="float32"),
                                  count_A1=count_A1)
    else:
        if use_distributed:
            test_snps = DistributedBed(str(test_snp_path))
        else:
            test_snps = Bed(str(test_snp_path), count_A1=count_A1)

    # Phenotype is drawn before the covariates so that the random stream is
    # consumed in a fixed order for a given seed.
    np.random.seed(seed)
    pheno = SnpData(
        iid=test_snps.iid,
        sid=["pheno"],
        val=np.random.randn(test_snps.iid_count, 1) * 3 + 2,
    )
    covar = SnpData(
        iid=test_snps.iid,
        sid=["covar1", "covar2"],
        val=np.random.randn(test_snps.iid_count, 2) * 2 - 3,
    )

    return test_snps, pheno, covar
Пример #7
0
    def test_cpp_std(self):
        """Compare the default and ``force_python_only`` standardization paths.

        For every combination of memory order, dtype and standardizer
        (Unit, Beta(2,10)), small random train/test SnpData objects are
        standardized twice -- once with ``force_python_only=False`` and once
        with ``True`` -- and the values and trained statistics must agree
        (10 decimals for float64, 5 for float32).

        Note that the constant column and the NaN entries are written into
        the readers in place and are not undone, so they persist into the
        later iterations of the inner loops.
        """
        #Order C vs F
        for order in ['C', 'F']:
            #32 vs 64
            for dtype in [np.float64, np.float32]:
                decimal = 10 if dtype == np.float64 else 5
                #unit vs beta
                for std in [stdizer.Unit(), stdizer.Beta(2, 10)]:
                    np.random.seed(0)
                    snp_count = 20
                    train_val = np.array(np.random.randint(3, size=[3, snp_count]), dtype=dtype, order=order)
                    snpreader0 = SnpData(iid=[["0", "0"], ["1", "1"], ["2", "2"]],
                                         sid=[str(i) for i in range(snp_count)],
                                         val=train_val)
                    test_val = np.array(np.random.randint(3, size=[2, snp_count]), dtype=dtype, order=order)
                    snpreader1 = SnpData(iid=[["3", "3"], ["4", "4"]],
                                         sid=[str(i) for i in range(snp_count)],
                                         val=test_val)

                    #has SNC
                    for has_SNC_in_train in [False, True]:
                        if has_SNC_in_train:
                            # make sid 1 constant in the training data
                            snpreader0.val[:, 1] = 0

                        #missing data
                        for has_missing_data in [False, True]:
                            if has_missing_data:
                                snpreader0.val[0, 2] = np.nan
                                snpreader1.val[0, 2] = np.nan

                            #gather stats vs not
                            # Each path gets its own fresh read so that neither
                            # standardizes the other's array.
                            cppa, stdcppa = snpreader0.read(order=order, dtype=dtype).standardize(
                                std, return_trained=True, force_python_only=False)
                            pya, stdpya = snpreader0.read(order=order, dtype=dtype).standardize(
                                std, return_trained=True, force_python_only=True)
                            np.testing.assert_array_almost_equal(cppa.val, pya.val, decimal=decimal)

                            np.testing.assert_array_almost_equal(stdcppa.stats, stdpya.stats, decimal=decimal)
                            for trained in (stdcppa, stdpya):
                                assert (np.inf in trained.stats[:, 1]) == has_SNC_in_train

                            if has_SNC_in_train:
                                for standardized in (cppa, pya):
                                    expected_zeros = np.zeros([standardized.val.shape[0]])
                                    assert np.array_equal(standardized.val[:, 1], expected_zeros)

                            if has_missing_data:
                                for standardized in (cppa, pya):
                                    assert standardized.val[0, 2] == 0

                            #uses stats
                            cppb = snpreader1.read(order=order, dtype=dtype).standardize(
                                stdcppa, force_python_only=False)
                            pyb = snpreader1.read(order=order, dtype=dtype).standardize(
                                stdpya, force_python_only=True)
                            np.testing.assert_array_almost_equal(cppb.val, pyb.val, decimal=decimal)
                            #Make sure we haven't messed up the train stats
                            np.testing.assert_array_almost_equal(stdcppa.stats, stdpya.stats, decimal=decimal)

                            if has_SNC_in_train:
                                for standardized in (cppb, pyb):
                                    expected_zeros = np.zeros([standardized.val.shape[0]])
                                    assert np.array_equal(standardized.val[:, 1], expected_zeros)

                            if has_missing_data:
                                for standardized in (cppb, pyb):
                                    assert standardized.val[0, 2] == 0
        logging.info("done with 'test_cpp_std'")
Пример #8
0
    def _read_pstdata(self):
        """Load the phenotype information and return it as a SnpData.

        ``self.filename`` may be a file name (loaded with
        ``pstpheno.loadPhen``), ``None`` (an empty phenotype over
        ``self._iid_if_none`` is created), or a dictionary with the keys
        ``'header'``, ``'vals'`` and ``'iid'``. A caller-supplied dictionary
        is only read, never modified.

        One-dimensional values are reshaped to a single column. If the header
        is empty or starts with ``None``, the names ``pheno0``, ``pheno1``, ...
        are generated. The position information is filled with NaN.

        :rtype: :class:`SnpData`
        """
        #LATER switch it, so the main code is here rather than in loadPhen
        if isinstance(self.filename, str):
            pheno_input = pstpheno.loadPhen(self.filename, missing=self.missing)
        elif self.filename is None:
            assert self._iid_if_none is not None, "If input is None then iid_if_none must be given"
            pheno_input = {
                'header': np.empty((0), dtype='str'),
                'vals': np.empty((len(self._iid_if_none), 0)),
                'iid': self._iid_if_none
            }
        else:
            pheno_input = self.filename

        # Copy the entries into local names so that the fix-ups below never
        # write back into a dictionary owned by the caller.
        vals = pheno_input['vals']
        header = pheno_input['header']
        row = pheno_input['iid']

        # A single phenotype given as a vector becomes an n-by-1 matrix.
        if len(vals.shape) == 1:
            vals = np.reshape(vals, (-1, 1))

        if len(header) > 0 and header[0] is None:
            # Unnamed columns: keep their count, invent the names.
            header = ["pheno{0}".format(i) for i in range(len(header))] #LATER move to reader?
        elif len(header) == 0:
            # No header at all: one generated name per value column.
            header = ["pheno{0}".format(i) for i in range(vals.shape[1])]

        col = np.array(header, dtype='str')
        col_property = np.empty((len(col), 3))
        col_property.fill(np.nan)

        snpdata = SnpData(iid=row, sid=col, pos=col_property, val=vals)
        return snpdata
Пример #9
0
 def test_merge_std(self):
     """Check that blocked and unblocked kernel standardization agree.

     For the Beta(2, 10) and Unit standardizers, a random 3-by-20 SnpData
     is turned into a kernel with ``block_size=1`` and ``block_size=None``.
     The kernel values, the trained statistics and the diagonal factor of
     the two runs must match.
     """
     #unit vs beta
     for std in [stdizer.Beta(2, 10), stdizer.Unit()]:
         np.random.seed(0)
         sid_count = 20
         val = np.array(np.random.randint(3, size=[3, sid_count]),
                        dtype=np.float64,
                        order='F')
         snpreader = SnpData(iid=[["0", "0"], ["1", "1"], ["2", "2"]],
                             sid=[str(i) for i in range(sid_count)],
                             val=val)

         # Same computation twice: one SNP per block, then everything at once.
         outcomes = [
             SnpKernel(snpreader, std, block_size=block_size)
             ._read_with_standardizing(to_kerneldata=True,
                                       return_trained=True)
             for block_size in (1, None)
         ]
         (kerneldata0, trained0, diag0), (kerneldata1, trained1, diag1) = outcomes

         np.testing.assert_array_almost_equal(kerneldata0.val,
                                              kerneldata1.val,
                                              decimal=10)
         np.testing.assert_array_almost_equal(trained0.stats,
                                              trained1.stats,
                                              decimal=10)
         factor_gap = abs(diag0.factor - diag1.factor)
         assert factor_gap < 1e-7
Пример #10
0
 def test_pheno1(self):
     """Check that phenotype generation reproduces the stored reference.

     A phenotype is generated from the ``gen2.bed`` SNPs with a fixed seed
     and compared for equality with the reference in ``pheno1.snp.npz``.
     """
     from pysnptools.snpreader import Bed, SnpData, SnpNpz
     bed_reader = Bed(self.currentFolder + "/../../tests/datasets/generate/gen2.bed",
                      count_A1=False)
     some_snp_data = bed_reader.read()

     generated = _generate_phenotype(some_snp_data,
                                     10,
                                     genetic_var=.5,
                                     noise_var=.5,
                                     seed=5)
     gen_snpdata = SnpData(iid=some_snp_data.iid,
                           sid=["pheno"],
                           val=generated.reshape(-1, 1))
     # To regenerate the reference file, write gen_snpdata out, e.g.:
     #SnpNpz.write(r'c:\deldir\pheno1.snp.npz',gen_snpdata)
     reference_reader = SnpNpz(self.currentFolder + "/../../tests/datasets/generate/pheno1.snp.npz")
     ref_snpdata = reference_reader.read()
     assert gen_snpdata == ref_snpdata
Пример #11
0
    def _run_once(self):
        """Initialize the SnpData part of this object on first use.

        Returns immediately when ``self._ran_once`` is already true.
        Otherwise the pieces returned by ``_run_once_inner`` are used to call
        ``SnpData.__init__`` on this instance; the row properties are not
        used here.

        NOTE(review): ``_ran_once`` is not set in this method -- presumably
        ``_run_once_inner`` does that; confirm.
        """
        if self._ran_once:
            return

        loaded = self._run_once_inner()
        row_ascii, col_ascii, val, _row_property, col_property = loaded

        #!!!avoid these copies when not needed
        iid = np.array(row_ascii, dtype='str')
        sid = np.array(col_ascii, dtype='str')
        display_name = "np.memmap('{0}')".format(self._filename)

        SnpData.__init__(self,
                         iid=iid,
                         sid=sid,
                         val=val,
                         pos=col_property,
                         name=display_name)
Пример #12
0
def _snps_fixup(snp_input, iid_if_none=None, count_A1=None):
    if isinstance(snp_input, str):
        return Bed(snp_input, count_A1=count_A1)

    if isinstance(snp_input, dict):
        return SnpData(iid=snp_input['iid'],
                       sid=snp_input['header'],
                       val=snp_input['vals'])

    if snp_input is None:
        assert iid_if_none is not None, "snp_input cannot be None here"
        return SnpData(iid_if_none,
                       sid=np.empty((0), dtype='str'),
                       val=np.empty((len(iid_if_none), 0)),
                       pos=np.empty((0, 3)),
                       name="")  #todo: make a static factory method on SnpData

    return snp_input
    def predict(self,X=None,K0_whole_test=None,K1_whole_test=None,iid_if_none=None,count_A1=None):
        """
        Compute predicted means and their covariance from a fitted predictor.

        The covariates are standardized with the standardizer trained during
        fitting and extended by a column of ones. The means are that matrix
        times the fitted ``beta``; the covariance is the identity scaled by
        ``ssres / iid_count``.

        :param X: testing covariate information, optional:
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type X: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

        :param K0_whole_test: Must be None or a :class:`KernelIdentity`; anything else fails an assertion.
        :type K0_whole_test: None or :class:`KernelIdentity`

        :param K1_whole_test: Must be None or a :class:`KernelIdentity`; anything else fails an assertion.
        :type K1_whole_test: None or :class:`KernelIdentity`

        :param iid_if_none: Examples to predict for if no X is provided.
        :type iid_if_none: an ndarray of two strings

        :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
             alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
        :type count_A1: bool

        :rtype: A :class:`SnpData` of the means and a :class:`KernelData` of the covariance
        """
        from pysnptools.kernelreader import KernelData

        assert self.is_fitted, "Can only predict after predictor has been fitted"
        # Only identity kernels are supported here (could also accept no snps).
        k0_ok = K0_whole_test is None or isinstance(K0_whole_test, KernelIdentity)
        assert k0_ok
        k1_ok = K1_whole_test is None or isinstance(K1_whole_test, KernelIdentity)
        assert k1_ok

        covar_reader = _pheno_fixup(X, iid_if_none=iid_if_none, count_A1=count_A1)
        covar_std = covar_reader.read().standardize(self.covar_unit_trained)

        # add a column of 1's to cov to increase DOF of model (and accuracy) by allowing a constant offset
        design = SnpData(iid=covar_std.iid,
                         sid=FastLMM._new_snp_name(covar_std),
                         val=np.c_[covar_std.read().val, np.ones((covar_std.iid_count, 1))])
        assert np.array_equal(design.sid, self.covar_sid), "Expect covar sids to be the same in train and test."

        # Means: a single column with one row per test iid.
        pheno_predicted = design.val.dot(self.beta).reshape(-1, 1)
        unknown_pos = np.array([[np.nan, np.nan, np.nan]])
        ret0 = SnpData(iid=design.iid,
                       sid=self.pheno_sid,
                       val=pheno_predicted,
                       pos=unknown_pos,
                       name="linear regression Prediction")

        # Covariance: identity scaled by the residual sum of squares per training iid.
        scaled_identity = np.eye(design.iid_count) * self.ssres / self.iid_count
        ret1 = KernelData(iid=design.iid, val=scaled_identity)
        return ret0, ret1
Пример #14
0
    def _sel_plus_pc(self, h2, force_low_rank, force_full_rank, count_A1=None):
        """Run ``single_snp_select`` with auto-selected PCs as covariates.

        The principal components are computed from the synthetic Bed file
        (and written to ``sel_plus_pc.pcs.txt`` in the temp output folder),
        then ``single_snp_select`` is run over a fixed list of ``k`` values
        and the result is compared with the stored reference via
        ``self.compare_files``.

        :param h2: passed on to ``single_snp_select``; also selects the
            result-file tag ("h2IsHalf" when it equals .5, else "h2Search").
        :param force_low_rank: passed on to ``single_snp_select``.
        :param force_full_rank: passed on to ``single_snp_select``.
        :param count_A1: passed on to ``compute_auto_pcs`` only; the
            ``single_snp_select`` call always uses ``count_A1=False``.
        """
        do_plot = False
        use_cache = False

        # define file names
        bed_fn = self.pythonpath + "/tests/datasets/synth/all.bed"
        phen_fn = self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt"

        pcs_fn = os.path.join(self.tempout_dir, "sel_plus_pc.pcs.txt")
        if use_cache and os.path.exists(pcs_fn):
            logging.info("Using top pcs's cache")
            covar = Pheno(pcs_fn)
        else:
            from fastlmm.util import compute_auto_pcs
            covar = compute_auto_pcs(bed_fn, count_A1=count_A1)
            pc_count = covar["vals"].shape[1]
            logging.info("selected number of PCs: {0}".format(pc_count))
            pcs_snpdata = SnpData(iid=covar['iid'],
                                  sid=covar['header'],
                                  val=covar['vals'])
            Pheno.write(pcs_fn, pcs_snpdata)

        mf_name = "lmp"  #"lmpl" "local", "coreP", "nodeP", "socketP", "nodeE", "lmp"
        # Constructed but currently not handed to single_snp_select (see below).
        runner = mf_to_runner_function(mf_name)(20)

        progress_message = "Working on h2={0},force_low_rank={1},force_full_rank={2}".format(
            h2, force_low_rank, force_full_rank)
        logging.info(progress_message)

        if h2 == .5:
            h2_tag = "h2IsHalf"
        else:
            h2_tag = "h2Search"
        result_file_name = "sel_plus_pc_{0}".format(h2_tag)
        output_file_name = os.path.join(self.tempout_dir,
                                        result_file_name) + ".txt"

        # 0..10, then progressively coarser steps up to 1000.
        k_list = list(range(11)) + [
            20, 30, 40, 50, 60, 70, 80, 90, 100,
            125, 160, 200, 250, 320, 400, 500, 630, 800, 1000,
        ]
        results = single_snp_select(
            test_snps=bed_fn,
            G=bed_fn,
            pheno=phen_fn,
            k_list=k_list,
            h2=h2,
            n_folds=self.pythonpath + "/tests/datasets/synth/DebugEmitFolds.txt",
            covar=covar,
            output_file_name=output_file_name,
            force_low_rank=force_low_rank,
            force_full_rank=force_full_rank,
            GB_goal=2,
            count_A1=False
            #runner = runner
        )
        logging.info(results.head())
        self.compare_files(results, result_file_name)
Пример #15
0
    def read(self,
             order='F',
             dtype=np.float64,
             force_python_only=False,
             view_ok=False):
        """Reads the SNP values and returns a :class:`.SnpData` (with :attr:`.SnpData.val` property containing a new ndarray of the SNP values).

        :param order: {'F' (default), 'C', 'A'}, optional -- Memory layout of the returned ndarray.
            'F' (default) gives an F-contiguous array (iid-index varies the fastest),
            'C' gives a C-contiguous array (sid-index varies the fastest), and
            'A' lets the :attr:`.SnpData.val` ndarray be in either order (C- or Fortran-contiguous).
        :type order: string or None

        :param dtype: {numpy.float64 (default), numpy.float32}, optional -- The data-type for the :attr:`SnpData.val` ndarray.
        :type dtype: data-type

        :param force_python_only: optional -- If False (default), may use outside library code. If True, requests that the read
            be done without outside library code.
        :type force_python_only: bool

        :param view_ok: optional -- If False (default), allocates new memory for the :attr:`SnpData.val`'s ndarray. If True,
            if practical and reading from a :class:`SnpData`, will return a new
            :class:`SnpData` with a ndarray that shares memory with the original :class:`SnpData`.
            Typically, you'll also wish to use "order='A'" to increase the chance that sharing will be possible.
            Use these parameters with care because any change to either ndarray (for example, via :meth:`.SnpData.standardize`) will affect
            the others. Also keep in mind that :meth:`read` relies on ndarray's mechanisms to decide whether to actually
            share memory and so it may ignore your suggestion and allocate a new ndarray anyway.
        :type view_ok: bool

        :rtype: :class:`.SnpData`

        Calling the method again causes the SNP values to be re-read and creates a new in-memory :class:`.SnpData` with a new ndarray of SNP values.

        If you request the values for only a subset of the sids or iids, (to the degree practical) only that subset will be read from disk.

        :Example:

        >>> from pysnptools.snpreader import Bed
        >>> snp_on_disk = Bed('../../tests/datasets/all_chr.maf0.001.N300.bed',count_A1=False) # Specify SNP data on disk
        >>> snpdata1 = snp_on_disk.read() # Read all the SNP data returning a SnpData instance
        >>> print(type(snpdata1.val).__name__) # The SnpData instance contains a ndarray of the data.
        ndarray
        >>> subset_snpdata = snp_on_disk[:,::2].read() # From the disk, read SNP values for every other sid
        >>> print(subset_snpdata.val[0,0]) # Print the first SNP value in the subset
        2.0
        >>> subsub_snpdata = subset_snpdata[:10,:].read(order='A',view_ok=True) # Create an in-memory subset of the subset with SNP values for the first ten iids. Share memory if practical.
        >>> import numpy as np
        >>> # print np.may_share_memory(subset_snpdata.val, subsub_snpdata.val) # Do the two ndarray's share memory? They could. Currently they won't.       
        """
        # Normalize whatever was passed (type object, string, ...) to a numpy dtype.
        resolved_dtype = np.dtype(dtype)
        # None, None: no iid/sid index restriction is passed to the low-level read.
        val = self._read(None, None, order, resolved_dtype, force_python_only,
                         view_ok)
        # Imported here rather than at module level, as in the original code.
        from pysnptools.snpreader import SnpData
        return SnpData(self.iid, self.sid, val, pos=self.pos, name=str(self))
Пример #16
0
def _create_covar_chrom(covar, covar_by_chrom, chrom,count_A1=None):
    if covar_by_chrom is not None:
        covar_by_chrom_chrom = covar_by_chrom[chrom]
        covar_by_chrom_chrom = _pheno_fixup(covar_by_chrom_chrom, iid_if_none=covar,count_A1=count_A1)
        covar_after,  covar_by_chrom_chrom = pstutil.intersect_apply([covar,  covar_by_chrom_chrom])
        ret = SnpData(iid=covar_after.iid,sid=np.r_[covar_after.sid,covar_by_chrom_chrom.sid],
                      val=np.c_[covar_after.read(order='A',view_ok=True).val,
                                covar_by_chrom_chrom.read(order='A',view_ok=True).val]) #view_ok because np.c_ will allocate new memory.
        return ret
    else:
        return covar
Пример #17
0
    def g_mix(self,K0,K1):
        """Build the standardized (and, if needed, mixed) train/test SNP pair.

        * If ``self.mixing == 1`` or ``K0`` is a ``KernelIdentity``, only
          ``K1`` is used.
        * Else if ``self.mixing == 0`` or ``K1`` is a ``KernelIdentity``,
          only ``K0`` is used.
        * Otherwise the standardized SNPs of both are combined with
          ``_mix_from_Gs`` into arrays holding the sids of ``K0`` followed by
          those of ``K1``.

        In every case the SNPs are re-read and standardized with the trained
        SNP and kernel standardizers, and the result is a ``_SnpTrainTest``
        with an identity standardizer.
        """
        def standardized(reader, snp_trained, kernel_trained):
            #!!!later this a good place to read?
            return reader.read().standardize(snp_trained).standardize(kernel_trained)

        def as_train_test(train, test):
            return _SnpTrainTest(train=train,test=test,standardizer=SS_Identity(), block_size=None)

        def joined(first, second):
            return np.concatenate((first,second),axis=0)

        mixing = self.mixing

        if mixing == 1 or isinstance(K0, KernelIdentity):
            assert K1.standardizer is self.snp_trained1, "real assert"
            G_train = standardized(K1.train, self.snp_trained1, self.kernel_trained1)
            G_test = standardized(K1.test, self.snp_trained1, self.kernel_trained1)
            return as_train_test(G_train, G_test)

        if mixing == 0 or isinstance(K1, KernelIdentity):
            assert K0.standardizer is self.snp_trained0, "real assert"
            G_train = standardized(K0.train, self.snp_trained0, self.kernel_trained0)
            G_test = standardized(K0.test, self.snp_trained0, self.kernel_trained0)
            return as_train_test(G_train, G_test)

        #!!!later why are we processing the training data again????
        assert K0.standardizer is self.snp_trained0, "real assert"
        assert isinstance(K0, _SnpTrainTest), "Expect K0 to be a _SnpTrainTest"
        assert K1.standardizer is self.snp_trained1, "real assert"
        G0_train = standardized(K0.train, self.snp_trained0, self.kernel_trained0)
        G1_train = standardized(K1.train, self.snp_trained1, self.kernel_trained1)
        G0_test = standardized(K0.test, self.snp_trained0, self.kernel_trained0)
        G1_test = standardized(K1.test, self.snp_trained1, self.kernel_trained1)

        # Output buffers: one column per sid of K0 plus one per sid of K1.
        total_sid_count = K0.train.sid_count + K1.train.sid_count
        train_val = np.empty((K0.iid0_count, total_sid_count))
        test_val = np.empty((K0.iid1_count, total_sid_count))
        _mix_from_Gs(train_val, G0_train.val, G1_train.val, self.mixing)
        _mix_from_Gs(test_val, G0_test.val, G1_test.val, self.mixing)

        # Both outputs are labelled with the training sids and positions.
        G_train = SnpData(iid=K0.iid0,
                          sid=joined(K0.train.sid, K1.train.sid),
                          val=train_val,
                          name="{0}&{1}".format(G0_train,G1_train),
                          pos=joined(K0.train.pos, K1.train.pos))
        G_test = SnpData(iid=K0.iid1,
                         sid=joined(K0.train.sid, K1.train.sid),
                         val=test_val,
                         name="{0}&{1}".format(G0_test,G1_test),
                         pos=joined(K0.train.pos, K1.train.pos))
        return as_train_test(G_train, G_test)
Пример #18
0
def _snps_fixup(snp_input, iid_if_none=None):
    if isinstance(snp_input, str):
        return Bed(snp_input)
    if snp_input is None:
        assert iid_if_none is not None, "snp_input cannot be None here"
        return SnpData(
            iid_if_none,
            sid=np.empty((0), dtype='str'),
            val=np.empty((len(iid_if_none), 0)),
            pos=np.empty((0, 3)),
            parent_string="")  #todo: make a static factory method on SnpData

    return snp_input
Пример #19
0
 def test_covar_by_chrom_mixing(self):
     """
     Run single_snp with both a global covariate and per-chromosome covariates.

     The global covariate is the cov_fn data re-wrapped as a SnpData whose single
     column is named "pheno-1"; every chromosome 1..5 is mapped to the cov_fn file.
     The resulting frame is handed to self.compare_files under the name
     "covar_by_chrom_mixing".
     """
     logging.info(
         "TestSingleSnpLeaveOutOneChrom test_covar_by_chrom_mixing")
     test_snps = Bed(self.bedbase)
     pheno = self.phen_fn
     # Read the covariate file once and rename its column.
     covar_from_file = Pheno(self.cov_fn).read()
     covar = SnpData(iid=covar_from_file.iid,
                     sid=["pheno-1"],
                     val=covar_from_file.val)
     # range (not xrange) so the test runs on both Python 2 and Python 3.
     covar_by_chrom = {chrom: self.cov_fn for chrom in range(1, 6)}
     output_file = self.file_name("covar_by_chrom_mixing")
     frame = single_snp(test_snps,
                        pheno,
                        covar=covar,
                        covar_by_chrom=covar_by_chrom,
                        output_file_name=output_file)
     self.compare_files(frame, "covar_by_chrom_mixing")
Пример #20
0
 def _read_pstdata(self):
     """
     Load the whole file at self.filename into an in-memory SnpData.

     SNP ids and positions come from SnpReader._read_map_or_bim (suffix "ped"
     swapped for "map"). The first two text columns become the iids and the
     columns from index 6 onward are treated as pairs of allele strings, one
     pair per SNP. A SNP value is NaN when the first string of its pair equals
     self.missing; otherwise it is the number of strings in the pair that equal
     the first allele string of the first non-missing row for that SNP.
     """
     sid, pos = SnpReader._read_map_or_bim(self.filename,
                                           remove_suffix="ped",
                                           add_suffix="map")
     table = np.loadtxt(self.filename, dtype='str', comments=None)
     table = table.reshape(-1, table.shape[-1])  #Turns 1-d row into 2-d
     iid = table[:, 0:2]
     allele_strs = table[:, 6::]
     is_missing = allele_strs == self.missing

     iid_count, allele_col_count = allele_strs.shape
     sid_count = allele_col_count // 2
     val = np.zeros((iid_count, sid_count))
     for sid_index in range(sid_count):
         first_col = 2 * sid_index
         missing_rows = is_missing[:, first_col]
         present_rows = ~missing_rows
         val[missing_rows, sid_index] = np.nan
         allele_pairs = allele_strs[present_rows, first_col:first_col + 2]
         if allele_pairs.shape[0] > 0:
             # Count, per row, how many of the two alleles match the reference allele.
             matches = allele_pairs == allele_pairs[0, 0]
             val[present_rows, sid_index] += matches.sum(1)
     return SnpData(iid=iid, sid=sid, pos=pos, val=val)
Пример #21
0
 def too_slow_test_write_bedbig(self):
     """
     Round-trip an all-zero 100000 x 50000 SnpData through Bed.write / Bed(...).read()
     and check that the values read back match those written.
     """
     from pysnptools.snpreader import SnpData
     iid_count, sid_count = 100000, 50000
     iid = np.array([[str(i), str(i)] for i in range(iid_count)])
     sid = np.array(["sid_{0}".format(i) for i in range(sid_count)])
     pos = np.array([[i] * 3 for i in range(sid_count)])
     np.random.seed(0)
     zeros = np.zeros((iid_count, sid_count))
     #random.choice((0.0,1.0,2.0,float("nan")),size=(iid_count,sid_count)))
     snpdata = SnpData(iid, sid, zeros, pos=pos)
     output = "tempdir/bedbig.{0}.{1}".format(iid_count, sid_count)
     create_directory_if_necessary(output)
     Bed.write(output, snpdata, count_A1=False)
     read_back = Bed(output, count_A1=False).read()
     np.testing.assert_array_almost_equal(snpdata.val, read_back.val, decimal=10)
Пример #22
0
    def test_respect_inputs(self):
        """
        Compare read_kernel output against a kernel built directly from standardized values.

        Every combination of starting dtype/order, SNP count, standardizer, reader
        (the whole SnpData or the same data minus its first SNP column) and requested
        dtype/order is tried. Each kernel is passed to
        PstReader._array_properties_are_ok and compared with val.dot(val.T) of the
        python-standardized data, to the looser of the two precisions involved.
        """
        np.random.seed(0)
        dtype_and_decimal_list = [(np.float32, 5), (np.float64, 10)]
        order_list = ['F', 'C', 'A']
        for dtype_start, decimal_start in dtype_and_decimal_list:
            for order_start in order_list:
                for sid_count in [20, 2]:
                    val_start = np.array(np.random.randint(3, size=[3, sid_count]),
                                         dtype=dtype_start,
                                         order=order_start)
                    snpdataX = SnpData(iid=[["0", "0"], ["1", "1"], ["2", "2"]],
                                       sid=[str(i) for i in range(sid_count)],
                                       val=val_start)
                    # Fresh standardizer instances for every starting dataset.
                    stdx_list = [stdizer.Beta(1, 25), stdizer.Identity(), stdizer.Unit()]
                    for stdx in stdx_list:
                        for snpreader0 in [snpdataX, snpdataX[:, 1:]]:
                            snpreader1 = snpreader0[1:, :]

                            data0 = snpreader0.read()
                            refdata0, trained_standardizer = data0.standardize(
                                stdx, return_trained=True, force_python_only=True)
                            refval0 = refdata0.val.dot(refdata0.val.T)
                            data1 = snpreader1.read()
                            #LATER why aren't these used?
                            refdata1 = data1.standardize(trained_standardizer,
                                                         force_python_only=True)
                            #LATER why aren't these used?
                            refval1 = refdata0.val.dot(refdata1.val.T)
                            for dtype_goal, decimal_goal in dtype_and_decimal_list:
                                decimal = min(decimal_start, decimal_goal)
                                for order_goal in order_list:
                                    k = snpreader0.read_kernel(standardizer=stdx,
                                                               block_size=1,
                                                               order=order_goal,
                                                               dtype=dtype_goal)
                                    PstReader._array_properties_are_ok(
                                        k.val, order_goal, dtype_goal)
                                    np.testing.assert_array_almost_equal(
                                        refval0, k.val, decimal=decimal)
Пример #23
0
    def test_one(self):
        '''
        Regression test on arbitrary data (meaningful runs would take too long):
        run heritability_spatial_correction with just_testing=True, write the
        result as tab-separated text and compare it with the stored reference file.
        '''
        fn = "one.txt"
        logging.info(fn)
        tmp_file = self.file_name(fn)

        # Duplicate the phenotype values side by side so there are two phenotype columns.
        single_val = self.pheno_whole.read().val
        pheno = SnpData(iid=self.pheno_whole.iid,
                        sid=["pheno0", "pheno1"],
                        val=np.c_[single_val, single_val])

        spatial_coor = [[i, -i] for i in xrange(self.snpreader_whole.iid_count)]
        alpha_list_big = [int(v) for v in np.logspace(2, np.log10(4000), 2)]
        alpha_list = alpha_list_big
        dataframe = heritability_spatial_correction(
            self.snpreader_whole,
            spatial_coor,
            self.snpreader_whole.iid,
            alpha_list,
            2,
            pheno,
            jackknife_count=2,
            permute_plus_count=1,
            permute_times_count=1,
            just_testing=True)

        dataframe.to_csv(tmp_file, sep="\t", index=False)
        reference_file = TestFeatureSelection.reference_file(
            "heritability_spatial_correction/" + fn)
        is_match, msg = ut.compare_files(tmp_file, reference_file, tolerance)
        self.assertTrue(
            is_match,
            "msg='{0}', ref='{1}', tmp='{2}'".format(msg, reference_file, tmp_file))
Пример #24
0
def gen_Test_Bed(filename, n0, n1, m):
    """
    Write a synthetic Bed file set and rewrite the sixth column of its .fam file.

    The data has ``n0 + n1`` individuals and ``m`` SNPs. The first ``n1`` rows
    have every SNP value 2.0 and the remaining ``n0`` rows have every value 0.0.
    After Bed.write, ``filename + ".fam"`` is rewritten so that field index 5 is
    "2" for the first ``n1`` lines and "1" for the rest; fields are re-joined
    with single spaces.

    :param filename: base name passed to Bed.write; ".fam" is appended to find the fam file.
    :param n0: number of rows given value 0.0 and fam field "1".
    :param n1: number of rows given value 2.0 and fam field "2".
    :param m: number of SNPs.
    """
    n = n0 + n1
    iid = [["fam_" + str(i), "iid_" + str(i)] for i in range(n)]
    sid = ["snp_" + str(i) for i in range(m)]
    X = [[2.0] * m for _ in range(n1)]
    X.extend([0.0] * m for _ in range(n0))
    dat = SnpData(iid=iid, sid=sid, val=X)
    Bed.write(filename, dat)

    fam_filename = filename + ".fam"
    # Context managers guarantee the handles are closed even if a line is malformed.
    with open(fam_filename) as fil:
        lines = fil.readlines()
    with open(fam_filename, "w") as fil:
        for i, line in enumerate(lines):
            fields = line.strip().split()
            fields[5] = "2" if i < n1 else "1"
            fil.write(" ".join(fields) + "\n")
Пример #25
0
    def test_multipheno(self):
        """
        Check that single_snp_scale on several phenotypes at once gives the same
        p-values as running single_snp separately on each phenotype column.

        For 2, 5 and 1 random phenotype columns, the per-SNP sorted p-values of
        the two approaches must agree to within 1e-5. The comparison ignores
        which phenotype produced which p-value.
        """
        logging.info("test_multipheno")

        random_state = RandomState(29921)
        pheno_reference = Pheno(self.phen_fn).read()
        for pheno_count in [2, 5, 1]:
            val = random_state.normal(loc=pheno_count,
                                      scale=pheno_count,
                                      size=(pheno_reference.iid_count,
                                            pheno_count))
            pheno_col = ['pheno{0}'.format(i) for i in range(pheno_count)]
            pheno_multi = SnpData(iid=pheno_reference.iid,
                                  sid=pheno_col,
                                  val=val)

            reference = pd.concat([
                single_snp(test_snps=self.bed,
                           pheno=pheno_multi[:, pheno_index],
                           covar=self.cov_fn)
                for pheno_index in range(pheno_count)
            ])
            frame = single_snp_scale(test_snps=self.bed,
                                     pheno=pheno_multi,
                                     covar=self.cov_fn)

            # The message no longer refers to an undefined name, so a failure
            # reports the mismatch instead of raising NameError.
            assert len(frame) == len(reference), \
                "# of rows differs from reference: {0} vs {1}".format(
                    len(frame), len(reference))
            for sid in sorted(set(reference.SNP)):  #This ignores which pheno produces which pvalue
                pvalue_frame = np.array(
                    sorted(frame[frame['SNP'] == sid].PValue))
                pvalue_reference = np.array(
                    sorted(reference[reference['SNP'] == sid].PValue))
                # .all() must be *called*; the bare bound method is always truthy.
                assert (
                    abs(pvalue_frame - pvalue_reference) < 1e-5
                ).all(), "pair {0} differs too much from reference".format(sid)
Пример #26
0
    def generate_and_analyze(seed,
                             N,
                             do_shuffle,
                             just_testing=True,
                             map_function=None,
                             cache_folder=None):
        """
        Simulate SNPs, 2-D locations and a phenotype, then run
        heritability_spatial_correction on them.

        The phenotype is the sum of three parts: a draw with covariance K_causal
        (the standardized SNP kernel), standard normal noise, and a draw with
        covariance K_loc (the standardized spatial similarity kernel). When
        ``do_shuffle`` is true the rows of the location part are permuted.

        :param seed: seed for snp_gen and for each numpy random draw.
        :param N: passed to snp_gen as iid_count.
        :param do_shuffle: whether to permute the location-driven phenotype part.
        :param just_testing: if true only the single generating alpha is tried,
            otherwise a 100-point log-spaced list from 100 to 1e10.
        :param map_function: forwarded to heritability_spatial_correction.
        :param cache_folder: forwarded to heritability_spatial_correction.
        :rtype: whatever heritability_spatial_correction returns (also logged).
        """
        #Generate SNPs
        snpdata = snp_gen(fst=.1, dfr=0, iid_count=N, sid_count=1000,
                          chr_count=10, label_with_pop=True, seed=seed)
        K_causal = snpdata.read_kernel(Unit()).standardize()

        #Generate geo-spatial locations and K_loc
        distance_between_centers = 2500000
        center_dict = {
            "0": (distance_between_centers * 0.5, distance_between_centers),
            "1": (distance_between_centers * 1.5, distance_between_centers),
        }
        sd = distance_between_centers / 4.

        spatial_iid = snpdata.iid
        centers = np.array([center_dict[iid_row[0]] for iid_row in spatial_iid])
        np.random.seed(seed)
        logging.info("Generating positions for seed {0}".format(seed))
        unit_offsets = np.random.multivariate_normal(
            [0, 0], [[1, 0], [0, 1]], size=len(centers))
        spatial_coor = SnpData(iid=snpdata.iid,
                               sid=["x", "y"],
                               val=centers + unit_offsets * sd,
                               parent_string="'spatial_coor_gen_original'")
        alpha = distance_between_centers
        spatial_val = spatial_similarity(spatial_coor.val, alpha, power=2)
        K_loc = KernelData(iid=snpdata.iid, val=spatial_val).standardize()

        #Generate phenotype
        iid = K_causal.iid
        iid_count = K_causal.iid_count

        np.random.seed(seed)
        causal_val = np.random.multivariate_normal(
            np.zeros(iid_count), K_causal.val).reshape(-1, 1)
        pheno_causal = SnpData(iid=iid, sid=["causal"], val=causal_val,
                               parent_string="causal")

        np.random.seed(seed ^ 998372)
        noise_val = np.random.normal(size=iid_count).reshape(-1, 1)
        pheno_noise = SnpData(iid=iid, sid=["noise"], val=noise_val,
                              parent_string="noise")

        np.random.seed(seed ^ 12230302)
        loc_val = np.random.multivariate_normal(
            np.zeros(iid_count), K_loc.val).reshape(-1, 1)
        pheno_loc_original = SnpData(iid=iid, sid=["loc_original"], val=loc_val,
                                     parent_string="loc_original")

        pheno_loc = pheno_loc_original
        if do_shuffle:
            idx = np.arange(iid_count)
            np.random.seed(seed)
            np.random.shuffle(idx)
            #don't need to copy, because the next line will be fresh memory
            pheno_loc = pheno_loc_original.read(view_ok=True)
            pheno_loc.val = pheno_loc.val[idx, :]

        total_val = pheno_causal.val + pheno_noise.val + pheno_loc.val
        pheno = SnpData(iid=iid, sid=["pheno_all"], val=total_val)

        #Analyze data
        full_alpha_list = [
            int(v) for v in np.logspace(np.log10(100), np.log10(1e10), 100)
        ]
        alphas_to_try = [alpha] if just_testing else full_alpha_list
        dataframe = heritability_spatial_correction(
            snpdata,
            spatial_coor.val,
            spatial_iid,
            alpha_list=alphas_to_try,
            pheno=pheno,
            alpha_power=2,
            jackknife_count=0,
            permute_plus_count=0,
            permute_times_count=0,
            just_testing=just_testing,
            map_function=map_function,
            cache_folder=cache_folder)

        logging.info(dataframe)
        return dataframe
Пример #27
0
    # Two centers with the same y coordinate; their x coordinates are 0.5 and 1.5
    # times distance_between_centers, so they are exactly that distance apart.
    distance_between_centers = 2500000
    x0 = distance_between_centers * 0.5
    x1 = distance_between_centers * 1.5
    y0 = distance_between_centers
    y1 = distance_between_centers
    # Scale applied below to the unit-covariance random offsets around each center.
    sd = distance_between_centers / 4.

    spatial_iid = snpdata.iid
    # Each individual's center is looked up from the first field of its iid.
    # NOTE(review): only "0" and "1" are mapped here although color_dict below also
    # lists "2"; a first field of "2" would raise KeyError when building centers -- confirm
    # that value cannot occur.
    center_dict = {"0": (x0, y0), "1": (x1, y1)}
    centers = np.array([center_dict[iid_item[0]] for iid_item in spatial_iid])
    # Seed immediately before the draw so the positions are reproducible for a given seed.
    np.random.seed(seed)
    logging.info("Generating positions for seed {0}".format(seed))
    # Position = center + (2-D standard normal offset) * sd, stored as columns "x" and "y".
    spatial_coor_gen_original = SnpData(
        iid=snpdata.iid,
        sid=["x", "y"],
        val=centers + np.random.multivariate_normal([0, 0], [[1, 0], [0, 1]],
                                                    size=len(centers)) * sd,
        parent_string="'spatial_coor_gen_original'")

    if do_plot:
        # Scatter plot of the generated positions, colored by the first iid field.
        import matplotlib.pyplot as plt
        color_dict = {"0": "r", "1": "b", "2": "g"}
        colors = [color_dict[iid_item] for iid_item in snpdata.iid[:, 0]]
        plt.axis('equal')
        plt.scatter(spatial_coor_gen_original.val[:, 0],
                    spatial_coor_gen_original.val[:, 1],
                    c=colors)
        plt.show()

    # Function-scope imports; the code that uses them lies outside this excerpt.
    from fastlmm.association.heritability_spatial_correction import spatial_similarity
    from pysnptools.kernelreader import KernelData
Пример #28
0
# [ 2.  2.  2. ...,  1.  2.  2.]
# [ 2.  2.  2. ...,  1.  2.  2.]
# [ 2.  2.  2. ...,  2.  0.  2.]]

# snpdata.val is a NumPy array, so any NumPy function can be applied to it.
print np.mean(snpdata.val)
#1.478588

# If all you want is to read the data into a NumPy array, it can be done in one line:
print np.mean(Bed("all.bed").read().val)

# You can also create a SnpData object from scratch (without reading from a SnpReader).
from pysnptools.snpreader import SnpData

snpdata1 = SnpData(iid=[['f1', 'c1'], ['f1', 'c2'], ['f2', 'c1']],
                   sid=['snp1', 'snp2'],
                   val=[[0, 1], [2, .5], [.5, np.nan]])
# nanmean ignores the NaN entry: (0 + 1 + 2 + .5 + .5) / 5
print np.nanmean(snpdata1.val)
# 0.8

# Review: SnpReader, Bed and SnpData, and their common attributes, including val.

# Topics: reading subsets of data, reading with re-ordered iids & sids (rows & cols), stacking.

# Reading just one SNP
snpreader = Bed("all.bed")
snp0reader = snpreader[:, 0]
print snp0reader, snp0reader.iid_count, snp0reader.sid_count, snp0reader.sid
# Bed('all.bed')[:,0] 500 1 ['snp625_m0_.03m1_.07']
print snpreader
# Bed("all.bed")
    def score(self, X=None, y=None, K0_whole_test=None, K1_whole_test=None, iid_if_none=None, return_mse_too=False, return_per_iid=False, count_A1=None):
        """
        Method for calculating the negative log likelihood of testing examples.
        If the examples in X,y,  K0_whole_test, K1_whole_test are not the same, they will be reordered and intersected.

        :param X: testing covariate information, optional: 
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type X: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

        :param y: testing phenotype:
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type y: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

        :param K0_whole_test: A similarity matrix from all the examples to the test examples. Alternatively,
               the test SNPs needed to construct such a similarity matrix.
               Can be any :class:`.SnpReader`. If you give a string, can be the name of a PLINK-formated Bed file.
               Can be PySnpTools :class:`.KernelReader`. If you give a string it can be the name of a :class:`.KernelNpz` file.
        :type K0_whole_test: :class:`.SnpReader` or a string or :class:`.KernelReader`

        :param K1_whole_test: A second similarity matrix from all the examples to the test examples. Alternatively,
               the test SNPs needed to construct such a similarity matrix.
               Can be any :class:`.SnpReader`. If you give a string, can be the name of a PLINK-formated Bed file.
               Can be PySnpTools :class:`.KernelReader`. If you give a string it can be the name of a :class:`.KernelNpz` file.
        :type K1_whole_test: :class:`.SnpReader` or a string or :class:`.KernelReader`

        :param iid_if_none: Examples to predict for if no X, K0_whole_test, K1_whole_test is provided.
        :type iid_if_none: an ndarray of two strings

        :param return_mse_too: If true, will also return the squared error. Note that the value
             returned is the *sum* of squared differences between the actual and predicted
             values; it is not divided by the number of examples.
        :type return_mse_too: bool

        :param return_per_iid: If true, returns a :class:`SnpData` with one column, 'nLL', giving the
             negative log likelihood of each example under its own predicted mean and variance.
             Cannot currently be combined with ``return_mse_too``.
        :type return_per_iid: bool

        :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
             alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
        :type count_A1: bool

        :rtype: a float of the negative log likelihood and, optionally, a float of the squared error;
             or, when ``return_per_iid`` is true, a :class:`SnpData` of per-example negative log likelihoods.

        :raises Exception: if both ``return_per_iid`` and ``return_mse_too`` are true.
        """
        mean0, covar0 = self.predict(K0_whole_test=K0_whole_test,K1_whole_test=K1_whole_test,X=X,iid_if_none=iid_if_none,count_A1=count_A1)
        y = _pheno_fixup(y, iid_if_none=covar0.iid,count_A1=count_A1)
        mean, covar, y = intersect_apply([mean0, covar0, y])
        mean = mean.read(order='A',view_ok=True).val
        covar = covar.read(order='A',view_ok=True).val
        y_actual = y.read().val
        if not return_per_iid:
            var = multivariate_normal(mean=mean.reshape(-1), cov=covar)
            # Use logpdf directly: pdf() underflows to 0 for larger test sets,
            # which would make -log(pdf) infinite.
            nll = -var.logpdf(y_actual.reshape(-1))
            if not return_mse_too:
                return nll
            mse = ((y_actual-mean)**2).sum()
            return nll, mse

        if return_mse_too:
            raise Exception("need code for mse_too")

        result = SnpData(iid=y.iid,sid=['nLL'],val=np.empty((y.iid_count,1)),name="nLL")
        for iid_index in xrange(y.iid_count):
            # Each example is scored on its own, using only its diagonal variance.
            var = multivariate_normal(mean=mean[iid_index], cov=covar[iid_index,iid_index])
            result.val[iid_index,0] = -var.logpdf(y_actual[iid_index])
        return result
Пример #30
0
    def combine_the_best_way(K0, K1, covar, y, mixing, h2, force_full_rank=False, force_low_rank=False,snp_standardizer=None,kernel_standardizer=None,block_size=None):
        """
        Combine two square kernels into one, choosing between three strategies.

        * If neither kernel reports any SNPs (per ``_Mixer.sid_counter``), ``K0`` is read
          and returned as is, with ``h2`` and the mixing weight defaulting to 0.
        * If the total SNP count is below the number of individuals, or ``force_low_rank``
          is set, the kernels are combined through their SNP matrices (``mixer.do_g = True``).
        * Otherwise both are read as kernel data and the kernel values are mixed.

        When the mixing weight is ``None`` and both kernels have SNPs, it (and ``h2``) is
        searched for with ``_find_mixing_from_Gs`` / ``_find_mixing_from_Ks``.

        :param K0: first kernel; must be square (``iid0 is iid1``) and not None.
        :param K1: second kernel; must be square, not None, with the same iids as K0.
        :param covar: covariates, forwarded to the mixing search.
        :param y: phenotype, forwarded to the mixing search.
        :param mixing: mixing weight, or None to search for one.
        :param h2: heritability, or None; may be replaced by the mixing search.
        :param force_full_rank: forwarded to ``_Mixer.sid_counter``.
        :param force_low_rank: forwarded to ``_Mixer.sid_counter``; also forces the low-rank path.
        :param snp_standardizer: not referenced in this function body.
        :param kernel_standardizer: must not be None; used when reading in the low-rank path.
        :param block_size: block size for the SnpKernel built from mixed SNPs.
        :rtype: tuple ``(K, h2, mixer)``
        """
        from pysnptools.kernelstandardizer import Identity as KS_Identity

        # Check for None before touching attributes, so a missing kernel fails
        # with these asserts instead of an AttributeError.
        assert K0 is not None
        assert K1 is not None
        assert K0.iid0 is K0.iid1, "Expect K0 to be square"
        assert K1.iid0 is K1.iid1, "Expect K1 to be square"
        assert np.array_equal(K0.iid,K1.iid), "Expect K0 and K1 to having matching iids"
        assert kernel_standardizer is not None, "expect values for kernel_standardizer"

        mixer = _Mixer(False,KS_Identity(),KS_Identity(),mixing)

        sid_count_0 = _Mixer.sid_counter(K0, force_full_rank, force_low_rank)
        sid_count_1 = _Mixer.sid_counter(K1, force_full_rank, force_low_rank)

        #################################
        # Both Identity (or not given)
        #################################
        if sid_count_0 + sid_count_1 == 0:
            h2 = h2 or 0
            mixer.mixing = mixer.mixing or 0
            K = K0.read() #would be nice to use LinearRegression or low-rank with 0 snps

        #################################
        # Low rank: work with the SNP matrices (G) rather than the kernels
        #################################
        elif sid_count_0 + sid_count_1 < K0.iid_count or force_low_rank:
            mixer.do_g = True
            #!!!there is no need for block_size here because we want G0 in full. But if starting with SNPs and not low-rank then batches are needed and the two standardizers must be remembered for use later

            if sid_count_0 > 0:
                K0, mixer.snp_trained0, mixer.kernel_trained0 = K0._read_with_standardizing(to_kerneldata=not mixer.do_g, kernel_standardizer=kernel_standardizer, return_trained=True)
            if sid_count_1 > 0:
                K1, mixer.snp_trained1, mixer.kernel_trained1 = K1._read_with_standardizing(to_kerneldata=not mixer.do_g, kernel_standardizer=kernel_standardizer, return_trained=True)

            if sid_count_1 == 0:
                mixer.mixing = mixer.mixing or 0
                K = K0
            elif sid_count_0 == 0:
                mixer.mixing = mixer.mixing or 1
                K = K1
            else:
                if mixer.do_g: # always true here; do_g was set at the top of this branch
                    G = np.empty((K0.iid_count, K0.sid_count + K1.sid_count))
                    if mixer.mixing is None:
                        mixer.mixing, h2 = _find_mixing_from_Gs(G, covar, K0.snpreader.val, K1.snpreader.val, h2, y)

                    if mixer.mixing == 0:
                        K = K0
                    elif mixer.mixing == 1:
                        K = K1
                    else:
                        _mix_from_Gs(G, K0.snpreader.val, K1.snpreader.val, mixer.mixing)
                        G = SnpData(iid=K0.iid,
                                            sid=["K0_{0}".format(i) for i in xrange(K0.sid_count)]+["K1_{0}".format(i) for i in xrange(K1.sid_count)], #rename the sids so that they can't collide.
                                            val=G,name="{0}&{1}".format(K0.snpreader,K1.snpreader),
                                            pos=np.concatenate((K0.pos,K1.pos),axis=0)
                                            )
                        K = SnpKernel(G,SS_Identity(),block_size=block_size)
        #################################
        # Full rank: mix the kernel values directly
        #################################
        else:
            mixer.do_g = False
            if sid_count_0 > 0: #!!!but what if we have SNP data but still need to remember the standardizer?
                K0, mixer.snp_trained0, mixer.kernel_trained0 = K0._read_with_standardizing(to_kerneldata=True,return_trained=True)#!!!pass in a new argument, the kernel_standardizer(???)

            if sid_count_1 > 0:
                K1, mixer.snp_trained1, mixer.kernel_trained1 = K1._read_with_standardizing(to_kerneldata=True,return_trained=True)

            if sid_count_1 == 0:
                mixer.mixing = mixer.mixing or 0
                K = K0
            elif sid_count_0 == 0:
                mixer.mixing = mixer.mixing or 1
                K = K1
            else:
                K = np.empty(K0.val.shape)
                if mixer.mixing is None:
                    mixer.mixing, h2 = _find_mixing_from_Ks(K, covar, K0.val, K1.val, h2, y)
                _mix_from_Ks(K, K0.val, K1.val, mixer.mixing)
                # A standardized kernel has trace equal to its size.
                assert K.shape[0] == K.shape[1] and abs(np.diag(K).sum() - K.shape[0]) < 1e-7, "Expect mixed K to be standardized"
                K = KernelData(val=K,iid=K0.iid)

        return K, h2, mixer
Пример #31
0
    from pysnptools.snpreader import Pheno, Bed
    import pysnptools.util as pstutil

    # Raw string so the backslashes in the Windows path are never read as escapes
    # (same style as pheno_fn / cov_fn below).
    data_file = r'd:\OneDrive\programs\epiCornell\syndata.bed'
    if False:
        # One-off generation of the synthetic Bed file (disabled): tile the
        # ".../synth/all" data 3x down and 6x across, keep the first 27000 SNP columns.
        from pysnptools.snpreader import SnpData
        import numpy as np
        bed1 = Bed("../../tests/datasets/synth/all")
        print(bed1.iid_count, bed1.sid_count, bed1.iid_count * bed1.sid_count)
        #goal 1500 individuals x 27000 SNP
        snpdata1 = bed1.read()
        iid = bed1.iid
        sid = ['sid{0}'.format(i) for i in range(27000)]
        val = np.tile(snpdata1.val,(3,6))[:,:27000].copy()
        #snpdata = Pheno('pysnptools/examples/toydata.phe').read()         # Read data from Pheno format
        snpdata2 = SnpData(iid, sid, val)
        print(snpdata2.iid_count, snpdata2.sid_count, snpdata2.iid_count * snpdata2.sid_count)
        # Bed.write takes the file name first, then the data.
        Bed.write(data_file,snpdata2,count_A1=False)

    synbed = Bed(data_file)
    print(synbed.iid_count, synbed.sid_count, synbed.iid_count * synbed.sid_count)

    # Split the SNPs into parts and test only the pairs within the first part.
    part_count = 1000
    part_list = list(split_on_sids(synbed,part_count))

    pairs00 = _Pairs(part_list[0])
    from fastlmm.association import single_snp
    pheno_fn = r"d:\OneDrive\programs\epiCornell\pheno.txt"
    cov_fn = r"d:\OneDrive\programs\epiCornell\cov.txt"
    results_df = single_snp(pairs00, K0=synbed, pheno=pheno_fn, covar=cov_fn, leave_out_one_chrom=False, count_A1=True)
Пример #32
0
    def predict(self,X=None,K0_whole_test=None,K1_whole_test=None,iid_if_none=None):
        """
        Method for predicting from a fitted :class:`FastLMM` predictor.
        If the examples in X, K0_whole_test, K1_whole_test are not the same, they will be reordered and intersected.

        :param X: testing covariate information, optional: 
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type X: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

        :param K0_whole_test: A similarity matrix from all the examples to the test examples. Alternatively,
               the test SNPs needed to construct such a similarity matrix.
               Can be any :class:`.SnpReader`. If you give a string, can be the name of a PLINK-formated Bed file.
               Can be PySnpTools :class:`.KernelReader`. If you give a string it can be the name of a :class:`.KernelNpz` file.
        :type K0_whole_test: :class:`.SnpReader` or a string or :class:`.KernelReader`

        :param K1_whole_test: A second similarity matrix from all the examples to the test examples. Alternatively,
               the test SNPs needed to construct such a similarity matrix.
               Can be any :class:`.SnpReader`. If you give a string, can be the name of a PLINK-formated Bed file.
               Can be PySnpTools :class:`.KernelReader`. If you give a string it can be the name of a :class:`.KernelNpz` file.
        :type K1_whole_test: :class:`.SnpReader` or a string or :class:`.KernelReader`

        :param iid_if_none: Examples to predict for if no X, K0_whole_test, K1_whole_test is provided.
        :type iid_if_none: an ndarray of two strings

        :rtype: A :class:`SnpData` of the means and a :class:`KernelData` of the covariance
        """

        assert self.is_fitted, "Can only predict after predictor has been fitted"
        #assert K0_whole_test is not None, "K0_whole_test must be given"
        #!!!later is it too wasteful to keep both G0_train, G1_train, and lmm.G when storing to disk?
        #!!!later all _kernel_fixup's should use block_size input

        # Normalize the inputs. K1 and X take their default iids from the fixed-up K0
        # so that anything not supplied lines up with it.
        K0_whole_test_b = _kernel_fixup(K0_whole_test, train_snps=self.G0_train, iid_if_none=iid_if_none, standardizer=self.mixer.snp_trained0, test=K0_whole_test, test_iid_if_none=None, block_size=self.block_size)
        K1_whole_test = _kernel_fixup(K1_whole_test, train_snps=self.G1_train, iid_if_none=K0_whole_test_b.iid0, standardizer=self.mixer.snp_trained1, test=K1_whole_test, test_iid_if_none=K0_whole_test_b.iid1, block_size=self.block_size)
        X = _pheno_fixup(X,iid_if_none=K0_whole_test_b.iid1)
        K0_whole_test_c, K1_whole_test, X = intersect_apply([K0_whole_test_b, K1_whole_test, X],intersect_before_standardize=True,is_test=True)
        # Standardize the test covariates with the standardizer stored on the predictor (self.covar_unit_trained).
        X = X.read().standardize(self.covar_unit_trained)
        # add a column of 1's to cov to increase DOF of model (and accuracy) by allowing a constant offset
        X = SnpData(iid=X.iid,
                              sid=self._new_snp_name(X),
                              val=np.c_[X.read().val,np.ones((X.iid_count,1))])
        assert np.array_equal(X.sid,self.covar_sid), "Expect covar sids to be the same in train and test."

        # Slice each whole-by-test kernel into its train-by-test rows and its test-by-test rows.
        train_idx0 = K0_whole_test_c.iid0_to_index(self.K_train_iid)
        K0_train_test = K0_whole_test_c[train_idx0,:]
        train_idx1 = K1_whole_test.iid0_to_index(self.K_train_iid)
        K1_train_test = K1_whole_test[train_idx1,:]
        test_idx0 = K0_whole_test_c.iid0_to_index(K0_whole_test_c.iid1)
        K0_test_test = K0_whole_test_c[test_idx0,:]
        # The test-by-test slice must share one iid array for rows and columns.
        if K0_test_test.iid0 is not K0_test_test.iid1:
            raise Exception("real assert")
        test_idx1 = K1_whole_test.iid0_to_index(K0_whole_test_c.iid1)
        K1_test_test = K1_whole_test[test_idx1,:]

        if self.mixer.do_g:
            ###################################################
            # low rank from Rasmussen  eq 2.9 + noise term added to covar
            ###################################################
            Gstar = self.mixer.g_mix(K0_train_test,K1_train_test)
            # Split sigma2 into a genetic part (h2) and a residual part (1 - h2).
            varg = self.h2 * self.sigma2
            vare = (1.-self.h2) * self.sigma2
            Ainv = LA.inv((1./vare) * np.dot(self.G.T,self.G) + (1./varg)*np.eye(self.G.shape[1]))
            testAinv = np.dot(Gstar.test.val, Ainv)
            # Predicted mean: fixed effects plus a correction driven by the training residual y - X*beta.
            pheno_predicted = np.dot(X.val,self.beta) + (1./vare) * np.dot(np.dot(testAinv,self.G.T),self.y-np.dot(self.X,self.beta))
            pheno_predicted = pheno_predicted.reshape(-1,1)
            # Predictive covariance, with the residual variance added on the diagonal.
            covar  = np.dot(testAinv,Gstar.test.val.T) + vare * np.eye(Gstar.test.val.shape[0])

        else:
            # Full-rank path: rebuild an LMM object from the stored fit and let it predict.
            lmm = LMM()
            lmm.U = self.U
            lmm.S = self.S
            lmm.G = self.G
            lmm.y = self.y
            lmm.Uy = self.Uy
            lmm.X = self.X
            lmm.UX = self.UX

            Kstar = self.mixer.k_mix(K0_train_test,K1_train_test) #!!!later do we need/want reads here? how about view_OK?
            lmm.setTestData(Xstar=X.val, K0star=Kstar.val.T)

            Kstar_star = self.mixer.k_mix(K0_test_test,K1_test_test) #!!!later do we need/want reads here?how about view_OK?
            pheno_predicted, covar = lmm.predict_mean_and_variance(beta=self.beta, h2=self.h2,sigma2=self.sigma2, Kstar_star=Kstar_star.val)

        #pheno_predicted = lmm.predictMean(beta=self.beta, h2=self.h2,scale=self.sigma2).reshape(-1,1)
        # Package the results: the means as a one-column SnpData with a NaN position row,
        # the covariance as a KernelData over the test iids.
        ret0 = SnpData(iid = X.iid, sid=self.pheno_sid,val=pheno_predicted,pos=np.array([[np.nan,np.nan,np.nan]]),name="lmm Prediction")

        from pysnptools.kernelreader import KernelData
        ret1 = KernelData(iid=K0_test_test.iid,val=covar)
        return ret0, ret1
Пример #33
0
    def test_lr_real(self):
        """Train LinearRegression on a synthetic 1-D problem and check its predictions.

        The covariate is replaced by the numbers 0, 1, 2, ... (one per iid) and
        the phenotype by ``2 * covar + 100 + 10 * normal(0, 1)`` (seed 0).
        Iids 10 and on are used for training; the first 10 iids for testing.

        The fitted model is round-tripped through ``joblib`` and then used to
        predict on the training data and on the test data.  Each prediction
        (mean and covariance) is written with ``Dat.write`` and handed to
        ``self.compare_files`` under the ``lr2a*`` / ``lr2b*`` names.

        A plain least-squares fit is also computed with NumPy as a baseline; it
        is only used for the optional plots (``do_plot``).
        """
        do_plot = False
        if do_plot:
            # Imported lazily so the test does not need a plotting stack
            # unless plots are actually requested.
            import pylab

        logging.info("TestLinRegTrain test_lr_real")

        train_idx = np.r_[10:self.snpreader_whole.iid_count]  # iids 10 and on
        test_idx = np.r_[0:10]  # the first 10 iids

        #make covar just numbers 0,1,...
        covar = self.covariate_whole.read()
        covar.val = np.array([[float(num)] for num in range(covar.iid_count)])
        covariate_train = covar[train_idx, :].read()
        covariate_test = covar[test_idx, :].read()
        # NOTE(review): K0_test_test is not used below; kept so the
        # construction itself is still exercised.
        K0_test_test = KernelIdentity(covariate_test.iid)

        #make pheno  # pheno = 2*covar+100+normal(0,1)*10
        pheno = self.pheno_whole.read()
        np.random.seed(0)
        pheno.val = covar.val * 2.0 + 100 + np.random.normal(
            size=covar.val.shape) * 10

        pheno_train = pheno[train_idx, :].read()
        pheno_test = pheno[test_idx, :].read()

        if do_plot:
            #Plot training x and y, testing x and y
            pylab.plot(covariate_train.val, pheno_train.val, ".",
                       covariate_test.val, pheno_test.val, ".")
            pylab.suptitle("Plot training x and y, testing x and y")
            pylab.show()

        # Baseline: ordinary least squares on [covariate, intercept].
        Xtrain = np.c_[covariate_train.val,
                       np.ones((covariate_train.iid_count, 1))]
        Xtest = np.c_[covariate_test.val,
                      np.ones((covariate_test.iid_count, 1))]
        lsqSol = np.linalg.lstsq(Xtrain, pheno_train.val[:, 0], rcond=-1)
        bs = lsqSol[0]  #weights
        r2 = lsqSol[1]  #squared residuals
        D = lsqSol[2]  #rank of design matrix
        N = pheno_train.iid_count
        REML = False
        if not REML:
            sigma2 = float(r2 / N)
            nLL = N * 0.5 * np.log(2 * np.pi * sigma2) + N * 0.5
        else:
            sigma2 = float(r2 / (N - D))
            nLL = N * 0.5 * np.log(2 * np.pi * sigma2) + 0.5 / sigma2 * r2
            nLL -= 0.5 * D * np.log(2 * np.pi * sigma2)
            #REML term

        predicted = Xtest.dot(bs)
        yerr = [np.sqrt(sigma2)] * len(predicted)
        if do_plot:
            pylab.plot(covariate_test.val, pheno_test.val, "g.",
                       covariate_test.val, predicted, "r.")
            pylab.xlim([-1, 10])
            pylab.errorbar(covariate_test.val,
                           predicted,
                           yerr,
                           linestyle='None')
            pylab.suptitle("real linear regression: actual to prediction")
            pylab.show()

        #These should all give the same result
        first_name = None
        for name, K0_train, K0_whole_test in [("Identity Kernel", None, None)]:

            # Reference files are always looked up under the first name.
            first_name = first_name or name
            #Learn model, save, load
            modelx = LinearRegression().fit(K0_train=K0_train,
                                            X=covariate_train,
                                            y=pheno_train)

            filename = self.tempout_dir + "/model_lr_real.flm.p"
            pstutil.create_directory_if_necessary(filename)
            joblib.dump(modelx, filename)
            model = joblib.load(filename)

            do_test_on_train = True
            if do_test_on_train:
                #Predict with model (test on train)
                predicted_pheno, covar_pred = model.predict(
                    K0_whole_test=K0_train, X=covariate_train)  #test on train
                output_file = self.file_name("lr_reala_" + name)
                Dat.write(output_file, predicted_pheno)
                covar2 = SnpData(
                    iid=covar_pred.row,
                    sid=covar_pred.col[:, 1],
                    val=covar_pred.val)  #kludge to write kernel to text format
                output_file = self.file_name("lr_reala.cov_" + name)
                Dat.write(output_file, covar2)

                # Per-individual standard deviation of the prediction.
                yerr = np.sqrt(np.diag(covar_pred.val))
                predicted = predicted_pheno.val
                if do_plot:
                    pylab.plot(covariate_train.val, pheno_train.val, "g.",
                               covariate_train.val, predicted, "r.")
                    pylab.xlim([0, 50])
                    pylab.ylim([100, 200])
                    pylab.errorbar(covariate_train.val,
                                   predicted,
                                   yerr,
                                   linestyle='None')
                    pylab.suptitle(
                        name +
                        ": test on train: train X to true target (green) and prediction (red)"
                    )
                    pylab.show()

                self.compare_files(predicted_pheno, "lr2a_" + first_name)
                self.compare_files(covar2, "lr2a.cov_" + first_name)

            #Predict with model (test on test)
            predicted_pheno, covar_pred = model.predict(
                K0_whole_test=K0_whole_test, X=covariate_test)  #test on test
            output_file = self.file_name("lr_realb_" + name)
            Dat.write(output_file, predicted_pheno)
            covar2 = SnpData(
                iid=covar_pred.row,
                sid=covar_pred.col[:, 1],
                val=covar_pred.val)  #kludge to write kernel to text format
            output_file = self.file_name("lr_realb.cov_" + name)
            Dat.write(output_file, covar2)

            # Per-individual standard deviation of the prediction.
            yerr = np.sqrt(np.diag(covar_pred.val))
            predicted = predicted_pheno.val
            if do_plot:
                pylab.plot(covariate_test.val, pheno_test.val, "g.",
                           covariate_test.val, predicted, "r.")
                pylab.xlim([-1, 10])
                pylab.errorbar(covariate_test.val,
                               predicted,
                               yerr,
                               linestyle='None')
                pylab.suptitle(
                    name +
                    ": test on test: test X to true target (green) and prediction (red)"
                )
                pylab.show()

            self.compare_files(predicted_pheno, "lr2b_" + first_name)
            self.compare_files(covar2, "lr2b.cov_" + first_name)
Пример #34
0
def snp_gen(fst,
            dfr,
            iid_count,
            sid_count,
            maf_low=.05,
            maf_high=.5,
            seed=0,
            sibs_per_family=10,
            freq_pop_0=.5,
            chr_count=None,
            label_with_pop=False):
    """Generates a random :class:`.SnpData`

    :param fst: Degree of Population Structure, e.g. 0 (a special case), 0.005, 0.01, 0.05, 0.1
    :type fst: float

    :param dfr: Degree of Family Relatedness, the fraction of individuals belonging to a family, ie. fracSibs, e.g. 0.0, 0.5, 0.6, 0.7, 0.8, 0.9
    :type dfr: float

    :param iid_count: The number of individuals to generate. Because of rounding the actual number may be less.
    :type iid_count: int

    :param sid_count: The number of snps to generate.
    :type sid_count: int

    :param maf_low: (default .05) lower bound of uniformly-generated Minor allele frequency
    :type maf_low: float

    :param maf_high: (default .5) upper bound of uniformly-generated Minor allele frequency
    :type maf_high: float

    :param seed: (default 0) Random seed. Reduced modulo 2147483647 before being given to NumPy.
        Use None to leave NumPy's global random state untouched.
    :type seed: int

    :param sibs_per_family: (default 10) number of siblings in each family
    :type sibs_per_family: int

    :param freq_pop_0: (default .5) Fraction of individuals in population 0 (the rest will be in population 1)
    :type freq_pop_0: float

    :param chr_count: (default one chromosome per SNP) Number of chromosomes to which SNPs should be assigned. The SNPs will
        be assigned as evenly as possible. Chromosome names are integers starting with 1. SNP positions within a chromosome are sequential
        integers starting with 1.
    :type chr_count: int

    :param label_with_pop: (default False) If False, every individual gets the iid ``["i_<n>", "f_<n>"]``.
        If True, the first iid column is a group label -- "0" (parents and children of population 0),
        "1" (parents and children of population 1) or "2" (children with parents in any population) --
        and the second column is a running index within that group.
    :type label_with_pop: bool

    :rtype: :class:`.SnpData`
    :Example:

    >>> snpdata = snp_gen(fst=.1,dfr=.5,iid_count=200,sid_count=20,maf_low=.05,seed=6)
    >>> int(snpdata.iid_count), int(snpdata.sid_count) #because of rounding got 190 individuals
    (190, 20)

    """
    assert 0 <= freq_pop_0 and freq_pop_0 <= 1.0, "assert 0 <= freq_pop_0 and freq_pop_0 <=1.0"

    if seed is not None:
        # np.random.seed only accepts 32-bit unsigned values. Folding with a
        # fixed 2**31 - 1 (the "old" 32-bit maxint) keeps the seed in range on
        # every platform and avoids the Python-2-only sys.maxint.
        np.random.seed(int(seed % 2147483647))

    iid_solo_count = iid_count - iid_count * dfr
    family_count = int(iid_count * dfr / (2 * sibs_per_family))

    ancestral = np.random.uniform(
        maf_low, maf_high, sid_count)  #sample ancestral allele frequencies

    # snp_list ends up with five pieces, in this order:
    #   parents pop 0, kids pop 0, parents pop 1, kids pop 1, kids of mixed parents
    snp_list = []
    for population_index, freq_pop in enumerate([freq_pop_0,
                                                 1.0 - freq_pop_0]):
        logging.info("Simulating SNPs from a population %i" % population_index)
        snps_parents = _generate_snps(ancestral, fst,
                                      int(iid_solo_count * freq_pop),
                                      sid_count)
        snp_list.append(snps_parents)
        snp_list.append(
            _generate_kids(parent_snps=snps_parents,
                           family_count=int(freq_pop * family_count),
                           sibs_per_family=sibs_per_family))

    # Kids whose parents are drawn from everyone generated so far.
    snp_list.append(
        _generate_kids(parent_snps=np.concatenate(snp_list),
                       family_count=family_count,
                       sibs_per_family=sibs_per_family))
    val = np.concatenate(snp_list)

    if not label_with_pop:
        iid = np.array([["i_{0}".format(iid_index), "f_{0}".format(iid_index)]
                        for iid_index in range(val.shape[0])],
                       dtype=str).reshape(-1, 2)
    else:
        assert len(snp_list) == 5, "real assert"
        iid0 = [["0", str(iid_index)]
                for iid_index in range(len(snp_list[0]) + len(snp_list[1]))
                ]  #parents and children of pop 0
        iid1 = [["1", str(iid_index)]
                for iid_index in range(len(snp_list[2]) + len(snp_list[3]))
                ]  #parents and children of pop 1
        iid2 = [["2", str(iid_index)] for iid_index in range(len(snp_list[4]))
                ]  #children with parents in any pop
        iid = np.array(iid0 + iid1 + iid2, dtype=str).reshape(-1, 2)

    sid = np.array(
        ["snp_{0}".format(sid_index) for sid_index in range(val.shape[1])],
        dtype=str)

    if chr_count is None:
        chr_count = len(sid)

    assert len(
        sid
    ) == 0 or chr_count > 0, "chr_count must be at least 1 (unless sid_count is 0)"
    # max(1, ...) guards the division when there are no SNPs (chr_count == 0).
    sid_per_chrom = int(np.ceil(float(len(sid)) / max(1, chr_count)))
    # Columns: chromosome (1-based), then the 1-based index within the
    # chromosome repeated for both position columns.
    pos = np.array([[
        1 + sid_index // sid_per_chrom, 1 + sid_index % sid_per_chrom, 1 +
        sid_index % sid_per_chrom
    ] for sid_index in range(len(sid))])
    if len(sid) == 0:  #make it work when no sids are wanted
        pos = pos.reshape(len(sid), 3)

    snpdata = SnpData(
        iid=iid,
        sid=sid,
        val=val,
        pos=pos,
        parent_string=
        "snp_gen(fst={0}, dfr={1}, iid_count={2}, sid_count={3}, maf_low={4}, maf_high={5}, seed={6}, sibs_per_family={7}, freq_pop_0={8})"
        .format(fst, dfr, iid_count, sid_count, maf_low, maf_high, seed,
                sibs_per_family, freq_pop_0))

    if snpdata.iid_count != iid_count:
        logging.warning(
            "Because of rounding the actual number of iids is {0} rather than the requested {1}"
            .format(snpdata.iid_count, iid_count))

    return snpdata
Пример #35
0
    def test_cpp_std(self):
        """Check that the C++ and pure-Python standardization code paths agree.

        For every memory order ('C', 'F'), dtype (float64, float32) and
        standardizer (Unit, Beta(2, 10)) a random 3-individual "train" and
        2-individual "test" SnpData are built from seed 0.  The train data is
        standardized with ``force_python_only`` False and True, and the
        results and the trained ``stats`` are compared.  The trained
        standardizers are then applied to the test data and compared again.

        The data is progressively modified in place: first as generated, then
        with SNP 1 set to all zeros in train, and within each of those with a
        NaN written to individual 0 / SNP 2 of both data sets.
        """

        def check_special_columns(results, zeroed_snp, had_missing):
            # A SNP zeroed in train must standardize to all zeros, and a
            # missing value must be filled with 0.
            if zeroed_snp:
                for result in results:
                    assert np.array_equal(result.val[:, 1],
                                          np.zeros(result.val.shape[0]))
            if had_missing:
                for result in results:
                    assert result.val[0, 2] == 0

        for order in ('C', 'F'):  #memory layout
            for dtype in (np.float64, np.float32):  #precision
                # float32 results are only compared to 5 decimals
                places = 10 if dtype == np.float64 else 5

                def random_snpdata(iid):
                    # One np.random.randint call per data set, so the draw
                    # order (train first, then test) matters.
                    sid_count = 20
                    genotypes = np.random.randint(
                        3, size=[len(iid), sid_count])
                    return SnpData(
                        iid=iid,
                        sid=[str(i) for i in range(sid_count)],
                        val=np.array(genotypes, dtype=dtype, order=order))

                def standardized(reader, standardizer, python_only,
                                 **options):
                    # Always standardize a fresh copy of the reader's data.
                    copy = reader.read(order=order, dtype=dtype)
                    return copy.standardize(standardizer,
                                            force_python_only=python_only,
                                            **options)

                for std in (stdizer.Unit(), stdizer.Beta(2, 10)):
                    np.random.seed(0)
                    snpreader0 = random_snpdata(
                        [["0", "0"], ["1", "1"], ["2", "2"]])
                    snpreader1 = random_snpdata([["3", "3"], ["4", "4"]])

                    for has_SNC_in_train in (False, True):
                        if has_SNC_in_train:
                            snpreader0.val[:, 1] = 0  #make SNP 1 constant

                        for has_missing_data in (False, True):
                            if has_missing_data:
                                snpreader0.val[0, 2] = np.nan
                                snpreader1.val[0, 2] = np.nan

                            # Train: gather stats with both implementations
                            cppa, stdcppa = standardized(
                                snpreader0, std, False, return_trained=True)
                            pya, stdpya = standardized(
                                snpreader0, std, True, return_trained=True)
                            np.testing.assert_array_almost_equal(
                                cppa.val, pya.val, decimal=places)
                            np.testing.assert_array_almost_equal(
                                stdcppa.stats, stdpya.stats, decimal=places)
                            for trained in (stdcppa, stdpya):
                                found_inf = np.inf in trained.stats[:, 1]
                                assert found_inf == has_SNC_in_train
                            check_special_columns((cppa, pya),
                                                  has_SNC_in_train,
                                                  has_missing_data)

                            # Test: apply the trained stats
                            cppb = standardized(snpreader1, stdcppa, False)
                            pyb = standardized(snpreader1, stdpya, True)
                            np.testing.assert_array_almost_equal(
                                cppb.val, pyb.val, decimal=places)
                            #Make sure we haven't messed up the train stats
                            np.testing.assert_array_almost_equal(
                                stdcppa.stats, stdpya.stats, decimal=places)
                            check_special_columns((cppb, pyb),
                                                  has_SNC_in_train,
                                                  has_missing_data)
        logging.info("done with 'test_cpp_std'")