Exemplo n.º 1
0
def load_data(snp_file, pheno_file, covar_file):
    """Load SNPs, phenotype and optional covariates and build a linear kernel.

    The Bed reader, the phenotype and (when ``covar_file`` is not None) the
    covariates are passed through ``srutil.intersect_apply`` before the SNPs
    are read and standardized.

    Returns the tuple ``(snp_data, pheno, covar, X, Y, K)``. ``covar`` is None
    when no covariate file is given, otherwise the ``'vals'`` entry of the
    loaded covariates. ``Y`` is the ``'vals'`` entry of ``pheno`` and is
    centered and scaled with in-place operators, so the returned ``pheno``
    sees the same modification.
    """
    genotype_reader = Bed(snp_file)
    pheno = pysnptools.util.pheno.loadPhen(pheno_file)

    if covar_file is None:
        genotype_reader, pheno = srutil.intersect_apply(
            [genotype_reader, pheno])
        covar = None
    else:
        covar_dict = pysnptools.util.pheno.loadPhen(covar_file)
        genotype_reader, pheno, covar_dict = srutil.intersect_apply(
            [genotype_reader, pheno, covar_dict])
        covar = covar_dict['vals']

    snp_data = genotype_reader.read().standardize()

    # Center, then scale, each phenotype column (in place).
    Y = pheno['vals']
    Y -= Y.mean(0)
    Y /= Y.std(0)

    # Rescale the genotype values by a single global factor before forming
    # the kernel K = X X^T.
    scale = 1. / np.sqrt((snp_data.val**2).sum() / float(snp_data.iid_count))
    X = scale * snp_data.val
    K = np.dot(X, X.T)  # TODO use symmetric dot to speed this up

    assert np.all(pheno['iid'] == snp_data.iid), "the samples are not sorted"

    return snp_data, pheno, covar, X, Y, K
Exemplo n.º 2
0
def getData(filename="", mph=3, UseCov=False):
    """Open the genotype and phenotype readers for a file prefix.

    ``filename`` is used as the Bed file name and as the prefix for the
    ``.fam``, ``.cov`` and ``.phen`` files.

    * The phenotype reader starts as ``Pheno(filename + ".fam")``.
    * When ``UseCov`` is true and ``filename + ".cov"`` exists, the covariate
      reader takes part in ``intersect_apply`` together with the genotype and
      phenotype readers; the covariate reader itself is not returned.
    * When ``filename + ".phen"`` exists it replaces the phenotype reader and
      is passed through ``intersect_apply`` with the genotype reader.

    ``mph`` is accepted but not used by this function.

    Returns the list ``[yFil, sFil]`` (phenotype reader, genotype reader).
    """
    sFil = Bed(filename)
    yFil = Pheno(filename + ".fam")

    # Check the flag first so the file system is only touched when needed.
    if UseCov and isfile(filename + ".cov"):
        QFil = Pheno(filename + ".cov")
        sFil, yFil, QFil = intersect_apply([sFil, yFil, QFil])
    if isfile(filename + ".phen"):
        yFil = Pheno(filename + ".phen")
        sFil, yFil = intersect_apply([sFil, yFil])
    return [yFil, sFil]
Exemplo n.º 3
0
def getData(filename="", mph=3, UseCov=False):
    """Open the genotype and phenotype readers for a file prefix.

    ``filename`` names the Bed file and is the prefix of the optional
    ``.fam``, ``.cov`` and ``.phen`` companions.

    The phenotype reader defaults to the ``.fam`` file. If ``UseCov`` is true
    and a ``.cov`` file exists, the covariate reader is included in an
    ``intersect_apply`` call with the other two readers (and then dropped).
    If a ``.phen`` file exists it replaces the phenotype reader and is
    passed through ``intersect_apply`` with the genotype reader.

    ``mph`` is accepted but not used by this function.

    Returns the list ``[yFil, sFil]`` (phenotype reader, genotype reader).
    """
    sFil = Bed(filename)
    yFil = Pheno(filename + ".fam")

    # Test the flag first so the file check is skipped when it is not needed.
    if UseCov and isfile(filename + ".cov"):
        QFil = Pheno(filename + ".cov")
        sFil, yFil, QFil = intersect_apply([sFil, yFil, QFil])
    if isfile(filename + ".phen"):
        yFil = Pheno(filename + ".phen")
        sFil, yFil = intersect_apply([sFil, yFil])
    return [yFil, sFil]
Exemplo n.º 4
0
        def nested_closure(chrom):
            """Build the ``_internal_single`` work item for one chromosome.

            Selects the test SNPs whose first ``pos`` column equals ``chrom``,
            builds the per-chromosome covariates and kernels, passes all of
            them (plus ``pheno``) through ``pstutil.intersect_apply``, picks a
            block size and returns the result of ``_internal_single`` run
            with a ``Local()`` runner.
            """
            on_this_chrom = test_snps.pos[:, 0] == chrom
            test_snps_chrom = test_snps[:, on_this_chrom]
            covar_chrom = _create_covar_chrom(covar, covar_by_chrom, chrom)

            # Each chromosome gets its own cache file, suffixed with ".<chrom>".
            if cache_file is None:
                cache_file_chrom = None
            else:
                cache_file_chrom = cache_file + ".{0}".format(chrom)

            K0_chrom = _K_per_chrom(K0 or G0 or test_snps, chrom,
                                    test_snps.iid)
            K1_chrom = _K_per_chrom(K1 or G1, chrom, test_snps.iid)

            aligned = pstutil.intersect_apply(
                [K0_chrom, K1_chrom, test_snps_chrom, pheno, covar_chrom])
            K0_chrom, K1_chrom, test_snps_chrom, pheno_chrom, covar_chrom = aligned
            logging.debug("# of iids now {0}".format(K0_chrom.iid_count))

            K0_chrom, K1_chrom, block_size = _set_block_size(
                K0_chrom, K1_chrom, mixing, GB_goal, force_full_rank,
                force_low_rank)

            return _internal_single(
                K0=K0_chrom,
                test_snps=test_snps_chrom,
                pheno=pheno_chrom,
                covar=covar_chrom,
                K1=K1_chrom,
                mixing=mixing,
                h2=h2,
                log_delta=log_delta,
                cache_file=cache_file_chrom,
                force_full_rank=force_full_rank,
                force_low_rank=force_low_rank,
                output_file_name=None,
                block_size=block_size,
                interact_with_snp=interact_with_snp,
                runner=Local())
Exemplo n.º 5
0
    def load_data(self):
        """Load phenotype, covariates and SNPs and align them on individuals.

        Sets ``self.sid``, ``self.ind_iid`` (the phenotype iids as loaded,
        before intersection), ``self.X``, ``self.G`` and ``self.y``. After
        the intersection ``self.X`` and ``self.y`` are marked read-only.

        The whole body runs inside ``patch.dict`` on ``os.environ`` with
        ``ARRAY_MODULE`` set to ``'numpy'``.

        Raises:
            Exception: if ``self.num_snps_in_memory`` is not strictly greater
                than ``self.snpreader.iid_count``.
        """
        with patch.dict('os.environ', {'ARRAY_MODULE': 'numpy'}) as _:

            tt0 = time.time()
            logging.info("loading data...")

            iid_count = self.snpreader.iid_count
            if self.num_snps_in_memory <= iid_count:
                # Report the value that was actually compared, so the message
                # matches its own label.
                raise Exception("Expect self.num_snps_in_memory, {0} > self.snpreader.iid_count, {1}".format(self.num_snps_in_memory, iid_count))

            self.sid = pd.Series(self.snpreader.sid)

            # load phenotype
            pheno = pstpheno.loadOnePhen(self.pheno_fn, self.mpheno, vectorize=True)
            self.ind_iid = pheno['iid'] #!!LATER: bug? It looks like we record the pre-intersect iids only to write out the pcs later? Why?

            # load covariates
            self.X, cov_iid = self.load_covariates(pheno)

            # Set up the snps
            # G is the standardized snps. The GClass.factory will either load them into memory or will note their file and read them as needed.
            self.G = GClass.factory(self.snpreader, self.num_snps_in_memory, self.standardizer, self.blocksize, count_A1=self.count_A1)

            #!!LATER Should we give preference to self.G since reordering it is the most expensive?
            # The iid arrays returned next to y and X are not needed afterwards.
            (self.y, _), (self.X, _), self.G = pstutil.intersect_apply(
                [(pheno['vals'], pheno['iid']), (self.X, cov_iid), self.G],
                sort_by_dataset=False)

            # make sure input data isn't modified
            self.X.flags.writeable = False
            self.y.flags.writeable = False

            logging.info("...done. Loading time %.2f s", float(time.time() - tt0))
Exemplo n.º 6
0
 def set_sid_sets(self):
     """Split the two sid lists into shared/exclusive sets and align datasets.

     Sets ``self.intersect`` (sids in both lists), ``self.just_sid_0`` and
     ``self.just_sid_1`` (sids found in only one list) and
     ``self._pair_count``, then passes test_snps, pheno, covar, G0 and
     G1_or_none through ``pstutil.intersect_apply``.
     """
     first_sids = set(self.sid_list_0)
     shared = first_sids.intersection(self.sid_list_1)
     self.intersect = shared
     self.just_sid_0 = first_sids.difference(shared)
     # ``shared`` only holds sids from sid_list_1, so the symmetric
     # difference leaves the sids of sid_list_1 that are not shared.
     self.just_sid_1 = shared.symmetric_difference(self.sid_list_1)

     n_only_0 = len(self.just_sid_0)
     n_only_1 = len(self.just_sid_1)
     n_shared = len(shared)
     # Cross pairs between the three groups, plus unordered pairs inside
     # the shared group.
     self._pair_count = (n_only_0 * n_shared
                         + n_only_0 * n_only_1
                         + n_shared * n_only_1
                         + n_shared * (n_shared - 1) // 2)

     #should put G0 and G1 first
     aligned = pstutil.intersect_apply(
         [self.test_snps, self.pheno, self.covar, self.G0, self.G1_or_none])
     self.test_snps, self.pheno, self.covar, self.G0, self.G1_or_none = aligned
Exemplo n.º 7
0
    def test_intersection(self):
        """Check how a SnpKernel behaves under intersection and slicing.

        After ``intersect_apply`` with a phenotype that lacks one iid, the
        kernel's ``snpreader`` must be a ``_SnpSubset`` while the kernel
        itself must not be a ``_KernelSubset``. Slicing the kernel must
        still give a ``SnpKernel``.
        """
        from pysnptools.kernelreader import SnpKernel
        from pysnptools.kernelreader._subset import _KernelSubset
        from pysnptools.snpreader import Pheno
        from pysnptools.snpreader._subset import _SnpSubset
        from pysnptools.standardizer import Unit
        from pysnptools.util import intersect_apply

        bed_path = self.currentFolder + "/../examples/toydata.5chrom.bed"
        kernel = SnpKernel(Bed(bed_path, count_A1=False), stdizer.Identity())

        # Drop the first row of the phenotype so the two datasets no longer
        # hold the same iids.
        pheno_all = Pheno(self.currentFolder + "/../examples/toydata.phe")
        pheno = pheno_all[1:, :]

        # SnpKernel is special because it standardizes AFTER intersecting.
        kernel_common, pheno = intersect_apply([kernel, pheno])
        assert isinstance(kernel_common.snpreader, _SnpSubset)
        assert not isinstance(kernel_common, _KernelSubset)

        # What happens with fancy selection?
        every_other = kernel[::2]
        assert isinstance(every_other, SnpKernel)

        logging.info("Done with test_intersection")
Exemplo n.º 8
0
 def set_sid_sets(self):
     """Compute the sid partitions and pair count, then align the datasets.

     ``self.intersect`` holds sids present in both sid lists;
     ``self.just_sid_0`` / ``self.just_sid_1`` hold sids present in only one
     of them. ``self._pair_count`` is derived from the sizes of those three
     sets. Finally the five datasets are passed through
     ``pstutil.intersect_apply``.
     """
     sids_0 = set(self.sid_list_0)
     common = sids_0.intersection(self.sid_list_1)
     self.intersect = common
     self.just_sid_0 = sids_0.difference(common)
     # ``common`` is built from sid_list_1, so this leaves exactly the
     # sids of sid_list_1 that are not in ``common``.
     self.just_sid_1 = common.symmetric_difference(self.sid_list_1)

     size_0 = len(self.just_sid_0)
     size_1 = len(self.just_sid_1)
     size_common = len(common)
     cross_pairs = (size_0 * size_common
                    + size_0 * size_1
                    + size_common * size_1)
     within_common = size_common * (size_common - 1) // 2
     self._pair_count = cross_pairs + within_common

     #should put G0 and G1 first
     datasets = [self.test_snps, self.pheno, self.covar, self.G0, self.G1_or_none]
     (self.test_snps, self.pheno, self.covar,
      self.G0, self.G1_or_none) = pstutil.intersect_apply(datasets)
Exemplo n.º 9
0
    def load_data(self):
        """Load phenotype, covariates and SNPs and align them on individuals.

        Sets ``self.sid``, ``self.ind_iid`` (the phenotype iids as loaded,
        before intersection), ``self.X``, ``self.G`` and ``self.y``. After
        the intersection ``self.X`` and ``self.y`` are marked read-only.

        Raises:
            Exception: if ``self.num_snps_in_memory`` is not strictly greater
                than ``self.snpreader.iid_count``.
        """
        tt0 = time.time()
        logging.info("loading data...")

        iid_count = self.snpreader.iid_count
        if self.num_snps_in_memory <= iid_count:
            # Report the value that was actually compared, so the message
            # matches its own label.
            raise Exception("Expect self.num_snps_in_memory, {0} > self.snpreader.iid_count, {1}".format(self.num_snps_in_memory, iid_count))

        self.sid = pd.Series(self.snpreader.sid)

        # load phenotype
        pheno = pstpheno.loadOnePhen(self.pheno_fn, self.mpheno, vectorize=True)
        self.ind_iid = pheno['iid'] #!!LATER: bug? It looks like we record the pre-intersect iids only to write out the pcs later? Why?

        # load covariates
        self.X, cov_iid = self.load_covariates(pheno)

        # Set up the snps
        # G is the standardized snps. The GClass.factory will either load them into memory or will note their file and read them as needed.
        self.G = GClass.factory(self.snpreader, self.num_snps_in_memory, self.standardizer, self.blocksize)

        #!!LATER Should we give preference to self.G since reordering it is the most expensive?
        # The iid arrays returned next to y and X are not needed afterwards.
        (self.y, _), (self.X, _), self.G = pstutil.intersect_apply(
            [(pheno['vals'], pheno['iid']), (self.X, cov_iid), self.G],
            sort_by_dataset=False)

        # make sure input data isn't modified
        self.X.flags.writeable = False
        self.y.flags.writeable = False

        logging.info("...done. Loading time %.2f s", float(time.time() - tt0))
Exemplo n.º 10
0
def loadPheno(bed, phenoFile, missingPhenotype="-9", keepDict=False):
    """Load one phenotype and align it with ``bed``.

    The phenotype is loaded with ``phenoUtils.loadOnePhen``, checked against
    ``bed`` with ``checkIntersection`` and then passed through
    ``pstutil.intersect_apply`` together with ``bed``.

    Returns ``(bed, pheno)``. ``pheno`` is the whole phenotype dictionary
    when ``keepDict`` is true, otherwise only its ``"vals"`` entry.
    """
    phenoDict = phenoUtils.loadOnePhen(
        phenoFile, missing=missingPhenotype, vectorize=True)
    checkIntersection(bed, phenoDict, "phenotypes")
    bed, phenoDict = pstutil.intersect_apply([bed, phenoDict])
    if keepDict:
        return bed, phenoDict
    return bed, phenoDict["vals"]
Exemplo n.º 11
0
def load_snp_data(snpreader,
                  pheno_fn,
                  cov_fn=None,
                  offset=True,
                  mpheno=0,
                  standardizer=Unit()):
    """Load phenotype, SNP and covariate data for the shared individuals.

    Parameters
    ----------
    snpreader : snpreader object
        object to read in binary SNP file

    pheno_fn : str
        File name of phenotype file

    cov_fn : str
        File name of covariates file. When None, a single column of ones is
        used as the covariates.

    offset : bool, default=True
        Adds a column of ones to the covariates when none of the covariate
        columns is constant (i.e. every column has non-zero standard
        deviation)

    mpheno : int, default=0
        Passed to ``pstpheno.loadOnePhen`` as its second argument
        (presumably the index of the phenotype column -- TODO confirm)

    standardizer : standardizer object, default=Unit()
        Passed to ``standardize`` on the SNP data that was read

    Returns
    -------
    G : object returned by ``read(order='C', view_ok=True)``
        SNP data after intersection. NOTE(review): earlier documentation
        described this as an array of shape [n_samples, n_features] --
        confirm what callers expect.

    X : array, shape = [n_samples, n_covariates]
        Matrix of covariates (e.g. age, gender)

    y : array, shape = [n_samples]
        Phenotype (target) vector

    """

    #TODO: completely remove this
    pheno = pstpheno.loadOnePhen(pheno_fn, mpheno, vectorize=True)
    geno = snpreader.read(order='C').standardize(standardizer)

    # load covariates or generate vector of ones (for bias)
    if cov_fn is None:
        cov = {'vals': np.ones((len(pheno['iid']), 1)), 'iid': pheno['iid']}
    else:
        cov = pstpheno.loadPhen(cov_fn)

    (y, _), G, (X, _) = pstutil.intersect_apply(
        [(pheno['vals'], pheno['iid']), geno, (cov['vals'], cov['iid'])],
        sort_by_dataset=False)
    G = G.read(order='C', view_ok=True)

    # add bias column if not present
    if offset and np.all(X.std(0) != 0):
        # One row per individual left after the intersection. The earlier
        # code sized this from an undefined name and raised NameError.
        bias = np.ones((X.shape[0], 1))
        X = np.hstack((X, bias))

    return G, X, y
Exemplo n.º 12
0
def loadPheno(bed, phenoFile, missingPhenotype='-9', keepDict=False):
    """Load a single phenotype and align it with ``bed``.

    Loads the phenotype via ``phenoUtils.loadOnePhen``, runs
    ``checkIntersection`` against ``bed`` and passes both through
    ``pstutil.intersect_apply``.

    Returns ``(bed, pheno)``, where ``pheno`` is the phenotype dictionary if
    ``keepDict`` is true and its ``'vals'`` entry otherwise.
    """
    loaded = phenoUtils.loadOnePhen(phenoFile,
                                    missing=missingPhenotype,
                                    vectorize=True)
    checkIntersection(bed, loaded, 'phenotypes')
    bed, loaded = pstutil.intersect_apply([bed, loaded])
    result = loaded if keepDict else loaded['vals']
    return bed, result
Exemplo n.º 13
0
def loadCovars(bed, covarFile):
    """Load covariates, align them with ``bed`` and standardize each column.

    The covariates are loaded with ``phenoUtils.loadOnePhen``, checked with
    ``checkIntersection`` (``checkSuperSet=True``) and passed through
    ``pstutil.intersect_apply`` together with ``bed``. The ``"vals"`` entry
    is then centered and scaled per column with in-place operators.

    Returns the standardized covariate values.
    """
    loaded = phenoUtils.loadOnePhen(covarFile, vectorize=False)
    checkIntersection(bed, loaded, "covariates", checkSuperSet=True)
    _, aligned = pstutil.intersect_apply([bed, loaded])

    covar = aligned["vals"]
    columnMeans = np.mean(covar, axis=0)
    covar -= columnMeans
    # The standard deviation is taken after centering, as before.
    columnStds = np.std(covar, axis=0)
    covar /= columnStds
    return covar
Exemplo n.º 14
0
def loadCovars(bed, covarFile):
    """Return the covariates from ``covarFile``, aligned and standardized.

    Steps: load with ``phenoUtils.loadOnePhen``, validate with
    ``checkIntersection`` (``checkSuperSet=True``), pass through
    ``pstutil.intersect_apply`` with ``bed``, then subtract the column means
    and divide by the column standard deviations in place.
    """
    covarsDict = phenoUtils.loadOnePhen(covarFile, vectorize=False)
    checkIntersection(bed, covarsDict, 'covariates', checkSuperSet=True)
    _, covarsDict = pstutil.intersect_apply([bed, covarsDict])

    values = covarsDict['vals']
    center = np.mean(values, axis=0)
    values -= center
    spread = np.std(values, axis=0)  # computed on the centered values
    values /= spread
    return values
Exemplo n.º 15
0
def loadRelatedFile(bed, relFile):
    """Load a relatedness file and return a mask of individuals to keep.

    The file is loaded with ``phenoUtils.loadOnePhen``, checked with
    ``checkIntersection`` (``checkSuperSet=True``) and passed through
    ``pstutil.intersect_apply`` together with ``bed``.

    Returns a boolean array that is True where the loaded value is below
    0.5. The number of entries that are False is printed.
    """
    relatedDict = phenoUtils.loadOnePhen(relFile, vectorize=True)
    checkIntersection(bed, relatedDict, "relatedness", checkSuperSet=True)
    _, relatedDict = pstutil.intersect_apply([bed, relatedDict])
    related = relatedDict["vals"]
    keepArr = related < 0.5
    # A single-argument print(...) is valid on both Python 2 and Python 3
    # and produces the same text as the former print statement.
    print("{0} individuals will be removed due to high relatedness".format(
        np.sum(~keepArr)))
    return keepArr
Exemplo n.º 16
0
def loadRelatedFile(bed, relFile):
    """Load a relatedness file and return a mask of individuals to keep.

    The file is loaded with ``phenoUtils.loadOnePhen``, checked with
    ``checkIntersection`` (``checkSuperSet=True``) and passed through
    ``pstutil.intersect_apply`` together with ``bed``.

    Returns a boolean array that is True where the loaded value is below
    0.5. The number of entries that are False is printed.
    """
    relatedDict = phenoUtils.loadOnePhen(relFile, vectorize=True)
    checkIntersection(bed, relatedDict, 'relatedness', checkSuperSet=True)
    _, relatedDict = pstutil.intersect_apply([bed, relatedDict])
    related = relatedDict['vals']
    keepArr = (related < 0.5)
    # A single-argument print(...) is valid on both Python 2 and Python 3
    # and produces the same text as the former print statement.
    removedCount = np.sum(~keepArr)
    print('{0} individuals will be removed due to high relatedness'.format(
        removedCount))
    return keepArr
Exemplo n.º 17
0
def main():
    """
    example that compares output to fastlmmc

    Loads the toy data set, runs ``WindowingGwas`` on the standardized SNPs,
    runs ``GwasTest`` on the same files, plots the two sets of log p-values
    against each other, asserts that they agree to 3 decimals and finally
    draws a manhattan plot.
    """

    # set up data
    phen_fn = "../feature_selection/examples/toydata.phe"
    snp_fn = "../feature_selection/examples/toydata.5chrom.bed"
    #chrom_count = 5

    # load data
    ###################################################################
    snp_reader = Bed(snp_fn)
    pheno = pstpheno.loadOnePhen(phen_fn)

    cov = None
    #cov = pstpheno.loadPhen(self.cov_fn)

    snp_reader, pheno, cov = intersect_apply([snp_reader, pheno, cov])

    # make sure input data isn't modified
    G = snp_reader.read(order='C').val
    G = stdizer.Unit().standardize(G)
    G.flags.writeable = False
    y = pheno['vals'][:, 0]
    # This used to be a bare attribute access with no effect; protect y the
    # same way as G.
    y.flags.writeable = False

    # load pcs
    #G_pc = cov['vals']
    #G_pc.flags.writeable = False
    delta = 2.0
    gwas = WindowingGwas(G, y, delta=delta)
    pv = gwas.run_gwas()

    from fastlmm.association.tests.test_gwas import GwasTest
    REML = False
    snp_pos_sim = snp_reader.sid
    snp_pos_test = snp_reader.sid
    os.environ["FastLmmUseAnyMklLib"] = "1"
    gwas_c = GwasTest(snp_fn,
                      phen_fn,
                      snp_pos_sim,
                      snp_pos_test,
                      delta,
                      REML=REML,
                      excludeByPosition=0)
    gwas_c.run_gwas()

    # Scatter the two sets of log p-values against the diagonal.
    import pylab
    pylab.plot(np.log(pv), np.log(gwas_c.p_values), "+")
    pylab.plot(np.arange(-18, 0), np.arange(-18, 0), "-k")
    pylab.show()

    np.testing.assert_array_almost_equal(np.log(pv),
                                         np.log(gwas_c.p_values),
                                         decimal=3)

    simple_manhattan_plot(pv)
Exemplo n.º 18
0
def _fixup(test_snps, G, pheno, covar, count_A1=None):
    """Normalize the four inputs and align them with each other.

    ``test_snps`` and ``G`` go through ``_snps_fixup`` (``G`` falls back to
    ``test_snps`` when it is falsy); ``pheno`` and ``covar`` go through
    ``_pheno_fixup``. Phenotype rows whose value is not equal to itself are
    removed before everything is passed through ``pstutil.intersect_apply``.

    Returns ``(test_snps, G, pheno, covar)``.
    """
    test_snps = _snps_fixup(test_snps, count_A1=count_A1)
    G = _snps_fixup(G or test_snps, count_A1=count_A1)

    pheno = _pheno_fixup(pheno, count_A1=count_A1).read()
    assert pheno.sid_count == 1, "Expect pheno to be just one variable"

    # A value that differs from itself is treated as missing (this is the
    # case for NaN -- assumes pheno.val is a float array).
    has_value = (pheno.val == pheno.val)[:, 0]
    pheno = pheno[has_value, :]

    covar = _pheno_fixup(covar, iid_if_none=pheno.iid, count_A1=count_A1)

    aligned = pstutil.intersect_apply([G, test_snps, pheno, covar])
    G, test_snps, pheno, covar = aligned
    return test_snps, G, pheno, covar
Exemplo n.º 19
0
    def score(self,
              X=None,
              y=None,
              K0_whole_test=None,
              K1_whole_test=None,
              iid_if_none=None,
              return_mse_too=False,
              count_A1=None):
        """
        Method for calculating the negative log likelihood of testing examples.
        If the examples in X,y,  K0_whole_test, K1_whole_test are not the same, they will be reordered and intersected.

        :param X: testing covariate information, optional: 
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type X: a PySnpTools `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ (such as `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__) or string.

        :param y: testing phenotype:
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type y: a PySnpTools `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ (such as `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__) or string.

        :param K0_whole_test: Must be None. Represents the identity similarity matrix.
        :type K0_whole_test: None

        :param K1_whole_test: Must be None. Represents the identity similarity matrix.
        :type K1_whole_test: `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string or `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`__

        :param iid_if_none: Examples to predict for if no X, K0_whole_test, K1_whole_test is provided.
        :type iid_if_none: an ndarray of two strings

        :param return_mse_too: If true, will also return the mean squared error.
          (The value is computed as the sum of the squared differences between the actual and predicted values.)
        :type return_mse_too: bool

        :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
             alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
        :type count_A1: bool

        :rtype: a float of the negative log likelihood and, optionally, a float of the mean squared error.
        """
        with patch.dict('os.environ', {'ARRAY_MODULE': 'numpy'}) as _:
            mean0, covar0 = self.predict(K0_whole_test=K0_whole_test,
                                         K1_whole_test=K1_whole_test,
                                         X=X,
                                         iid_if_none=iid_if_none,
                                         count_A1=count_A1)
            y = _pheno_fixup(y, iid_if_none=covar0.iid, count_A1=count_A1)
            mean, covar, y = intersect_apply([mean0, covar0, y])

            # Read the predicted mean and covariance into arrays once; the
            # mean values are needed both for the density and for the error.
            mean_val = mean.read(order='A', view_ok=True).val
            covar_val = covar.read(order='A', view_ok=True).val
            var = multivariate_normal(mean=mean_val.reshape(-1), cov=covar_val)

            y_actual = y.read().val
            nll = -np.log(var.pdf(y_actual.reshape(-1)))
            if not return_mse_too:
                return nll

            # Subtract the mean *values*; the reader object returned by
            # intersect_apply cannot be used in array arithmetic.
            mse = ((y_actual - mean_val)**2).sum()
            return nll, mse
Exemplo n.º 20
0
def _create_covar_chrom(covar, covar_by_chrom, chrom):
    """Return the covariates to use for ``chrom``.

    When ``covar_by_chrom`` is None, ``covar`` is returned unchanged.
    Otherwise the entry ``covar_by_chrom[chrom]`` is run through
    ``_pheno_fixup``, passed through ``pstutil.intersect_apply`` together
    with ``covar``, and a new ``SnpData`` is returned whose columns are those
    of the aligned ``covar`` followed by the per-chromosome ones.
    """
    if covar_by_chrom is None:
        return covar

    extra = _pheno_fixup(covar_by_chrom[chrom], iid_if_none=covar)
    covar_after, extra = pstutil.intersect_apply([covar, extra])

    merged_sid = np.r_[covar_after.sid, extra.sid]
    # view_ok because np.c_ will allocate new memory.
    merged_val = np.c_[covar_after.read(order='A', view_ok=True).val,
                       extra.read(order='A', view_ok=True).val]
    return SnpData(iid=covar_after.iid, sid=merged_sid, val=merged_val)
Exemplo n.º 21
0
def _create_covar_chrom(covar, covar_by_chrom, chrom, count_A1=None):
    """Return the covariates to use for ``chrom``.

    ``covar`` is returned as is when ``covar_by_chrom`` is None. Otherwise
    ``covar_by_chrom[chrom]`` is run through ``_pheno_fixup`` (forwarding
    ``count_A1``), passed through ``pstutil.intersect_apply`` with ``covar``,
    and both sets of columns are combined into a new ``SnpData``.
    """
    if covar_by_chrom is None:
        return covar

    chrom_covar = _pheno_fixup(covar_by_chrom[chrom],
                               iid_if_none=covar,
                               count_A1=count_A1)
    covar_after, chrom_covar = pstutil.intersect_apply([covar, chrom_covar])

    combined_sid = np.r_[covar_after.sid, chrom_covar.sid]
    # view_ok because np.c_ will allocate new memory.
    combined_val = np.c_[covar_after.read(order='A', view_ok=True).val,
                         chrom_covar.read(order='A', view_ok=True).val]
    return SnpData(iid=covar_after.iid, sid=combined_sid, val=combined_val)
Exemplo n.º 22
0
def load_snp_data(snpreader, pheno_fn, cov_fn=None, offset=True, mpheno=0, standardizer=Unit()):
    """Load phenotype, SNP and covariate data for the shared individuals.

    Parameters
    ----------
    snpreader : snpreader object
        object to read in binary SNP file

    pheno_fn : str
        File name of phenotype file

    cov_fn : str
        File name of covariates file. When None, a single column of ones is
        used as the covariates.

    offset : bool, default=True
        Adds a column of ones to the covariates when none of the covariate
        columns is constant (i.e. every column has non-zero standard
        deviation)

    mpheno : int, default=0
        Passed to ``pstpheno.loadOnePhen`` as its second argument
        (presumably the index of the phenotype column -- TODO confirm)

    standardizer : standardizer object, default=Unit()
        Passed to ``standardize`` on the SNP data that was read

    Returns
    -------
    G : object returned by ``read(order='C', view_ok=True)``
        SNP data after intersection. NOTE(review): earlier documentation
        described this as an array of shape [n_samples, n_features] --
        confirm what callers expect.

    X : array, shape = [n_samples, n_covariates]
        Matrix of covariates (e.g. age, gender)

    y : array, shape = [n_samples]
        Phenotype (target) vector

    """

    #TODO: completely remove this
    pheno = pstpheno.loadOnePhen(pheno_fn, mpheno, vectorize=True)
    geno = snpreader.read(order='C').standardize(standardizer)

    # load covariates or generate vector of ones (for bias)
    if cov_fn is None:
        cov = {'vals': np.ones((len(pheno['iid']), 1)), 'iid': pheno['iid']}
    else:
        cov = pstpheno.loadPhen(cov_fn)

    datasets = [(pheno['vals'], pheno['iid']), geno, (cov['vals'], cov['iid'])]
    (y, _), G, (X, _) = pstutil.intersect_apply(datasets, sort_by_dataset=False)
    G = G.read(order='C', view_ok=True)

    # add bias column if not present
    if offset and np.all(X.std(0) != 0):
        # One row per individual left after the intersection. The earlier
        # code sized this from an undefined name and raised NameError.
        bias = np.ones((X.shape[0], 1))
        X = np.hstack((X, bias))

    return G, X, y
Exemplo n.º 23
0
def main():
    """
    example that compares output to fastlmmc

    Loads the toy data set, runs ``WindowingGwas`` on the standardized SNPs,
    runs ``GwasTest`` on the same files, plots the two sets of log p-values
    against each other, asserts that they agree to 3 decimals and finally
    draws a manhattan plot.
    """

    # set up data
    phen_fn = "../feature_selection/examples/toydata.phe"
    snp_fn = "../feature_selection/examples/toydata.5chrom"
    #chrom_count = 5

    # load data
    ###################################################################
    snp_reader = Bed(snp_fn)
    pheno = pstpheno.loadOnePhen(phen_fn)

    cov = None
    #cov = pstpheno.loadPhen(self.cov_fn)

    snp_reader, pheno, cov = intersect_apply([snp_reader, pheno, cov])

    # make sure input data isn't modified
    G = snp_reader.read(order='C').val
    G = stdizer.Unit().standardize(G)
    G.flags.writeable = False
    y = pheno['vals'][:, 0]
    # This used to be a bare attribute access with no effect; protect y the
    # same way as G.
    y.flags.writeable = False

    # load pcs
    #G_pc = cov['vals']
    #G_pc.flags.writeable = False
    delta = 2.0
    gwas = WindowingGwas(G, y, delta=delta)
    pv = gwas.run_gwas()

    from fastlmm.association.tests.test_gwas import GwasTest
    REML = False
    snp_pos_sim = snp_reader.sid
    snp_pos_test = snp_reader.sid
    os.environ["FastLmmUseAnyMklLib"] = "1"
    gwas_c = GwasTest(snp_fn, phen_fn, snp_pos_sim, snp_pos_test, delta, REML=REML, excludeByPosition=0)
    gwas_c.run_gwas()

    # Scatter the two sets of log p-values against the diagonal.
    import pylab
    pylab.plot(np.log(pv), np.log(gwas_c.p_values), "+")
    pylab.plot(np.arange(-18, 0), np.arange(-18, 0), "-k")
    pylab.show()

    np.testing.assert_array_almost_equal(np.log(pv), np.log(gwas_c.p_values), decimal=3)

    simple_manhattan_plot(pv)
Exemplo n.º 24
0
    def score(self,
              X=None,
              y=None,
              K0_whole_test=None,
              K1_whole_test=None,
              iid_if_none=None,
              return_mse_too=False):
        """
        Method for calculating the negative log likelihood of testing examples.
        If the examples in X,y,  K0_whole_test, K1_whole_test are not the same, they will be reordered and intersected.

        :param X: testing covariate information, optional: 
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type X: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

        :param y: testing phenotype:
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type y: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

        :param K0_whole_test: Must be None. Represents the identity similarity matrix.
        :type K0_whole_test: None

        :param K1_whole_test: Must be None. Represents the identity similarity matrix.
        :type K1_whole_test: :class:`.SnpReader` or a string or :class:`.KernelReader`

        :param iid_if_none: Examples to predict for if no X, K0_whole_test, K1_whole_test is provided.
        :type iid_if_none: an ndarray of two strings

        :param return_mse_too: If true, will also return the mean squared error.
          (The value is computed as the sum of the squared differences between the actual and predicted values.)
        :type return_mse_too: bool

        :rtype: a float of the negative log likelihood and, optionally, a float of the mean squared error.
        """
        mean0, covar0 = self.predict(K0_whole_test=K0_whole_test,
                                     K1_whole_test=K1_whole_test,
                                     X=X,
                                     iid_if_none=iid_if_none)
        y = _pheno_fixup(y, iid_if_none=covar0.iid)
        mean, covar, y = intersect_apply([mean0, covar0, y])

        # Read the predicted mean and covariance into arrays once; the mean
        # values are needed both for the density and for the error.
        mean_val = mean.read(order='A', view_ok=True).val
        covar_val = covar.read(order='A', view_ok=True).val
        var = multivariate_normal(mean=mean_val.reshape(-1), cov=covar_val)

        y_actual = y.read().val
        nll = -np.log(var.pdf(y_actual.reshape(-1)))
        if not return_mse_too:
            return nll

        # Subtract the mean *values*; the reader object returned by
        # intersect_apply cannot be used in array arithmetic.
        mse = ((y_actual - mean_val)**2).sum()
        return nll, mse
Exemplo n.º 25
0
def _create_covar_chrom(covar, covar_by_chrom, chrom):
    """Return the covariate dictionary to use for ``chrom``.

    When ``covar_by_chrom`` is None, ``covar`` is returned unchanged.
    Otherwise ``covar_by_chrom[chrom]`` is run through ``_pheno_fixup``,
    passed through ``pstutil.intersect_apply`` with ``covar``, and a new
    dictionary is returned whose ``'header'`` and ``'vals'`` are those of
    ``covar`` followed by the per-chromosome ones; ``'iid'`` is taken from
    ``covar``.

    An assertion checks that the intersection left the iids of ``covar``
    unchanged.
    """
    if covar_by_chrom is None:
        return covar

    per_chrom = _pheno_fixup(covar_by_chrom[chrom], iid_source_if_none=covar)
    covar_after, per_chrom = pstutil.intersect_apply([covar, per_chrom])
    assert np.all(covar_after['iid'] == covar['iid']), "covar_by_chrom must contain all iids found in the intersection of the other datasets"

    combined_header = covar['header'] + per_chrom['header']
    combined_vals = np.hstack([covar['vals'], per_chrom['vals']])
    return {
        'header': combined_header,
        'vals': combined_vals,
        'iid': covar['iid'],
    }
Exemplo n.º 26
0
    def score(self, X=None, y=None, K0_whole_test=None, K1_whole_test=None, iid_if_none=None, return_mse_too=False):
        """
        Compute the negative log likelihood of the testing examples.
        When X, y, K0_whole_test and K1_whole_test do not describe the same examples, they are reordered and intersected.

        :param X: testing covariate information, optional:
          A string is taken as the file name of a PLINK phenotype-formatted file.
        :type X: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

        :param y: testing phenotype:
          A string is taken as the file name of a PLINK phenotype-formatted file.
        :type y: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

        :param K0_whole_test: A similarity matrix from all the examples to the test examples, or
               the test SNPs needed to construct such a similarity matrix.
               Can be any :class:`.SnpReader`; a string can name a PLINK-formatted Bed file.
               Can be a PySnpTools :class:`.KernelReader`; a string can name a :class:`.KernelNpz` file.
        :type K0_whole_test: :class:`.SnpReader` or a string or :class:`.KernelReader`

        :param K1_whole_test: A second similarity matrix from all the examples to the test examples, or
               the test SNPs needed to construct such a similarity matrix.
               Can be any :class:`.SnpReader`; a string can name a PLINK-formatted Bed file.
               Can be a PySnpTools :class:`.KernelReader`; a string can name a :class:`.KernelNpz` file.
        :type K1_whole_test: :class:`.SnpReader` or a string or :class:`.KernelReader`

        :param iid_if_none: Examples to predict for if no X, K0_whole_test, K1_whole_test is provided.
        :type iid_if_none: an ndarray of two strings

        :param return_mse_too: If true, the mean squared error is returned as well.
        :type return_mse_too: bool

        :rtype: a float of the negative log likelihood and, optionally, a float of the mean squared error.
        """
        mean0, covar0 = self.predict(K0_whole_test=K0_whole_test,
                                     K1_whole_test=K1_whole_test,
                                     X=X,
                                     iid_if_none=iid_if_none)
        y = _pheno_fixup(y, iid_if_none=covar0.iid)

        mean_reader, covar_reader, y_reader = intersect_apply([mean0, covar0, y])
        mean = mean_reader.read(order='A', view_ok=True).val
        covar = covar_reader.read(order='A', view_ok=True).val

        # Density of the predicted multivariate normal at the actual values.
        predictive = multivariate_normal(mean=mean.reshape(-1), cov=covar)
        y_actual = y_reader.read().val
        nll = -np.log(predictive.pdf(y_actual.reshape(-1)))

        if return_mse_too:
            mse = ((y_actual - mean)**2).sum()
            return nll, mse
        return nll
Exemplo n.º 27
0
def _create_covar_chrom(covar, covar_by_chrom, chrom):
    if covar_by_chrom is not None:
        covar_by_chrom_chrom = covar_by_chrom[chrom]
        covar_by_chrom_chrom = _pheno_fixup(covar_by_chrom_chrom,
                                            iid_source_if_none=covar)
        covar_after, covar_by_chrom_chrom = pstutil.intersect_apply(
            [covar, covar_by_chrom_chrom])
        assert np.all(
            covar_after['iid'] == covar['iid']
        ), "covar_by_chrom must contain all iids found in the intersection of the other datasets"

        ret = {
            'header': covar['header'] + covar_by_chrom_chrom['header'],
            'vals': np.hstack([covar['vals'], covar_by_chrom_chrom['vals']]),
            'iid': covar['iid']
        }
        return ret
    else:
        return covar
Exemplo n.º 28
0
        def nested_closure(chrom):
            """
            Build the ``_internal_single`` work item for the test SNPs whose first ``pos`` column equals ``chrom``.

            The kernels come from ``_K_per_chrom``; kernels, test SNPs, phenotype and covariates
            are intersected before block sizes are chosen.
            """
            on_this_chrom = test_snps.pos[:, 0] == chrom
            snps_here = test_snps[:, on_this_chrom]
            covar_here = _create_covar_chrom(covar, covar_by_chrom, chrom)

            # First truthy source wins; test_snps is the last resort for the first kernel.
            k0_source = K0 or G0 or test_snps
            K0_here = _K_per_chrom(k0_source, chrom, test_snps.iid)
            k1_source = K1 or G1
            K1_here = _K_per_chrom(k1_source, chrom, test_snps.iid)

            to_align = [K0_here, K1_here, snps_here, pheno, covar_here]
            K0_here, K1_here, snps_here, pheno_here, covar_here = pstutil.intersect_apply(to_align)
            logging.debug("# of iids now {0}".format(K0_here.iid_count))

            K0_here, K1_here, block_size = _set_block_size(
                K0_here, K1_here, mixing, GB_goal, force_full_rank, force_low_rank)

            return _internal_single(
                K0=K0_here,
                test_snps=snps_here,
                pheno=pheno_here,
                covar=covar_here,
                K1=K1_here,
                mixing=mixing,
                h2=h2,
                log_delta=log_delta,
                cache_file=None,
                force_full_rank=force_full_rank,
                force_low_rank=force_low_rank,
                output_file_name=None,
                block_size=block_size,
                interact_with_snp=interact_with_snp,
                runner=Local())
Exemplo n.º 29
0
    def score(self, X=None, y=None, K0_whole_test=None, K1_whole_test=None, iid_if_none=None, return_mse_too=False, count_A1=None):
        """
        Method for calculating the negative log likelihood of testing examples.
        If the examples in X,y,  K0_whole_test, K1_whole_test are not the same, they will be reordered and intersected.

        :param X: testing covariate information, optional: 
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type X: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

        :param y: testing phenotype:
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type y: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

        :param K0_whole_test: Must be None. Represents the identity similarity matrix.
        :type K0_whole_test: None

        :param K1_whole_test: Must be None. Represents the identity similarity matrix.
        :type K1_whole_test: :class:`.SnpReader` or a string or :class:`.KernelReader`

        :param iid_if_none: Examples to predict for if no X, K0_whole_test, K1_whole_test is provided.
        :type iid_if_none: an ndarray of two strings

        :param return_mse_too: If true, will also return the mean squared error.
          (As computed here, this value is the sum of the squared differences between
          the actual and the predicted phenotype; it is not divided by the number of examples.)
        :type return_mse_too: bool

        :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
             alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
        :type count_A1: bool

        :rtype: a float of the negative log likelihood and, optionally, a float of the mean squared error.
        """
        mean0, covar0 = self.predict(K0_whole_test=K0_whole_test,K1_whole_test=K1_whole_test,X=X,iid_if_none=iid_if_none,count_A1=count_A1)
        y = _pheno_fixup(y, iid_if_none=covar0.iid,count_A1=count_A1)
        mean, covar, y = intersect_apply([mean0, covar0, y])

        # Read the predicted mean and covariance into ndarrays once. The squared-error
        # computation below needs the array of means, not the reader object.
        mean_val = mean.read(order='A',view_ok=True).val
        covar_val = covar.read(order='A',view_ok=True).val

        var = multivariate_normal(mean=mean_val.reshape(-1), cov=covar_val)
        y_actual = y.read().val
        nll = -np.log(var.pdf(y_actual.reshape(-1)))
        if not return_mse_too:
            return nll

        mse = ((y_actual-mean_val)**2).sum()
        return nll, mse
Exemplo n.º 30
0
    def test_intersection_Snp2Dist(self):
        """
        Intersect a distribution view of Bed SNPs with a phenotype that is missing one iid,
        and check which reader types come back from intersection and from slicing.
        """
        from pysnptools.distreader._snp2dist import _Snp2Dist
        from pysnptools.snpreader import Pheno, Bed
        from pysnptools.distreader._subset import _DistSubset
        from pysnptools.snpreader._subset import _SnpSubset
        from pysnptools.util import intersect_apply

        bed_path = self.currentFolder + "/../examples/toydata.5chrom.bed"
        dist_view = Bed(bed_path,count_A1=True).as_dist(max_weight=2)

        # Drop the first iid from the phenotype so the intersection has work to do.
        pheno_path = self.currentFolder + "/../examples/toydata.phe"
        pheno_short = Pheno(pheno_path)[1:,:]

        dist_common, pheno_short = intersect_apply([dist_view, pheno_short])
        # The subsetting should land on the wrapped SNP reader, not on the outer reader.
        assert isinstance(dist_common.snpreader, _SnpSubset)
        assert not isinstance(dist_common, _DistSubset)

        # Slicing the view directly should still give a _Snp2Dist.
        every_other = dist_view[::2,:]
        assert isinstance(every_other, _Snp2Dist)

        logging.info("Done with test_intersection")
Exemplo n.º 31
0
    def test_intersection_Dist2Snp(self):
        """
        Intersect a SNP view of a DistNpz file with a phenotype that is missing one iid,
        and check which reader types come back from intersection and from slicing.
        """
        from pysnptools.snpreader._dist2snp import _Dist2Snp
        from pysnptools.snpreader import Pheno
        from pysnptools.distreader._subset import _DistSubset
        from pysnptools.snpreader._subset import _SnpSubset
        from pysnptools.util import intersect_apply

        dist_path = self.currentFolder + "/../examples/toydata.dist.npz"
        snp_view = DistNpz(dist_path).as_snp(max_weight=25)

        # Drop the first iid from the phenotype so the intersection has work to do.
        pheno_path = self.currentFolder + "/../examples/toydata.phe"
        pheno_short = Pheno(pheno_path)[1:,:]

        snp_common, pheno_short = intersect_apply([snp_view, pheno_short])
        # The subsetting should land on the wrapped distribution reader, not on the outer reader.
        assert isinstance(snp_common.distreader, _DistSubset)
        assert not isinstance(snp_common, _SnpSubset)

        # Slicing the view directly should still give a _Dist2Snp.
        every_other = snp_view[::2,:]
        assert isinstance(every_other, _Dist2Snp)

        logging.info("Done with test_intersection")
Exemplo n.º 32
0
    def test_intersection(self):
        """
        Intersect a SnpKernel with a phenotype that is missing one iid,
        and check which reader types come back from intersection and from slicing.
        """

        from pysnptools.standardizer import Unit
        from pysnptools.kernelreader import SnpKernel
        from pysnptools.snpreader import Pheno
        from pysnptools.kernelreader._subset import _KernelSubset
        from pysnptools.snpreader._subset import _SnpSubset
        from pysnptools.util import intersect_apply

        bed_path = self.currentFolder + "/../examples/toydata"
        kernel = SnpKernel(Bed(bed_path,count_A1=False), stdizer.Identity())

        # Drop the first iid from the phenotype so the intersection has work to do.
        pheno_path = self.currentFolder + "/../examples/toydata.phe"
        pheno_short = Pheno(pheno_path)[1:,:]

        # SnpKernel is special because it standardizes AFTER intersecting, so the
        # subsetting should land on the wrapped SNP reader rather than on the kernel.
        kernel_common, pheno_short = intersect_apply([kernel, pheno_short])
        assert isinstance(kernel_common.snpreader, _SnpSubset)
        assert not isinstance(kernel_common, _KernelSubset)

        # Slicing the kernel directly should still give a SnpKernel.
        every_other = kernel[::2]
        assert isinstance(every_other, SnpKernel)

        logging.info("Done with test_intersection")
Exemplo n.º 33
0
    def predict(self,X=None,K0_whole_test=None,K1_whole_test=None,iid_if_none=None):
        """
        Method for predicting from a fitted :class:`FastLMM` predictor.
        If the examples in X, K0_whole_test, K1_whole_test are not the same, they will be reordered and intersected.

        :param X: testing covariate information, optional: 
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type X: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

        :param K0_whole_test: A similarity matrix from all the examples to the test examples. Alternatively,
               the test SNPs needed to construct such a similarity matrix.
               Can be any :class:`.SnpReader`. If you give a string, can be the name of a PLINK-formatted Bed file.
               Can be PySnpTools :class:`.KernelReader`. If you give a string it can be the name of a :class:`.KernelNpz` file.
        :type K0_whole_test: :class:`.SnpReader` or a string or :class:`.KernelReader`

        :param K1_whole_test: A second similarity matrix from all the examples to the test examples. Alternatively,
               the test SNPs needed to construct such a similarity matrix.
               Can be any :class:`.SnpReader`. If you give a string, can be the name of a PLINK-formatted Bed file.
               Can be PySnpTools :class:`.KernelReader`. If you give a string it can be the name of a :class:`.KernelNpz` file.
        :type K1_whole_test: :class:`.SnpReader` or a string or :class:`.KernelReader`

        :param iid_if_none: Examples to predict for if no X, K0_whole_test, K1_whole_test is provided.
        :type iid_if_none: an ndarray of two strings

        :rtype: A :class:`SnpData` of the means and a :class:`KernelData` of the covariance

        Raises an AssertionError if the predictor is not fitted or if the test covariate sids
        differ from ``self.covar_sid``, and an Exception if the test-by-test block of K0 does
        not share a single iid array for rows and columns.
        """

        assert self.is_fitted, "Can only predict after predictor has been fitted"
        #assert K0_whole_test is not None, "K0_whole_test must be given"
        #!!!later is it too wasteful to keep both G0_train, G1_train, and lmm.G when storing to disk?
        #!!!later all _kernel_fixup's should use block_size input

        # Normalize the two kernel inputs with _kernel_fixup. K1's fallback iids are taken
        # from the fixed-up K0 (iid0 for the rows, iid1 for the test examples).
        K0_whole_test_b = _kernel_fixup(K0_whole_test, train_snps=self.G0_train, iid_if_none=iid_if_none, standardizer=self.mixer.snp_trained0, test=K0_whole_test, test_iid_if_none=None, block_size=self.block_size)
        K1_whole_test = _kernel_fixup(K1_whole_test, train_snps=self.G1_train, iid_if_none=K0_whole_test_b.iid0, standardizer=self.mixer.snp_trained1, test=K1_whole_test, test_iid_if_none=K0_whole_test_b.iid1, block_size=self.block_size)
        # Covariates fall back to K0's iid1 when X is not given (presumably the test examples; see iid_if_none).
        X = _pheno_fixup(X,iid_if_none=K0_whole_test_b.iid1)
        # Bring both kernels and the covariates onto a common set of examples.
        K0_whole_test_c, K1_whole_test, X = intersect_apply([K0_whole_test_b, K1_whole_test, X],intersect_before_standardize=True,is_test=True)
        # Standardize the test covariates with the stored self.covar_unit_trained
        # (presumably learned at fit time -- confirm against fit).
        X = X.read().standardize(self.covar_unit_trained)
        # add a column of 1's to cov to increase DOF of model (and accuracy) by allowing a constant offset
        X = SnpData(iid=X.iid,
                              sid=self._new_snp_name(X),
                              val=np.c_[X.read().val,np.ones((X.iid_count,1))])
        assert np.array_equal(X.sid,self.covar_sid), "Expect covar sids to be the same in train and test."

        # Select the kernel rows (iid0) of the stored training iids: the train-by-test blocks.
        train_idx0 = K0_whole_test_c.iid0_to_index(self.K_train_iid)
        K0_train_test = K0_whole_test_c[train_idx0,:]
        train_idx1 = K1_whole_test.iid0_to_index(self.K_train_iid)
        K1_train_test = K1_whole_test[train_idx1,:]
        # Select the kernel rows of the test iids (K0's iid1): the test-by-test blocks.
        test_idx0 = K0_whole_test_c.iid0_to_index(K0_whole_test_c.iid1)
        K0_test_test = K0_whole_test_c[test_idx0,:]
        # Identity check, not equality: rows and columns must be the very same iid array object.
        if K0_test_test.iid0 is not K0_test_test.iid1:
            raise Exception("real assert")
        test_idx1 = K1_whole_test.iid0_to_index(K0_whole_test_c.iid1)
        K1_test_test = K1_whole_test[test_idx1,:]

        if self.mixer.do_g:
            ###################################################
            # low rank from Rasmussen  eq 2.9 + noise term added to covar
            ###################################################
            Gstar = self.mixer.g_mix(K0_train_test,K1_train_test)
            # Split sigma2 into an h2 share (varg) and a (1-h2) share (vare).
            varg = self.h2 * self.sigma2
            vare = (1.-self.h2) * self.sigma2
            Ainv = LA.inv((1./vare) * np.dot(self.G.T,self.G) + (1./varg)*np.eye(self.G.shape[1]))
            testAinv = np.dot(Gstar.test.val, Ainv)
            # Fixed-effect prediction plus a term built from the training residual y - X*beta.
            pheno_predicted = np.dot(X.val,self.beta) + (1./vare) * np.dot(np.dot(testAinv,self.G.T),self.y-np.dot(self.X,self.beta))
            pheno_predicted = pheno_predicted.reshape(-1,1)
            covar  = np.dot(testAinv,Gstar.test.val.T) + vare * np.eye(Gstar.test.val.shape[0])

        else:
            # Populate a fresh LMM with the stored state and let it compute mean and covariance.
            lmm = LMM()
            lmm.U = self.U
            lmm.S = self.S
            lmm.G = self.G
            lmm.y = self.y
            lmm.Uy = self.Uy
            lmm.X = self.X
            lmm.UX = self.UX

            Kstar = self.mixer.k_mix(K0_train_test,K1_train_test) #!!!later do we need/want reads here? how about view_OK?
            lmm.setTestData(Xstar=X.val, K0star=Kstar.val.T)

            Kstar_star = self.mixer.k_mix(K0_test_test,K1_test_test) #!!!later do we need/want reads here?how about view_OK?
            pheno_predicted, covar = lmm.predict_mean_and_variance(beta=self.beta, h2=self.h2,sigma2=self.sigma2, Kstar_star=Kstar_star.val)

        #pheno_predicted = lmm.predictMean(beta=self.beta, h2=self.h2,scale=self.sigma2).reshape(-1,1)
        # Package the predicted means as a SnpData whose single pos row is all NaN.
        ret0 = SnpData(iid = X.iid, sid=self.pheno_sid,val=pheno_predicted,pos=np.array([[np.nan,np.nan,np.nan]]),name="lmm Prediction")

        # Package the covariance as a KernelData over the test iids.
        from pysnptools.kernelreader import KernelData
        ret1 = KernelData(iid=K0_test_test.iid,val=covar)
        return ret0, ret1
Exemplo n.º 34
0
def single_snp(test_snps,pheno,
                 G0=None, G1=None, mixing=None,
                 covar=None, output_file_name=None, h2=None, log_delta=None,
                 cache_file = None):
    """
    Function performing single SNP GWAS with REML

    :param test_snps: SNPs to test. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
    :type test_snps: a :class:`.SnpReader` or a string

    :param pheno: A single phenotype: A 'pheno dictionary' contains an ndarray on the 'vals' key and a iid list on the 'iid' key.
      If you give a string, it should be the file name of a PLINK phenotype-formatted file.
    :type pheno: a 'pheno dictionary' or a string

    :param G0: SNPs from which to construct a similarity matrix.
          If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
    :type G0: a :class:`.SnpReader` or a string

    :param G1: SNPs from which to construct a second similarity kernel, optional. Also, see 'mixing').
          If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
    :type G1: a :class:`.SnpReader` or a string

    :param mixing: Weight between 0.0 (inclusive, default) and 1.0 (inclusive) given to G1 relative to G0.
            If you give no mixing number and a G1 is given, the best weight will be learned.
    :type mixing: number

    :param covar: covariate information, optional: A 'pheno dictionary' contains an ndarray on the 'vals' key and a iid list on the 'iid' key.
      If you give a string, it should be the file name of a PLINK phenotype-formatted file.
    :type covar: a 'pheno dictionary' or a string

    :param output_file_name: Name of file to write results to, optional. If not given, no output file will be created.
      The file is written as tab-delimited text without the index column.
    :type output_file_name: file name

    :param h2: A parameter to LMM learning, optional
            If not given will search for best value.
            If mixing is unspecified, then h2 must also be unspecified.
    :type h2: number

    :param log_delta: a re-parameterization of h2 provided for backwards compatibility.
    :type log_delta: number


    :param cache_file: Name of  file to read or write cached precomputation values to, optional.
                If not given, no cache file will be used.
                If given and file does not exists, will write precomputation values to file.
                If given and file does exists, will read precomputation values from file.
                The file contains the U and S matrix from the decomposition of the training matrix. It is in Python's np.savez (*.npz) format.
                Calls using the same cache file should have the same 'G0' and 'G1'
                If given and the file does exist then G0 and G1 need not be given.
    :type cache_file: file name

    :rtype: Pandas dataframe with one row per test SNP, sorted by ascending "PValue". Columns include "PValue"

    :Example:

    >>> import logging
    >>> import numpy as np
    >>> from fastlmm.association import single_snp
    >>> from pysnptools.snpreader import Bed
    >>> logging.basicConfig(level=logging.INFO)
    >>> snpreader = Bed("../feature_selection/examples/toydata")
    >>> pheno_fn = "../feature_selection/examples/toydata.phe"
    >>> results_dataframe = single_snp(test_snps=snpreader[:,5000:10000],pheno=pheno_fn,G0=snpreader[:,0:5000],h2=.2,mixing=0)
    >>> print results_dataframe.iloc[0].SNP,round(results_dataframe.iloc[0].PValue,7),len(results_dataframe)
    null_7487 3.4e-06 5000

    """
    t0 = time.time()
    test_snps = _snp_fixup(test_snps)
    pheno = _pheno_fixup(pheno)
    covar = _pheno_fixup(covar, iid_source_if_none=pheno)

    if G0 is not None or G1 is not None:
        G0 = _snp_fixup(G0)
        G1 = _snp_fixup(G1, iid_source_if_none=G0)
        # Align every input on the same individuals before standardizing the kernel SNPs.
        G0, G1, test_snps, pheno, covar = pstutil.intersect_apply([G0, G1, test_snps, pheno, covar])
        G0_standardized = G0.read().standardize()
        G1_standardized = G1.read().standardize()
    else:
        # No kernel SNPs were given; _internal_single receives None for both.
        test_snps, pheno, covar = pstutil.intersect_apply([test_snps, pheno, covar])
        G0_standardized, G1_standardized = None, None

    frame =  _internal_single(G0_standardized=G0_standardized, test_snps=test_snps, pheno=pheno,
                                covar=covar, G1_standardized=G1_standardized, 
                                mixing=mixing, h2=h2, log_delta=log_delta,
                                cache_file = cache_file)

    # DataFrame.sort was removed in pandas 0.20; sort_values is its replacement.
    # Keep the old call only for pandas versions that predate sort_values.
    if hasattr(frame, "sort_values"):
        frame.sort_values(by="PValue", inplace=True)
    else:
        frame.sort("PValue", inplace=True)
    frame.index = np.arange(len(frame))

    if output_file_name is not None:
        frame.to_csv(output_file_name, sep="\t", index=False)

    logging.info("PhenotypeName\t{0}".format(pheno['header']))
    if G0 is not None:
        logging.info("SampleSize\t{0}".format(G0.iid_count))
        logging.info("SNPCount\t{0}".format(G0.sid_count))
    logging.info("Runtime\t{0}".format(time.time()-t0))

    return frame
Exemplo n.º 35
0
def single_snp_linreg(test_snps,
                      pheno,
                      covar=None,
                      max_output_len=None,
                      output_file_name=None,
                      GB_goal=None,
                      runner=None,
                      count_A1=None):
    """
    Function performing single SNP GWAS using linear regression. Will reorder and intersect IIDs as needed.

    :param test_snps: SNPs to test. Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type test_snps: a `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string

    :param pheno: A single phenotype: Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__, for example, `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           Any IIDs with missing values will be removed.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type pheno: a `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string

    :param covar: covariate information, optional: Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__, for example, `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type covar: a `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string


    :param max_output_len: Maximum number of Pvalues to return. Default to None, which means 'Return all'.
    :type max_output_len: number
    
    :param output_file_name: Name of file to write results to, optional. If not given, no output file will be created. The output format is tab-delimited text.
    :type output_file_name: file name

    :param GB_goal: gigabytes of memory the run should use, optional. If not given, all the test_snps are read as a single block.
        If given, the test_snps are read in blocks sized so that one block of 8-byte values fits the goal.
    :type GB_goal: number

    :param runner: `Runner <http://fastlmm.github.io/PySnpTools/#util-mapreduce1-runner-runner>`__, optional: Tells how to run locally, multi-processor, or on a cluster.
        If not given, the function is run locally.
    :type runner: `Runner <http://fastlmm.github.io/PySnpTools/#util-mapreduce1-runner-runner>`__

    :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
         alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
    :type count_A1: bool

    :rtype: Pandas dataframe with one row per test SNP, sorted by ascending "PValue". Columns include "PValue"


    :Example:

    >>> import logging
    >>> import numpy as np
    >>> from fastlmm.association import single_snp_linreg
    >>> from pysnptools.snpreader import Bed
    >>> from fastlmm.util import example_file # Download and return local file name
    >>> logging.basicConfig(level=logging.INFO)
    >>> pheno_fn = example_file("fastlmm/feature_selection/examples/toydata.phe")
    >>> test_snps = example_file("fastlmm/feature_selection/examples/toydata.5chrom.*","*.bed")
    >>> results_dataframe = single_snp_linreg(test_snps=test_snps, pheno=pheno_fn, count_A1=False)
    >>> print(results_dataframe.iloc[0].SNP,round(results_dataframe.iloc[0].PValue,7),len(results_dataframe))
    null_576 1e-07 10000


    """
    # The ARRAY_MODULE environment variable is forced to 'numpy' for the duration of the call.
    with patch.dict('os.environ', {'ARRAY_MODULE': 'numpy'}) as _:

        assert test_snps is not None, "test_snps must be given as input"
        test_snps = _snps_fixup(test_snps, count_A1=count_A1)
        pheno = _pheno_fixup(pheno, count_A1=count_A1).read()
        assert pheno.sid_count == 1, "Expect pheno to be just one variable"
        # val == val is False exactly where val is NaN, so this drops iids with a missing phenotype.
        pheno = pheno[(pheno.val == pheno.val)[:, 0], :]
        covar = _pheno_fixup(covar, iid_if_none=pheno.iid)
        test_snps, pheno, covar = pstutil.intersect_apply(
            [test_snps, pheno, covar])
        logging.debug("# of iids now {0}".format(test_snps.iid_count))

        if GB_goal is not None:
            bytes_per_sid = test_snps.iid_count * 8
            sid_per_GB_goal = 1024.0**3 * GB_goal / bytes_per_sid
            block_size = max(1, int(sid_per_GB_goal + .5))
            # Ceiling division: a final partial block still counts as a block.
            block_count = -(-test_snps.sid_count // block_size)
        else:
            block_count = 1
            block_size = test_snps.sid_count
        logging.debug("block_count={0}, block_size={1}".format(
            block_count, block_size))

        #!!!what about missing data in covar, in test_snps, in y
        covar = np.c_[
            covar.read(view_ok=True, order='A').val,
            np.ones((test_snps.iid_count,
                     1))]  #view_ok because np.c_ will allocate new memory
        y = pheno.read(
            view_ok=True, order='A'
        ).val  #view_ok because this code already did a fresh read to look for any missing values

        def mapper(start):
            """Return the p-values and SNP indices for the block of test SNPs starting at ``start``."""
            logging.info(
                "single_snp_linereg reading start={0},block_size={1}".format(
                    start, block_size))
            snp_index = np.arange(start,
                                  min(start + block_size, test_snps.sid_count))
            x = test_snps[:, start:start + block_size].read().standardize().val
            logging.info("single_snp_linereg linreg")
            _, pval_in = lin_reg.f_regression_cov_alt(x, y, covar)
            logging.info("single_snp_linereg done")
            pval_in = pval_in.reshape(-1)

            if max_output_len is None:
                return pval_in, snp_index
            else:  #We only need to return the top max_output_len results
                sort_index = np.argsort(pval_in)[:max_output_len]
                return pval_in[sort_index], snp_index[sort_index]

        def reducer(pval_and_snp_index_sequence):
            """Merge the per-block results into one dataframe sorted by p-value."""
            pval_list = []
            snp_index_list = []
            for pval, snp_index in pval_and_snp_index_sequence:
                pval_list.append(pval)
                snp_index_list.append(snp_index)
            pval = np.concatenate(pval_list)
            snp_index = np.concatenate(snp_index_list)
            sort_index = np.argsort(pval)
            if max_output_len is not None:
                sort_index = sort_index[:max_output_len]
            index = snp_index[sort_index]

            dataframe = pd.DataFrame(index=np.arange(len(index)),
                                     columns=('sid_index', 'SNP', 'Chr',
                                              'GenDist', 'ChrPos', 'PValue'))
            #!!Is this the only way to set types in a dataframe?
            # Builtin float replaces the np.float alias, which was removed in NumPy 1.24.
            for column in ('sid_index', 'Chr', 'GenDist', 'ChrPos', 'PValue'):
                dataframe[column] = dataframe[column].astype(float)

            dataframe['sid_index'] = index
            dataframe['SNP'] = np.array(
                test_snps.sid[index], dtype='str'
            )  #This will be ascii on Python2 and unicode on Python3
            dataframe['Chr'] = test_snps.pos[index, 0]
            dataframe['GenDist'] = test_snps.pos[index, 1]
            dataframe['ChrPos'] = test_snps.pos[index, 2]
            dataframe['PValue'] = pval[sort_index]

            if output_file_name is not None:
                dataframe.to_csv(output_file_name, sep="\t", index=False)

            return dataframe

        # NOTE(review): covar is the ndarray built above at this point, not the original reader.
        dataframe = map_reduce(range(0, test_snps.sid_count, block_size),
                               mapper=mapper,
                               reducer=reducer,
                               input_files=[test_snps, pheno, covar],
                               output_files=[output_file_name],
                               name="single_snp_linreg",
                               runner=runner)
        return dataframe
Exemplo n.º 36
0
def loadCovars(bed, covarFile):
    """Load covariates from ``covarFile`` and return their values aligned to ``bed``.

    The covariate file is read with ``phenoUtils.loadPhen``, validated against
    ``bed`` via ``checkIntersection(..., checkSuperSet=True)``, and then
    intersected with ``bed`` through ``pstutil.intersect_apply`` so that both
    refer to the same individuals.

    :param bed: the SNP reader the covariates are matched against.
    :param covarFile: name of the covariate file handed to ``phenoUtils.loadPhen``.
    :return: the ``'vals'`` entry of the intersected covariate dictionary.
    """
    loaded_covars = phenoUtils.loadPhen(covarFile)

    # Validate the overlap of individuals before intersecting.
    checkIntersection(bed, loaded_covars, 'covariates', checkSuperSet=True)

    # Only the covariate half of the intersected pair is needed here.
    _aligned_bed, aligned_covars = pstutil.intersect_apply([bed, loaded_covars])
    return aligned_covars['vals']
Exemplo n.º 37
0
    def test_old(self):
        """Run feature selection to build G1, then call ``single_snp`` with G0 and G1.

        Steps:
          1. Open the synthetic bed / phenotype / covariate files and intersect
             their sample ids.
          2. Use chromosome 5 as the SNPs under test and every other chromosome
             as the background SNPs (G0).
          3. Run ``FeatureSelectionInSample`` on G0 to choose the columns of G1.
          4. Call ``single_snp`` (without leave-one-chromosome-out) and pass the
             resulting frame to ``self.compare_files`` under the name "old".
        """
        from fastlmm.feature_selection.feature_selection_two_kernel import FeatureSelectionInSample
        from pysnptools.util import intersect_apply

        do_plot = False

        logging.info("TestSingleSnpAllPlusSelect test_old")

        root = self.pythonpath
        bed_path = root + "/tests/datasets/synth/all.bed"
        pheno_path = root + "/tests/datasets/synth/pheno_10_causals.txt"
        cov_path = root + "/tests/datasets/synth/cov.txt"

        # Open the three inputs and keep only the samples they have in common.
        snp_reader, pheno, cov = intersect_apply([
            Bed(bed_path, count_A1=False),
            Pheno(pheno_path),
            Pheno(cov_path),
        ])

        # Split the SNPs by chromosome: chr5 is tested, the rest forms G0.
        test_chr = 5
        snp_chrom = snp_reader.pos[:, 0]
        G0 = snp_reader[:, snp_chrom != test_chr].read(order='C').standardize()
        test_snps = snp_reader[:, snp_chrom == test_chr].read(order='C').standardize()

        # Phenotype column, centred and scaled in place.
        pheno_vec = pheno.read().val[:, 0]
        pheno_vec -= pheno_vec.mean()
        pheno_vec /= pheno_vec.std()

        # Covariate matrix, made read-only so nothing downstream can modify it.
        cov_vals = cov.read().val
        cov_vals.flags.writeable = False

        # Feature selection on G0 decides which SNPs make up G1.
        logging.info(
            "running feature selection conditioned on background kernel")
        selector = FeatureSelectionInSample(
            max_log_k=7,
            n_folds=7,
            order_by_lmm=True,
            measure="ll",
            random_state=42,
        )
        selection = selector.run_select(G0.val, G0.val, pheno_vec, cov=cov_vals)
        best_k, feat_idx, best_mix, best_delta = selection

        # Optional plot of the out-of-sample error.
        if do_plot:
            selector.plot_results(measure="ll")

        logging.info("best_k:{0}".format(best_k))
        logging.info("best_mix:{0}".format(best_mix))
        logging.info("best_delta:{0}".format(best_delta))

        # G1 is G0 restricted to the selected SNP columns.
        logging.info(feat_idx)
        G1 = G0[:, feat_idx]

        out_fn = self.file_name("old")
        results_df = single_snp(
            test_snps,
            pheno,
            G0=G0,
            G1=G1,
            mixing=best_mix,
            h2=None,
            leave_out_one_chrom=False,
            output_file_name=out_fn,
            count_A1=False,
        )

        logging.info("results:")
        logging.info("#" * 40)
        logging.info(results_df.head())
        self.compare_files(results_df, "old")
Exemplo n.º 38
0
    def fit(self,
            X=None,
            y=None,
            K0_train=None,
            K1_train=None,
            h2raw=None,
            mixing=None,
            count_A1=None):  #!!!is this h2 or h2corr????
        """
        Method for training a :class:`FastLMM` predictor. If the examples in X, y, K0_train, K1_train are not the same, they will be reordered and intersected.

        The fitted state (beta, h2raw, sigma2, the LMM's U/S/K/G/y/Uy/X/UX, the mixer, the trained
        covariate standardizer, and the training iids/sids) is stored on ``self`` and ``self`` is returned.

        :param X: training covariate information, optional:
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type X: a PySnpTools `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__
          (such as `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__) or string.

        :param y: training phenotype (required; must contain exactly one variable):
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type y: a PySnpTools `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__
          (such as `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__) or string.

        :param K0_train: A similarity matrix or SNPs from which to construct such a similarity matrix.
               Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__.
               If you give a string, can be the name of a PLINK-formatted Bed file.
               Can be PySnpTools `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`__.
               If you give a string it can be the name of a `KernelNpz <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelnpz>`__ file.
        :type K0_train: `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string or
               `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`__

        :param K1_train: A second similarity matrix or SNPs from which to construct such a second similarity matrix. (Also, see 'mixing').
               Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__. If you give a string, can be the name of a PLINK-formatted Bed file.
               Can be PySnpTools `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`__.
               If you give a string it can be the name of a `KernelNpz <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelnpz>`__ file.
        :type K1_train: `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string or
               `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`__

        :param h2raw: A parameter to LMM learning that tells how much weight to give the K's vs. the identity matrix, optional
                If not given will search for best value.
                If mixing is unspecified, then h2raw must also be unspecified.
        :type h2raw: number

        :param mixing: Weight between 0.0 (inclusive, default) and 1.0 (inclusive) given to K1_train relative to K0_train.
                If you give no mixing number and a K1_train is given, the best weight will be learned.
        :type mixing: number

        :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
             alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
        :type count_A1: bool

        :raises AssertionError: if ``y`` is None, if ``y`` holds more than one variable, or if the combined
             kernel is SNP-defined (``mixer.do_g``) but its standardizer is not a ``StandardizerIdentity``.

        :rtype: self, the fitted FastLMM predictor
        """
        # The ARRAY_MODULE environment variable is set to 'numpy' only while this block runs
        # (assuming `patch` is unittest.mock's `patch`, patch.dict restores os.environ on exit).
        with patch.dict('os.environ', {'ARRAY_MODULE': 'numpy'}) as _:

            # NOTE(review): the flag is set before any validation or fitting work, so it
            # remains True even if one of the steps below raises.
            self.is_fitted = True
            # should this have a cache file like 'single_snp'?
            #!!!later what happens if missing values in pheno_train?
            #!!!later add code so that X, y, etc can be array-like objects without iid information. In that case, make up iid info

            assert y is not None, "y must be given"

            # Normalize the phenotype and covariate inputs; when X is None, y's iids are offered as the fallback.
            y = _pheno_fixup(y, count_A1=count_A1)
            assert y.sid_count == 1, "Expect y to be just one variable"
            X = _pheno_fixup(X, iid_if_none=y.iid, count_A1=count_A1)

            # Normalize both kernel inputs the same way, using the predictor's SNP standardizer.
            K0_train = _kernel_fixup(K0_train,
                                     iid_if_none=y.iid,
                                     standardizer=self.snp_standardizer,
                                     count_A1=count_A1)
            K1_train = _kernel_fixup(K1_train,
                                     iid_if_none=y.iid,
                                     standardizer=self.snp_standardizer,
                                     count_A1=count_A1)

            # Reorder/intersect all four inputs so that they refer to the same individuals.
            K0_train, K1_train, X, y = intersect_apply(
                [K0_train, K1_train, X, y], intersect_before_standardize=True
            )  #!!! test this on both K's as None
            from fastlmm.association.single_snp import _set_block_size
            K0_train, K1_train, block_size = _set_block_size(
                K0_train, K1_train, mixing, self.GB_goal, self.force_full_rank,
                self.force_low_rank)

            X = X.read()
            # If possible, unit standardize train and test together. If that is not possible, unit standardize only train and later apply
            # the same linear transformation to test. Unit standardization is necessary for FastLMM to work correctly.
            #!!!later is the calculation of the training data's stats done twice???
            X, covar_unit_trained = X.standardize(
                self.covariate_standardizer,
                block_size=block_size,
                return_trained=True)  #This also fills missing with the mean

            # add a column of 1's to cov to increase DOF of model (and accuracy) by allowing a constant offset
            X = SnpData(iid=X.iid,
                        sid=self._new_snp_name(X),
                        val=np.c_[X.val, np.ones((X.iid_count, 1))],
                        name="covariate_train w/ 1's")

            y0 = y.read(
            ).val  #!!!later would view_ok=True,order='A' be ok because this code already did a fresh read to look for any missing values

            # Combine the two kernels; the helper also returns the h2raw to use
            # (possibly different from the argument) and the mixer object kept for predict().
            from fastlmm.association.single_snp import _Mixer  #!!!move _combine_the_best_way to another file (e.g. this one)
            K_train, h2raw, mixer = _Mixer.combine_the_best_way(
                K0_train,
                K1_train,
                X.val,
                y0,
                mixing,
                h2raw,
                force_full_rank=self.force_full_rank,
                force_low_rank=self.force_low_rank,
                kernel_standardizer=self.kernel_standardizer,
                block_size=block_size)

            # do final prediction using lmm.py
            lmm = LMM()

            #Special case: The K kernel is defined implicitly with SNP data
            if mixer.do_g:
                assert isinstance(
                    K_train.standardizer,
                    StandardizerIdentity), "Expect Identity standardizer"
                # NOTE(review): G_train is assigned but never read again in this method.
                G_train = K_train.snpreader
                lmm.setG(G0=K_train.snpreader.val)
            else:
                lmm.setK(K0=K_train.val)

            lmm.setX(X.val)
            lmm.sety(y0[:, 0])

            # Find the best h2 and also on covariates (not given from new model)
            if h2raw is None:
                res = lmm.findH2()  #!!!why is REML true in the return???
            else:
                # An h2raw is available (from the caller or from the mixer): evaluate at that value instead of searching.
                res = lmm.nLLeval(h2=h2raw)

            #We compute sigma2 instead of using res['sigma2'] because res['sigma2'] is only the pure noise.
            # Sum of squared residuals of the covariate-only prediction (X.val . beta) against y0,
            # divided by the number of individuals.
            full_sigma2 = float(
                sum((np.dot(X.val, res['beta']).reshape(-1, 1) - y0)**
                    2)) / y.iid_count  #!!! this is non REML. Is that right?

            ###### all references to 'fastlmm_model' should be here so that we don't forget any
            # Everything predict() reads later is stored on self from here on.
            self.block_size = block_size
            self.beta = res['beta']
            self.h2raw = res['h2']
            self.sigma2 = full_sigma2
            self.U = lmm.U
            self.S = lmm.S
            self.K = lmm.K
            self.G = lmm.G
            self.y = lmm.y
            self.Uy = lmm.Uy
            self.X = lmm.X
            self.UX = lmm.UX
            self.mixer = mixer
            self.covar_unit_trained = covar_unit_trained
            self.K_train_iid = K_train.iid
            self.covar_sid = X.sid
            self.pheno_sid = y.sid
            # Training SNP readers are kept only when the corresponding kernel is a SnpKernel; otherwise None.
            self.G0_train = K0_train.snpreader if isinstance(
                K0_train, SnpKernel) else None  #!!!later expensive?
            self.G1_train = K1_train.snpreader if isinstance(
                K1_train, SnpKernel) else None  #!!!later expensive?
            return self
Exemplo n.º 39
0
    def predict(self,
                X=None,
                K0_whole_test=None,
                K1_whole_test=None,
                iid_if_none=None,
                count_A1=None):
        """
        Method for predicting from a fitted :class:`FastLMM` predictor.
        If the examples in X, K0_whole_test, K1_whole_test are not the same, they will be reordered and intersected.

        :param X: testing covariate information, optional:
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type X: a PySnpTools `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ (such as `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__) or string.

        :param K0_whole_test: A similarity matrix from all the examples to the test examples. Alternatively,
               the test SNPs needed to construct such a similarity matrix.
               Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__. If you give a string, can be the name of a PLINK-formatted Bed file.
               Can be PySnpTools `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`__. If you give a string it can be the name of a `KernelNpz <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelnpz>`__ file.
        :type K0_whole_test: `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string or `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`__

        :param K1_whole_test: A second similarity matrix from all the examples to the test examples. Alternatively,
               the test SNPs needed to construct such a similarity matrix.
               Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__. If you give a string, can be the name of a PLINK-formatted Bed file.
               Can be PySnpTools `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`__. If you give a string it can be the name of a `KernelNpz <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelnpz>`__ file.
        :type K1_whole_test: `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string or `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`__

        :param iid_if_none: Examples to predict for if no X, K0_whole_test, K1_whole_test is provided.
        :type iid_if_none: an ndarray of two strings

        :param count_A1: Forwarded unchanged to ``_kernel_fixup`` and ``_pheno_fixup``. Presumably, as in :meth:`fit`,
               it tells whether A1 or A2 alleles are counted when SNP data is read from a BED-formatted file (confirm against those helpers).
        :type count_A1: bool

        :raises AssertionError: if the predictor has not been fitted, or if the covariate sids (including the added
               constant column) differ from those seen during training.
        :raises Exception: ("real assert") if the row and column iid arrays of the test-by-test slice of K0 are not the same object.

        :return: a pair ``(mean, covariance)``.
        :rtype: A `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__ of the means and a :class:`KernelData` of the covariance
        """
        # The ARRAY_MODULE environment variable is set to 'numpy' only while this block runs
        # (assuming `patch` is unittest.mock's `patch`, patch.dict restores os.environ on exit).
        with patch.dict('os.environ', {'ARRAY_MODULE': 'numpy'}) as _:

            assert self.is_fitted, "Can only predict after predictor has been fitted"
            #assert K0_whole_test is not None, "K0_whole_test must be given"
            #!!!later is it too wasteful to keep both G0_train, G1_train, and lmm.G when storing to disk?
            #!!!later all _kernel_fixup's should use block_size input

            # Normalize the first kernel input, pairing it with the training SNPs and the
            # standardizer learned for kernel 0 during fit().
            K0_whole_test_b = _kernel_fixup(
                K0_whole_test,
                train_snps=self.G0_train,
                iid_if_none=iid_if_none,
                standardizer=self.mixer.snp_trained0,
                test=K0_whole_test,
                test_iid_if_none=None,
                block_size=self.block_size,
                count_A1=count_A1)
            # The second kernel falls back to the iids of the first one when it is not given.
            K1_whole_test = _kernel_fixup(
                K1_whole_test,
                train_snps=self.G1_train,
                iid_if_none=K0_whole_test_b.iid0,
                standardizer=self.mixer.snp_trained1,
                test=K1_whole_test,
                test_iid_if_none=K0_whole_test_b.iid1,
                block_size=self.block_size,
                count_A1=count_A1)
            # Covariates default to the test iids (iid1) of the first kernel.
            X = _pheno_fixup(X,
                             iid_if_none=K0_whole_test_b.iid1,
                             count_A1=count_A1)
            K0_whole_test_c, K1_whole_test, X = intersect_apply(
                [K0_whole_test_b, K1_whole_test, X],
                intersect_before_standardize=True,
                is_test=True)
            # Apply the covariate standardization that was trained in fit().
            X = X.read().standardize(self.covar_unit_trained)
            # add a column of 1's to cov to increase DOF of model (and accuracy) by allowing a constant offset
            X = SnpData(iid=X.iid,
                        sid=self._new_snp_name(X),
                        val=np.c_[X.read().val,
                                  np.ones((X.iid_count, 1))])
            assert np.array_equal(
                X.sid, self.covar_sid
            ), "Expect covar sids to be the same in train and test."

            # Slice each whole-by-test kernel twice: rows of the training iids (train x test)
            # and rows of the test iids themselves (test x test).
            train_idx0 = K0_whole_test_c.iid0_to_index(self.K_train_iid)
            K0_train_test = K0_whole_test_c[train_idx0, :]
            train_idx1 = K1_whole_test.iid0_to_index(self.K_train_iid)
            K1_train_test = K1_whole_test[train_idx1, :]
            test_idx0 = K0_whole_test_c.iid0_to_index(K0_whole_test_c.iid1)
            K0_test_test = K0_whole_test_c[test_idx0, :]
            # Identity (not equality) check: the slice is expected to expose one shared iid
            # array for rows and columns. Raised as a real exception rather than an `assert`.
            if K0_test_test.iid0 is not K0_test_test.iid1:
                raise Exception("real assert")
            test_idx1 = K1_whole_test.iid0_to_index(K0_whole_test_c.iid1)
            K1_test_test = K1_whole_test[test_idx1, :]

            if self.mixer.do_g:
                ###################################################
                # low rank from Rasmussen  eq 2.9 + noise term added to covar
                ###################################################
                Gstar = self.mixer.g_mix(K0_train_test, K1_train_test)
                # Split the total variance sigma2 into a genetic part (h2raw) and a noise part (1 - h2raw).
                varg = self.h2raw * self.sigma2
                vare = (1. - self.h2raw) * self.sigma2
                Ainv = LA.inv((1. / vare) * np.dot(self.G.T, self.G) +
                              (1. / varg) * np.eye(self.G.shape[1]))
                testAinv = np.dot(Gstar.test.val, Ainv)
                # Mean = fixed-effect prediction plus a term built from the training residuals (y - X beta).
                pheno_predicted = np.dot(X.val, self.beta) + (
                    1. / vare) * np.dot(np.dot(testAinv, self.G.T),
                                        self.y - np.dot(self.X, self.beta))
                pheno_predicted = pheno_predicted.reshape(-1, 1)
                covar = np.dot(
                    testAinv,
                    Gstar.test.val.T) + vare * np.eye(Gstar.test.val.shape[0])

            else:
                # Full-rank case: rebuild an LMM from the state saved by fit() and let it predict.
                lmm = LMM()
                lmm.U = self.U
                lmm.S = self.S
                lmm.G = self.G
                lmm.y = self.y
                lmm.Uy = self.Uy
                lmm.X = self.X
                lmm.UX = self.UX

                Kstar = self.mixer.k_mix(
                    K0_train_test, K1_train_test
                )  #!!!later do we need/want reads here? how about view_OK?
                lmm.setTestData(Xstar=X.val, K0star=Kstar.val.T)

                Kstar_star = self.mixer.k_mix(
                    K0_test_test, K1_test_test
                )  #!!!later do we need/want reads here?how about view_OK?
                pheno_predicted, covar = lmm.predict_mean_and_variance(
                    beta=self.beta,
                    h2=self.h2raw,
                    sigma2=self.sigma2,
                    Kstar_star=Kstar_star.val)

            #pheno_predicted = lmm.predictMean(beta=self.beta, h2=self.h2,scale=self.sigma2).reshape(-1,1)
            # Package the predicted means under the training phenotype's sid; the position is filled with NaNs.
            ret0 = SnpData(iid=X.iid,
                           sid=self.pheno_sid,
                           val=pheno_predicted,
                           pos=np.array([[np.nan, np.nan, np.nan]]),
                           name="lmm Prediction")

            # Package the predictive covariance over the test iids.
            from pysnptools.kernelreader import KernelData
            ret1 = KernelData(iid=K0_test_test.iid, val=covar)
            return ret0, ret1
    def test_old(self):
        """Exercise ``single_snp`` with a G1 chosen by in-sample feature selection.

        The synthetic bed, phenotype and covariate files are opened and
        intersected on sample id. SNPs on chromosome 5 become the test SNPs and
        all remaining SNPs become G0. ``FeatureSelectionInSample`` is run on G0
        to pick the columns used for G1, after which ``single_snp`` is called
        with ``leave_out_one_chrom=False`` and its result is passed to
        ``self.compare_files`` under the name "old".
        """
        from fastlmm.feature_selection.feature_selection_two_kernel import FeatureSelectionInSample
        from pysnptools.util import intersect_apply

        do_plot = False

        logging.info("TestSingleSnpAllPlusSelect test_old")

        base_dir = self.pythonpath
        bed_path = base_dir + "/tests/datasets/synth/all.bed"
        pheno_path = base_dir + "/tests/datasets/synth/pheno_10_causals.txt"
        cov_path = base_dir + "/tests/datasets/synth/cov.txt"

        # Build the readers and restrict them to the sample ids they share.
        readers = [Bed(bed_path), Pheno(pheno_path), Pheno(cov_path)]
        snp_reader, pheno, cov = intersect_apply(readers)

        # chr5 holds the SNPs under test; every other chromosome contributes to G0.
        test_chr = 5
        chrom_column = snp_reader.pos[:, 0]
        G0 = snp_reader[:, chrom_column != test_chr].read(order='C').standardize()
        test_snps = snp_reader[:, chrom_column == test_chr].read(order='C').standardize()

        # Centre and scale the single phenotype column in place.
        target = pheno.read().val[:, 0]
        target -= target.mean()
        target /= target.std()

        # Covariates are frozen so later code cannot write into them.
        covariates = cov.read().val
        covariates.flags.writeable = False

        # Let feature selection decide which G0 columns go into G1.
        logging.info("running feature selection conditioned on background kernel")
        picker = FeatureSelectionInSample(
            max_log_k=7,
            n_folds=7,
            order_by_lmm=True,
            measure="ll",
            random_state=42,
        )
        best_k, feat_idx, best_mix, best_delta = picker.run_select(
            G0.val, G0.val, target, cov=covariates)

        # Plot of the out-of-sample error, disabled by default.
        if do_plot:
            picker.plot_results(measure="ll")

        logging.info("best_k:{0}".format(best_k))
        logging.info("best_mix:{0}".format(best_mix))
        logging.info("best_delta:{0}".format(best_delta))

        # The selected columns of G0 form G1.
        logging.info(feat_idx)
        G1 = G0[:, feat_idx]

        result_path = self.file_name("old")
        results_df = single_snp(
            test_snps,
            pheno,
            G0=G0,
            G1=G1,
            mixing=best_mix,
            h2=None,
            leave_out_one_chrom=False,
            output_file_name=result_path,
        )

        logging.info("results:")
        logging.info("#" * 40)
        logging.info(results_df.head())
        self.compare_files(results_df, "old")
Exemplo n.º 41
0
def single_snp(test_snps, pheno, K0=None,
                 K1=None, mixing=None,
                 covar=None, covar_by_chrom=None, leave_out_one_chrom=True, output_file_name=None, h2=None, log_delta=None,
                 cache_file = None, GB_goal=None, interact_with_snp=None, force_full_rank=False, force_low_rank=False, G0=None, G1=None, runner=None,
                 count_A1=None):
    """
    Function performing single SNP GWAS using cross validation over the chromosomes and REML. Will reorder and intersect IIDs as needed.
    (For backwards compatibility, you may use 'leave_out_one_chrom=False' to skip cross validation, but that is not recommended.)

    :param test_snps: SNPs to test. Can be any `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_. 
           If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type test_snps: a `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_ or a string

    :param pheno: A single phenotype: Can be any `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_, for example,
           `Pheno <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-pheno>`_ or `SnpData <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpdata>`_.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           Any IIDs with missing values will be removed.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type pheno: a `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_ or a string

    :param K0: SNPs from which to create a similarity matrix. If not given, will use test_snps.
           Can be any `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_. 
           If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (When leave_out_one_chrom is False, can be a `KernelReader <http://fastlmm.github.io.github.io/PySnpTools/#kernelreader-kernelreader>`_
           or a `KernelNpz <http://fastlmm.github.io.github.io/PySnpTools/#kernelreader-kernelnpz>`_-formated file name.)
    :type K0: `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_ or a string
           (or `KernelReader <http://fastlmm.github.io.github.io/PySnpTools/#kernelreader-kernelreader>`_)

    :param K1: SNPs from which to create a second similarity matrix, optional. (Also, see 'mixing').
           Can be any `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_.
           If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (When leave_out_one_chrom is False, can be a `KernelReader <http://fastlmm.github.io.github.io/PySnpTools/#kernelreader-kernelreader>`_
           or a `KernelNpz <http://fastlmm.github.io.github.io/PySnpTools/#kernelreader-kernelnpz>`_-formated file name.)
    :type K1: `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_ or a string
           (or `KernelReader <http://fastlmm.github.io.github.io/PySnpTools/#kernelreader-kernelreader>`_)

    :param mixing: Weight between 0.0 (inclusive, default) and 1.0 (inclusive) given to K1 relative to K0.
            If you give no mixing number and a K1 is given, the best weight will be learned.
    :type mixing: number

    :param covar: covariate information, optional: Can be any `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_, for example, `Pheno <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-pheno>`_ or `SnpData <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpdata>`_.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type covar: a `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_ or a string

    :param leave_out_one_chrom: Perform single SNP GWAS via cross validation over the chromosomes. Default to True.
           (Warning: setting False can cause proximal contamination.)
    :type leave_out_one_chrom: boolean    

    :param output_file_name: Name of file to write results to, optional. If not given, no output file will be created. The output format is tab-delimited text.
    :type output_file_name: file name

    :param h2: A parameter to LMM learning, optional
            If not given will search for best value.
            If mixing is unspecified, then h2 must also be unspecified.
    :type h2: number

    :param log_delta: a re-parameterization of h2 provided for backwards compatibility. h2 is 1./(exp(log_delta)+1)
    :type log_delta: number

    :param cache_file: Name of  file to read or write cached precomputation values to, optional.
                If not given, no cache file will be used.
                If given and file does not exist, will write precomputation values to file.
                If given and file does exist, will read precomputation values from file.
                The file contains the U and S matrix from the decomposition of the training matrix. It is in Python's np.savez (\*.npz) format.
                Calls using the same cache file should have the same 'K0' and 'K1'
                If given and the file does exist then K0 and K1 need not be given.
    :type cache_file: file name

    :param GB_goal: gigabytes of memory the run should use, optional. If not given, will read the test_snps in blocks the same size as the kernel,
        which is memory efficient with little overhead on computation time.
    :type GB_goal: number

    :param interact_with_snp: index of a covariate to perform an interaction test with. 
            Allows for interaction testing (interact_with_snp x snp will be tested)
            default: None

    :param force_full_rank: Even if kernels are defined with fewer SNPs than IIDs, create an explicit iid_count x iid_count kernel. Cannot be True if force_low_rank is True.
    :type force_full_rank: Boolean

    :param force_low_rank: Even if kernels are defined with fewer IIDs than SNPs, create a low-rank iid_count x sid_count kernel. Cannot be True if force_full_rank is True.
    :type force_low_rank: Boolean

    :param G0: Same as K0. Provided for backwards compatibility. Cannot be given if K0 is given.
    :type G0: `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_ or a string (or `KernelReader <http://fastlmm.github.io.github.io/PySnpTools/#kernelreader-kernelreader>`_)

    :param G1: Same as K1. Provided for backwards compatibility. Cannot be given if K1 is given.
    :type G1: `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_ or a string (or `KernelReader <http://fastlmm.github.io.github.io/PySnpTools/#kernelreader-kernelreader>`_)

    :param runner: a `Runner <http://fastlmm.github.io.github.io/PySnpTools/#util-mapreduce1-runner-runner>`_, optional: Tells how to run locally, multi-processor, or on a cluster.
        If not given, the function is run locally.
    :type runner: `Runner <http://fastlmm.github.io.github.io/PySnpTools/#util-mapreduce1-runner-runner>`_

    :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
         alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
    :type count_A1: bool


    :rtype: Pandas dataframe with one row per test SNP. Columns include "PValue"



    :Example:

    >>> import logging
    >>> from fastlmm.association import single_snp
    >>> from pysnptools.snpreader import Bed
    >>> logging.basicConfig(level=logging.INFO)
    >>> pheno_fn = "../feature_selection/examples/toydata.phe"
    >>> results_dataframe = single_snp(test_snps="../feature_selection/examples/toydata.5chrom", pheno=pheno_fn, count_A1=False)
    >>> print results_dataframe.iloc[0].SNP,round(results_dataframe.iloc[0].PValue,7),len(results_dataframe)
    null_576 1e-07 10000


    """
    t0 = time.time()
    if force_full_rank and force_low_rank:
        raise Exception("Can't force both full rank and low rank")

    assert test_snps is not None, "test_snps must be given as input"
    test_snps = _snps_fixup(test_snps, count_A1=count_A1)
    pheno = _pheno_fixup(pheno, count_A1=count_A1).read()
    assert pheno.sid_count == 1, "Expect pheno to be just one variable"
    pheno = pheno[(pheno.val==pheno.val)[:,0],:]
    covar = _pheno_fixup(covar, iid_if_none=pheno.iid, count_A1=count_A1)

    if not leave_out_one_chrom:
        assert covar_by_chrom is None, "When 'leave_out_one_chrom' is False, 'covar_by_chrom' must be None"
        K0 = _kernel_fixup(K0 or G0 or test_snps, iid_if_none=test_snps.iid, standardizer=Unit(),count_A1=count_A1)
        K1 = _kernel_fixup(K1 or G1, iid_if_none=test_snps.iid, standardizer=Unit(),count_A1=count_A1)
        K0, K1, test_snps, pheno, covar  = pstutil.intersect_apply([K0, K1, test_snps, pheno, covar])
        logging.debug("# of iids now {0}".format(K0.iid_count))
        K0, K1, block_size = _set_block_size(K0, K1, mixing, GB_goal, force_full_rank, force_low_rank)

        frame =  _internal_single(K0=K0, test_snps=test_snps, pheno=pheno,
                                    covar=covar, K1=K1,
                                    mixing=mixing, h2=h2, log_delta=log_delta,
                                    cache_file = cache_file, force_full_rank=force_full_rank,force_low_rank=force_low_rank,
                                    output_file_name=output_file_name,block_size=block_size, interact_with_snp=interact_with_snp,
                                    runner=runner)
        sid_index_range = IntRangeSet(frame['sid_index'])
        assert sid_index_range == (0,test_snps.sid_count), "Some SNP rows are missing from the output"
    else: 
        chrom_list = list(set(test_snps.pos[:,0])) # find the set of all chroms mentioned in test_snps, the main testing data
        assert not np.isnan(chrom_list).any(), "chrom list should not contain NaN"
        input_files = [test_snps, pheno, K0, G0, K1, G1, covar] + ([] if covar_by_chrom is None else covar_by_chrom.values())

        def nested_closure(chrom):
            test_snps_chrom = test_snps[:,test_snps.pos[:,0]==chrom]
            covar_chrom = _create_covar_chrom(covar, covar_by_chrom, chrom)
            cache_file_chrom = None if cache_file is None else cache_file + ".{0}".format(chrom)

            K0_chrom = _K_per_chrom(K0 or G0 or test_snps, chrom, test_snps.iid)
            K1_chrom = _K_per_chrom(K1 or G1, chrom, test_snps.iid)

            K0_chrom, K1_chrom, test_snps_chrom, pheno_chrom, covar_chrom  = pstutil.intersect_apply([K0_chrom, K1_chrom, test_snps_chrom, pheno, covar_chrom])
            logging.debug("# of iids now {0}".format(K0_chrom.iid_count))
            K0_chrom, K1_chrom, block_size = _set_block_size(K0_chrom, K1_chrom, mixing, GB_goal, force_full_rank, force_low_rank)

            distributable = _internal_single(K0=K0_chrom, test_snps=test_snps_chrom, pheno=pheno_chrom,
                                        covar=covar_chrom, K1=K1_chrom,
                                        mixing=mixing, h2=h2, log_delta=log_delta, cache_file=cache_file_chrom,
                                        force_full_rank=force_full_rank,force_low_rank=force_low_rank,
                                        output_file_name=None, block_size=block_size, interact_with_snp=interact_with_snp,
                                        runner=Local())
            
            return distributable

        def reducer_closure(frame_sequence):
            frame = pd.concat(frame_sequence)
            frame.sort_values(by="PValue", inplace=True)
            frame.index = np.arange(len(frame))
            if output_file_name is not None:
                frame.to_csv(output_file_name, sep="\t", index=False)
            logging.info("PhenotypeName\t{0}".format(pheno.sid[0]))
            logging.info("SampleSize\t{0}".format(test_snps.iid_count))
            logging.info("SNPCount\t{0}".format(test_snps.sid_count))
            logging.info("Runtime\t{0}".format(time.time()-t0))

            return frame

        frame = map_reduce(chrom_list,
                   mapper = nested_closure,
                   reducer = reducer_closure,
                   input_files = input_files,
                   output_files = [output_file_name],
                   name = "single_snp (leave_out_one_chrom), out='{0}'".format(output_file_name),
                   runner = runner)

    return frame
    assert pheno.sid_count == 1, "Expect only one pheno in work_item"
    pheno = pheno.read()
    pheno = pheno[pheno.val[:, 0] == pheno.
                  val[:, 0], :]  #Excludes NaN because NaN is not equal to NaN

    #########################################
    # Environment: Turn spatial info info a KernelData
    #########################################
    spatial_val = spatial_similarity(spatial_coor, alpha, power=alpha_power)
    E_kernel = KernelData(iid=spatial_iid, val=spatial_val)

    #########################################
    # Intersect, apply the jackknife or permutation, and then (because we now know the iids) standardize appropriately
    #########################################
    from pysnptools.util import intersect_apply
    G_kernel, E_kernel, pheno = intersect_apply([G_kernel, E_kernel, pheno])

    if jackknife_index >= 0:
        assert jackknife_count <= G_kernel.iid_count, "expect the number of groups to be less than the number of iids"
        assert jackknife_index < jackknife_count, "expect the jackknife index to be less than the count"
        m_fold = model_selection.KFold(n_splits=jackknife_count,
                                       shuffle=True,
                                       random_state=jackknife_seed %
                                       4294967295).split(
                                           range(G_kernel.iid_count))
        iid_index, _ = _nth(m_fold, jackknife_index)
        pheno = pheno[iid_index, :]
        G_kernel = G_kernel[iid_index]
        E_kernel = E_kernel[iid_index]

    if permute_plus_index >= 0:
Exemplo n.º 43
0
    def fit(self,
            X=None,
            y=None,
            K0_train=None,
            K1_train=None,
            h2=None,
            mixing=None,
            count_A1=None):
        """
        Method for training a linear regression predictor (the signature mirrors :class:`FastLMM`'s ``fit``).
        If the examples in X and y are not the same, they will be reordered and intersected.

        :param X: training covariate information, optional: 
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type X: a PySnpTools `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ (such as `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__) or string.

        :param y: training phenotype:
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type y: a PySnpTools `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ (such as `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__) or string.

        :param K0_train: Must be None. Represents the identity similarity matrix.
        :type K0_train: None

        :param K1_train: Must be None. Represents the identity similarity matrix.
        :type K1_train: None

        :param h2: Ignored. Optional.
        :type h2: number

        :param mixing: Ignored. Optional.
        :type mixing: number

        :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
             alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
        :type count_A1: bool


        :rtype: self, the fitted Linear Regression predictor

        :raises AssertionError: if ``K0_train`` or ``K1_train`` is not None, if ``y`` is None,
            or if ``y`` contains more than one phenotype column.
        """
        # The environment is patched for the duration of the fit; the value selects 'numpy' for ARRAY_MODULE.
        with patch.dict('os.environ', {'ARRAY_MODULE': 'numpy'}) as _:

            assert K0_train is None  # could also accept that ID or no snps
            assert K1_train is None  # could also accept that ID or no snps

            assert y is not None, "y must be given"

            y = _pheno_fixup(y, count_A1=count_A1)
            assert y.sid_count == 1, "Expect y to be just one variable"
            X = _pheno_fixup(X, iid_if_none=y.iid, count_A1=count_A1)

            # Put covariates and phenotype on the same individuals, in the same order.
            X, y = intersect_apply([X, y])
            y = y.read()
            X, covar_unit_trained = X.read().standardize(
                self.covariate_standardizer, return_trained=True)

            # add a column of 1's to cov to increase DOF of model (and accuracy) by allowing a constant offset
            X = SnpData(iid=X.iid,
                        sid=FastLMM._new_snp_name(X),
                        val=np.c_[X.val, np.ones((X.iid_count, 1))])

            y_val = y.val[:, 0]
            # lstsq returns (weights, squared residuals, rank of design matrix, singular values)
            beta, residuals, _rank, _singular_values = np.linalg.lstsq(
                X.val, y_val, rcond=-1)

            if np.size(residuals) == 1:
                ssres = float(np.ravel(residuals)[0])
            else:
                # lstsq reports an empty residual array when the design matrix is rank
                # deficient or the system is not over-determined; compute the sum of
                # squared residuals explicitly instead of failing on float(empty array).
                ssres = float(((y_val - X.val.dot(beta))**2).sum())

            self.beta = beta
            self.ssres = ssres
            self.sstot = ((y.val - y.val.mean())**2).sum()
            self.covar_unit_trained = covar_unit_trained
            self.iid_count = X.iid_count
            self.covar_sid = X.sid
            self.pheno_sid = y.sid
            # Only mark the predictor as fitted once every step above has succeeded.
            self.is_fitted = True
            return self
Exemplo n.º 44
0
def single_snp_all_plus_select(
        test_snps,
        pheno,
        G=None,
        covar=None,
        k_list=None,
        n_folds=10,  #1 is special and means test on train
        seed=0,
        output_file_name=None,
        GB_goal=None,
        force_full_rank=False,
        force_low_rank=False,
        mixing=None,
        h2=None,
        do_plot=False,
        runner=None,
        count_A1=None):
    """
    Function performing single SNP GWAS based on two kernels. The first kernel is based on all SNPs. The second kernel is a similarity matrix
    constructed of the top *k* SNPs where the SNPs are ordered via the PValue from :meth:`.single_snp` and *k* is determined via out-of-sample prediction.
    All work is done via 'leave_out_one_chrom', that one chromosome is tested and the kernels are constructed from the other chromosomes.
    Will reorder and intersect IIDs as needed.

    :param test_snps: SNPs to test. Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type test_snps: a `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_ or a string

    :param pheno: A single phenotype: Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_, for example, `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`_ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`_.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           Any IIDs with missing values will be removed.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type pheno: a `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_ or a string

    :param G: SNPs from which to create a similarity matrix of the top *k* SNPs. If not given, will use test_snps.
           Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
    :type G: `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_ or a string

    :param covar: covariate information, optional: Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_, for example, `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`_ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`_.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type covar: a `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_ or a string

    :param k_list: Values of *k* (in addition to 0) to test. Default to [1,2,4,8,...8192].
    :type k_list: list of numbers

    :param n_folds: Number of folds of cross validation to use for out-of-sample evaluation of various values of *k*. Default to 10.
           The value 1 is special and means test on train.
    :type n_folds: number
    
    :param seed: (optional) Random seed handed to the k-fold splitter that creates the cross-validation folds.
    :type seed: number

    :param output_file_name: Name of file to write results to, optional. If not given, no output file will be created.
           When given, the final results are written as a tab-separated file.
    :type output_file_name: file name

    :param GB_goal: gigabytes of memory the run should use, optional. If not given, will read the test_snps in blocks the same size as the kernel,
        which is memory efficient with little overhead on computation time.
    :type GB_goal: number

    :param force_full_rank: Even if kernels are defined with fewer SNPs than IIDs, create an explicit iid_count x iid_count kernel. Cannot be True if force_low_rank is True.
    :type force_full_rank: Boolean

    :param force_low_rank: Even if kernels are defined with fewer IIDs than SNPs, create a low-rank iid_count x sid_count kernel. Cannot be True if force_full_rank is True.
    :type force_low_rank: Boolean

    :param mixing: A parameter to LMM learning telling how to combine the two kernels, optional
            If not given will search for best value.
    :type mixing: number

    :param h2: A parameter to LMM learning that tells how much weight to give the K's vs. the identity matrix, optional
            If not given will search for best value.
    :type h2: number

    :param do_plot: If true, will plot, for each chrom, the negative loglikelihood vs k.
    :type do_plot: boolean


    :param runner: a `Runner <http://fastlmm.github.io/PySnpTools/#util-mapreduce1-runner-runner>`_, optional: Tells how to run locally, multi-processor, or on a cluster.
        If not given, the function is run locally.
    :type runner: `Runner <http://fastlmm.github.io/PySnpTools/#util-mapreduce1-runner-runner>`_

    :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
         alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
    :type count_A1: bool


    :rtype: Pandas dataframe with one row per test SNP, sorted by "PValue". Columns include "PValue"

    :raises Exception: if both ``force_full_rank`` and ``force_low_rank`` are true.


    :Example:

    >>> import logging
    >>> import numpy as np
    >>> from fastlmm.association import single_snp_all_plus_select
    >>> from pysnptools.snpreader import Bed
    >>> from pysnptools.util.mapreduce1.runner import LocalMultiProc
    >>> logging.basicConfig(level=logging.INFO)
    >>> pheno_fn = "../feature_selection/examples/toydata.phe"
    >>> snps = Bed("../feature_selection/examples/toydata.5chrom.bed",count_A1=False)[:,::100] #To make example faster, run on only 1/100th of the data
    >>> chrom5_snps = snps[:,snps.pos[:,0]==5] # Test on only chrom5
    >>> results_dataframe = single_snp_all_plus_select(test_snps=chrom5_snps,G=snps,pheno=pheno_fn,GB_goal=2,runner=LocalMultiProc(20,mkl_num_threads=5), count_A1=False) #Run multiproc
    >>> print results_dataframe.iloc[0].SNP,round(results_dataframe.iloc[0].PValue,7),len(results_dataframe)
    null_9800 0.0793385 4

    """

    #=================================================
    # Start of definition of inner functions
    #=================================================
    def _best_snps_for_each_chrom(chrom_list, input_files, runner, G, n_folds,
                                  seed, pheno, covar, force_full_rank,
                                  force_low_rank, mixing, h2, k_list, GB_goal):
        # Phase 1: for every chromosome in chrom_list, choose (by cross validation) which
        # top-ranked SNPs should make up the second kernel. Returns whatever map_reduce
        # produces for chrom_list; the caller indexes it by position in chrom_list.
        #logging.info("Doing GWAS_1K for each chrom and fold. Work_count={0}".format(len(chrom_list)*(n_folds+1)))

        max_k = int(max(k_list))
        # The caller has already intersected G, pheno and covar, so their iids must line up.
        assert np.array_equal(G.iid, pheno.iid) and np.array_equal(
            G.iid, covar.iid), "real assert"

        def mapper_find_best_given_chrom(test_chr):
            # NOTE(review): presumably _K_per_chrom drops the SNPs of test_chr from G
            # (leave-out-one-chrom); confirm against its definition.
            G_for_chrom = _K_per_chrom(G, test_chr, G.iid).snpreader

            def mapper_gather_lots(i_fold_and_pair):
                # One work item per fold. Returns (k_list_in, top_snps, k_index_to_nLL);
                # each element may be None when this fold does not contribute it.
                i_fold, (train_idx, test_idx) = i_fold_and_pair
                logging.info(
                    "Working on GWAS_1K and k search, chrom={0}, i_fold={1}".
                    format(test_chr, i_fold))

                G_train = G_for_chrom[train_idx, :]

                #Precompute whole x whole standardized on train
                from fastlmm.association.single_snp import _internal_determine_block_size, _block_size_from_GB_goal
                min_count = _internal_determine_block_size(
                    G_for_chrom, None, None, force_full_rank, force_low_rank)
                block_size = _block_size_from_GB_goal(GB_goal,
                                                      G_for_chrom.iid_count,
                                                      min_count)
                K_whole_unittrain = _SnpWholeWithTrain(
                    whole=G_for_chrom,
                    train_idx=train_idx,
                    standardizer=Unit(),
                    block_size=block_size).read()

                assert np.array_equal(K_whole_unittrain.iid,
                                      G_for_chrom.iid), "real assert"
                K_train = K_whole_unittrain[train_idx]

                # One-kernel GWAS on the training individuals; its result is used below
                # (via the SNP column, in result order) to pick the top-k SNPs.
                single_snp_result = single_snp(
                    test_snps=G_train,
                    K0=K_train,
                    pheno=
                    pheno,  #iid intersection means we can give the whole covariate and pheno
                    covar=covar,
                    leave_out_one_chrom=False,
                    GB_goal=GB_goal,
                    force_full_rank=force_full_rank,
                    force_low_rank=force_low_rank,
                    mixing=mixing,
                    h2=h2,
                    count_A1=count_A1)

                # The work item with index n_folds is treated as the "all" item; with
                # n_folds == 1 every item counts as "all" (test on train).
                is_all = (i_fold == n_folds) if n_folds > 1 else True

                # Candidate k values: always 0, plus the requested values that are
                # positive and smaller than the number of GWAS result rows.
                k_list_in = [0] + [
                    int(k)
                    for k in k_list if 0 < k and k < len(single_snp_result)
                ]

                if is_all:
                    top_snps = list(single_snp_result.SNP[:max_k])
                else:
                    top_snps = None

                if i_fold == n_folds:
                    # The "all" item is not scored out-of-sample.
                    k_index_to_nLL = None
                else:
                    k_index_to_nLL = []
                    for k in k_list_in:
                        top_k = G_for_chrom[:,
                                            G_for_chrom.sid_to_index(
                                                single_snp_result.SNP[:k])]
                        logging.info(
                            "Working on chr={0}, i_fold={1}, and K_{2}".format(
                                test_chr, i_fold, k))

                        # k == 0 means "no second kernel".
                        top_k_train = top_k[train_idx, :] if k > 0 else None
                        fastlmm = FastLMM(force_full_rank=force_full_rank,
                                          force_low_rank=force_low_rank,
                                          GB_goal=GB_goal)
                        fastlmm.fit(
                            K0_train=K_train,
                            K1_train=top_k_train,
                            X=covar,
                            y=pheno,
                            mixing=mixing,
                            h2raw=h2
                        )  #iid intersection means we can give the whole covariate and pheno

                        top_k_test = top_k[test_idx, :] if k > 0 else None
                        K0_whole_test = K_whole_unittrain[:, test_idx]
                        nLL = fastlmm.score(
                            K0_whole_test=K0_whole_test,
                            K1_whole_test=top_k_test,
                            X=covar,
                            y=pheno
                        )  #iid intersection means we can give the whole covariate and pheno
                        k_index_to_nLL.append(nLL)

                # Only fold 0 reports the list of k values; the reducer asserts it sees it once.
                if i_fold > 0:
                    k_list_in = None

                return k_list_in, top_snps, k_index_to_nLL

            def reducer_find_best(top_snps_and_k_index_to_nLL_sequence):
                #Starts fold_index+all -> k_index -> nll
                #Need:  k_index -> sum(fold_index -> nll)

                k_index_to_sum_nll = None
                top_snps_all = None
                k_list_in_all = None
                for i_fold, (k_list_in, top_snps, k_index_to_nLL) in enumerate(
                        top_snps_and_k_index_to_nLL_sequence):
                    if k_list_in is not None:
                        assert k_list_in_all is None, "real assert"
                        k_list_in_all = k_list_in
                        k_index_to_sum_nll = np.zeros(len(k_list_in))

                    if top_snps is not None:
                        assert top_snps_all is None, "real assert"
                        top_snps_all = top_snps

                    if k_index_to_nLL is not None:
                        assert i_fold < n_folds or n_folds == 1, "real assert"
                        # Accumulate the score of each k over the folds.
                        for k_index, nLL in enumerate(k_index_to_nLL):
                            k_index_to_sum_nll[k_index] += nLL

                #find best # top_snps (the k with the smallest summed score)
                best_k = k_list_in_all[np.argmin(k_index_to_sum_nll)]
                logging.info("For chrom={0}, best_k={1}".format(
                    test_chr, best_k))
                if do_plot: _nll_plot(k_list_in_all, k_index_to_sum_nll)

                #Return the top snps from all
                result = top_snps_all[:best_k]
                return result

            # Nested map_reduce over the folds (plus the trailing "all" item requested
            # with end_with_all=True) for this one chromosome.
            i_fold_index_to_top_snps_and_k_index_to_nLL = map_reduce(
                _kfold(G_for_chrom.iid_count, n_folds, seed,
                       end_with_all=True),
                mapper=mapper_gather_lots,
                reducer=reducer_find_best)
            return i_fold_index_to_top_snps_and_k_index_to_nLL

        chrom_index_to_best_sid = map_reduce(
            chrom_list,
            nested=mapper_find_best_given_chrom,
            input_files=input_files,
            name="best snps for each chrom",
            runner=runner)
        return chrom_index_to_best_sid

    def _gwas_2k_via_loo_chrom(test_snps, chrom_list, input_files, runner, G,
                               chrom_index_to_best_sid, pheno, covar,
                               force_full_rank, force_low_rank, mixing, h2,
                               output_file_name, GB_goal):
        # Phase 2: for every chromosome, run single_snp with two kernels (K0 from
        # G_for_chrom, K1 from the SNPs selected in phase 1) and merge the per-chromosome
        # results into one dataframe.
        logging.info("Doing GWAS_2K for each chrom. Work_count={0}".format(
            len(chrom_list)))

        def mapper_single_snp_2K_given_chrom(test_chr):
            logging.info("Working on chr={0}".format(test_chr))
            # Test only the SNPs located on this chromosome.
            test_snps_chrom = test_snps[:, test_snps.pos[:, 0] == test_chr]
            G_for_chrom = _K_per_chrom(G, test_chr, G.iid).snpreader
            # Phase-1 results are looked up by the chromosome's position in chrom_list.
            chrom_index = chrom_list.index(test_chr)
            best_sid = chrom_index_to_best_sid[chrom_index]

            K1 = G_for_chrom[:, G_for_chrom.sid_to_index(best_sid)]
            result = single_snp(test_snps=test_snps_chrom,
                                K0=G_for_chrom,
                                K1=K1,
                                pheno=pheno,
                                covar=covar,
                                leave_out_one_chrom=False,
                                GB_goal=GB_goal,
                                force_full_rank=force_full_rank,
                                force_low_rank=force_low_rank,
                                mixing=mixing,
                                h2=h2,
                                count_A1=count_A1)
            return result

        def reducer_closure(
                frame_sequence):  #!!!very similar code in single_snp
            # Concatenate the per-chromosome frames, sort by p-value, renumber the rows,
            # optionally write a tab-separated file, and log summary statistics.
            frame = pd.concat(frame_sequence)
            frame.sort_values(by="PValue", inplace=True)
            frame.index = np.arange(len(frame))
            if output_file_name is not None:
                frame.to_csv(output_file_name, sep="\t", index=False)
            logging.info("PhenotypeName\t{0}".format(pheno.sid[0]))
            logging.info("SampleSize\t{0}".format(G.iid_count))
            logging.info("SNPCount\t{0}".format(G.sid_count))

            return frame

        frame = map_reduce(chrom_list,
                           mapper=mapper_single_snp_2K_given_chrom,
                           reducer=reducer_closure,
                           input_files=input_files,
                           name="single_snp with two K's for all chroms",
                           runner=runner)
        return frame

    #=================================================
    # End of definition of inner functions
    #=================================================

    #!!!code similar to single_snp
    if force_full_rank and force_low_rank:
        raise Exception("Can't force both full rank and low rank")
    if k_list is None:
        # Powers of two from 2**0 through 2**13, i.e. 1, 2, 4, ..., 8192.
        k_list = np.logspace(start=0, stop=13, num=14, base=2)

    assert test_snps is not None, "test_snps must be given as input"
    test_snps = _snps_fixup(test_snps, count_A1=count_A1)
    G = _snps_fixup(G or test_snps, count_A1=count_A1)
    pheno = _pheno_fixup(pheno, count_A1=count_A1).read()
    assert pheno.sid_count == 1, "Expect pheno to be just one variable"
    # Drop individuals whose phenotype is missing: NaN is the only value not equal to itself.
    pheno = pheno[(pheno.val == pheno.val)[:, 0], :]
    covar = _pheno_fixup(covar, iid_if_none=pheno.iid, count_A1=count_A1)
    chrom_list = list(
        set(test_snps.pos[:, 0])
    )  # find the set of all chroms mentioned in test_snps, the main testing data
    # Restrict all inputs to their common individuals, in a common order.
    G, test_snps, pheno, covar = pstutil.intersect_apply(
        [G, test_snps, pheno, covar])
    common_input_files = [test_snps, G, pheno, covar]

    # Phase 1: select the best top-k SNPs per chromosome.
    chrom_index_to_best_sid = _best_snps_for_each_chrom(
        chrom_list, common_input_files, runner, G, n_folds, seed, pheno, covar,
        force_full_rank, force_low_rank, mixing, h2, k_list, GB_goal)

    # Phase 2: two-kernel GWAS per chromosome using the selected SNPs.
    frame = _gwas_2k_via_loo_chrom(test_snps, chrom_list, common_input_files,
                                   runner, G, chrom_index_to_best_sid, pheno,
                                   covar, force_full_rank, force_low_rank,
                                   mixing, h2, output_file_name, GB_goal)

    return frame
Exemplo n.º 45
0
    def fit(self, X=None, y=None, K0_train=None, K1_train=None, h2=None, mixing=None):
        """
        Train this :class:`FastLMM` predictor. Inputs whose examples differ are intersected and put in a common order.

        :param X: training covariate information, optional: 
          A string is interpreted as the file name of a PLINK phenotype-formatted file.
        :type X: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

        :param y: training phenotype (exactly one variable):
          A string is interpreted as the file name of a PLINK phenotype-formatted file.
        :type y: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

        :param K0_train: A similarity matrix, or SNPs from which such a similarity matrix is constructed.
               Can be any :class:`.SnpReader`; a string can name a PLINK-formatted Bed file.
               Can be a PySnpTools :class:`.KernelReader`; a string can name a :class:`.KernelNpz` file.
        :type K0_train: :class:`.SnpReader` or a string or :class:`.KernelReader`

        :param K1_train: A second similarity matrix, or SNPs from which to construct one. (Also, see 'mixing').
               Can be any :class:`.SnpReader`; a string can name a PLINK-formatted Bed file.
               Can be a PySnpTools :class:`.KernelReader`; a string can name a :class:`.KernelNpz` file.
        :type K1_train: :class:`.SnpReader` or a string or :class:`.KernelReader`

        :param h2: A parameter to LMM learning that tells how much weight to give the K's vs. the identity matrix, optional
                If not given will search for best value.
                If mixing is unspecified, then h2 must also be unspecified.
        :type h2: number

        :param mixing: Weight between 0.0 (inclusive, default) and 1.0 (inclusive) given to K1_train relative to K0_train.
                If you give no mixing number and a K1_train is given, the best weight will be learned.
        :type mixing: number


        :rtype: self, the fitted FastLMM predictor
        """
        self.is_fitted = True
        # Open questions carried over from the original implementation:
        #   - should this have a cache file like 'single_snp'?
        #   - what happens if there are missing values in the training phenotype?
        #   - X, y, etc. could later be allowed to be array-like objects without iid information (iids would be made up).

        assert y is not None, "y must be given"

        pheno = _pheno_fixup(y)
        assert pheno.sid_count == 1, "Expect y to be just one variable"
        covar = _pheno_fixup(X, iid_if_none=pheno.iid)

        kernel0 = _kernel_fixup(K0_train, iid_if_none=pheno.iid, standardizer=self.snp_standardizer)
        kernel1 = _kernel_fixup(K1_train, iid_if_none=pheno.iid, standardizer=self.snp_standardizer)

        # Align all four inputs on their common individuals. (Not yet tested with both K's as None.)
        kernel0, kernel1, covar, pheno = intersect_apply(
            [kernel0, kernel1, covar, pheno],
            intersect_before_standardize=True)
        from fastlmm.association.single_snp import _set_block_size
        kernel0, kernel1, block_size = _set_block_size(
            kernel0, kernel1, mixing,
            self.GB_goal, self.force_full_rank, self.force_low_rank)

        # Standardize the covariates and keep the trained standardizer so the same
        # transformation can be applied to test data later. This also fills missing with the mean.
        covar_data, covar_unit_trained = covar.read().standardize(
            self.covariate_standardizer,
            block_size=block_size,
            return_trained=True)

        # Append a column of 1's so the model can learn a constant offset.
        design = SnpData(
            iid=covar_data.iid,
            sid=self._new_snp_name(covar_data),
            val=np.c_[covar_data.val, np.ones((covar_data.iid_count, 1))],
            name="covariate_train w/ 1's")

        pheno_val = pheno.read().val

        from fastlmm.association.single_snp import _Mixer
        K_train, h2_fixed, mixer = _Mixer.combine_the_best_way(
            kernel0, kernel1, design.val, pheno_val, mixing, h2,
            force_full_rank=self.force_full_rank,
            force_low_rank=self.force_low_rank,
            kernel_standardizer=self.kernel_standardizer,
            block_size=block_size)

        # Final model fit via lmm.py
        lmm = LMM()
        if not mixer.do_g:
            lmm.setK(K0=K_train.val)
        else:
            # Special case: the kernel is defined implicitly by SNP data.
            assert isinstance(K_train.standardizer,StandardizerIdentity), "Expect Identity standardizer"
            lmm.setG(G0=K_train.snpreader.val)

        lmm.setX(design.val)
        lmm.sety(pheno_val[:, 0])

        # Search for the best h2 unless one has been fixed; the covariate weights come back in the result.
        res = lmm.findH2() if h2_fixed is None else lmm.nLLeval(h2=h2_fixed)

        # sigma2 is recomputed from the fixed-effect residuals because res['sigma2'] is only the pure noise.
        # (This is the non-REML estimate.)
        residual = np.dot(design.val, res['beta']).reshape(-1, 1) - pheno_val
        full_sigma2 = float(sum(residual**2)) / pheno.iid_count

        # Every piece of fitted-model state is recorded here, in one place.
        self.block_size = block_size
        self.beta = res['beta']
        self.h2 = res['h2']
        self.sigma2 = full_sigma2
        self.U = lmm.U
        self.S = lmm.S
        self.K = lmm.K
        self.G = lmm.G
        self.y = lmm.y
        self.Uy = lmm.Uy
        self.X = lmm.X
        self.UX = lmm.UX
        self.mixer = mixer
        self.covar_unit_trained = covar_unit_trained
        self.K_train_iid = K_train.iid
        self.covar_sid = design.sid
        self.pheno_sid = pheno.sid
        # Keep the SNP readers behind SNP-defined kernels (None for explicit kernels).
        if isinstance(kernel0, SnpKernel):
            self.G0_train = kernel0.snpreader
        else:
            self.G0_train = None
        if isinstance(kernel1, SnpKernel):
            self.G1_train = kernel1.snpreader
        else:
            self.G1_train = None
        return self
Exemplo n.º 46
0
# Write SnpData to an HDF5 file (snpdata1010 is defined outside this excerpt).
SnpHdf5.write("deleteme1010.snp.hdf5", snpdata1010)

#Summary: Every format has its own SnpReader class
#       Table: Pheno, SnpNpz, SnpHdf5
#   That SnpReader has a static write method for SnpData

#Topics: Intersecting iids
#What if we have two data sources with slightly different iids in different order?
snpreader = Bed("all.bed")
# Take every second individual, walking backwards from the last one.
phenoreader = Pheno("pheno_10_causals.txt")[::-2, :]
print snpreader.iid_count, phenoreader.iid_count, snpreader.iid, phenoreader.iid
#Create an intersecting and reordering reader with intersect_apply
import pysnptools.util as pstutil

snpreader_i, phenoreader_i = pstutil.intersect_apply([snpreader, phenoreader])
# After intersect_apply both readers list the same iids in the same order.
assert np.array_equal(snpreader_i.iid, phenoreader_i.iid)
snpdata_i = snpreader_i.read()
phenodata_i = phenoreader_i.read()

# Least-squares fit of the phenotype on the SNP values; [0] selects the weights.
bs = np.linalg.lstsq(snpdata_i.val, phenodata_i.val,
                     rcond=-1)[0]  #usually would add a 1's column
predicted = snpdata_i.val.dot(bs)
import matplotlib.pyplot as plt

# Plot the actual phenotype against the in-sample prediction.
plt.plot(phenodata_i.val, predicted, '.', markersize=10)
#plt.show() #Easy to 'predict' seen 250 cases with 5000 variables.
# How does it predict unseen cases?
# Starting from the second-to-last individual and stepping back by two selects the
# individuals that were skipped above.
phenoreader_unseen = Pheno("pheno_10_causals.txt")[-2::-2, :]
snpreader_u, phenoreader_u = pstutil.intersect_apply(
    [snpreader, phenoreader_unseen])
Exemplo n.º 47
0
def work_item2(pheno, G_kernel, spatial_coor, spatial_iid, alpha, alpha_power,
               xxx_todo_changeme, xxx_todo_changeme1, xxx_todo_changeme2,
               just_testing, do_uncorr, do_gxe2, a2):
    """
    Fit the G-only, G+E and (G+E)+GxE mixed models for a single phenotype.

    :param pheno: reader with exactly one phenotype column (asserted). Rows
        whose value is NaN are dropped before anything else happens.
    :param G_kernel: kernel reader used as the "G" similarity.
    :param spatial_coor: coordinates handed to ``spatial_similarity``, or a str
        naming a space-delimited file with the columns ``id``, ``south_new``
        and ``east_new`` (rows with missing values are dropped).
    :param spatial_iid: iids that go with ``spatial_coor``; must be None when
        ``spatial_coor`` is a str.
    :param alpha: forwarded to ``spatial_similarity`` and echoed in the result.
    :param alpha_power: forwarded to ``spatial_similarity`` as ``power`` and
        echoed in the result.
    :param xxx_todo_changeme: ``(jackknife_index, jackknife_count,
        jackknife_seed)``; a negative index turns jackknifing off.
    :param xxx_todo_changeme1: ``(permute_plus_index, permute_plus_count,
        permute_plus_seed)``; a negative index turns the E permutation off.
    :param xxx_todo_changeme2: ``(permute_times_index, permute_times_count,
        permute_times_seed)``; a negative index turns the GxE permutation off.
    :param just_testing: if true, no optimisation is run and placeholder
        values (0 or .5) are reported instead.
    :param do_uncorr: if false, the G-only fit is skipped and reported as NaN.
    :param do_gxe2: if false, the GxE fit is skipped and reported as NaN.
    :param a2: fixed mixing weight between G and E, or None to search for it.

    :rtype: dict with the fitted quantities plus an echo of the settings.
    """
    jackknife_index, jackknife_count, jackknife_seed = xxx_todo_changeme
    permute_plus_index, permute_plus_count, permute_plus_seed = xxx_todo_changeme1
    permute_times_index, permute_times_count, permute_times_seed = xxx_todo_changeme2

    # --- GPS info may arrive as a file name instead of as arrays ---
    if isinstance(spatial_coor, str):
        assert spatial_iid is None, "if spatial_coor is a str, then spatial_iid should be None"
        gps_table = pd.read_csv(spatial_coor, delimiter=" ").dropna()
        spatial_iid = np.array([(v, v) for v in gps_table["id"].values])
        spatial_coor = gps_table[["south_new", "east_new"]].values

    # --- Drop individuals with a missing phenotype ---
    assert pheno.sid_count == 1, "Expect only one pheno in work_item"
    pheno = pheno.read()
    first_column = pheno.val[:, 0]
    has_value = first_column == first_column  # NaN never equals itself
    pheno = pheno[has_value, :]

    # --- Environment: spatial similarity wrapped as a KernelData ---
    E_kernel = KernelData(iid=spatial_iid,
                          val=spatial_similarity(spatial_coor, alpha, power=alpha_power))

    # --- Align everything on the shared iids ---
    from pysnptools.util import intersect_apply
    G_kernel, E_kernel, pheno = intersect_apply([G_kernel, E_kernel, pheno])

    # --- Jackknife: keep only the training side of the requested fold ---
    if jackknife_index >= 0:
        assert jackknife_count <= G_kernel.iid_count, "expect the number of groups to be less than the number of iids"
        assert jackknife_index < jackknife_count, "expect the jackknife index to be less than the count"
        splitter = model_selection.KFold(n_splits=jackknife_count,
                                         shuffle=True,
                                         random_state=jackknife_seed % 4294967295)
        folds = splitter.split(list(range(G_kernel.iid_count)))
        keep_index, _ = _nth(folds, jackknife_index)
        pheno = pheno[keep_index, :]
        G_kernel = G_kernel[keep_index]
        E_kernel = E_kernel[keep_index]

    # --- "Plus" permutation: shuffle E's values but keep its iids, because
    #     shuffling both would cancel out. The index is folded into the seed. ---
    if permute_plus_index >= 0:
        np.random.seed((permute_plus_seed + permute_plus_index) % 4294967295)
        shuffled = np.arange(G_kernel.iid_count)
        np.random.shuffle(shuffled)
        permuted_E = E_kernel[shuffled].read()
        E_kernel = KernelData(iid=E_kernel.iid,
                              val=permuted_E.val,
                              name="permutation {0}".format(permute_plus_index))

    # Now that the final set of iids is known, standardize.
    pheno = pheno.read().standardize()  # defaults to Unit standardize
    G_kernel = G_kernel.read().standardize()  # defaults to DiagKtoN standardize
    E_kernel = E_kernel.read().standardize()  # defaults to DiagKtoN standardize

    def _bias_only_lmm(**kernel_args):
        # LMM over the given kernel(s) with a single all-ones covariate column.
        model = LMM()
        model.setK(**kernel_args)
        model.setX(np.ones([G_kernel.iid_count, 1]))
        model.sety(pheno.val[:, 0])
        return model

    # --- h2uncorr: best mixing of pure noise and G ---
    if do_uncorr:
        logging.info("Find best h2 for G_kernel")
        g_model = _bias_only_lmm(K0=G_kernel.val)
        if just_testing:
            h2uncorr, nLLuncorr = 0, 0
        else:
            g_fit = g_model.findH2()
            h2uncorr, nLLuncorr = g_fit["h2"], g_fit["nLL"]
        logging.info("just G: h2uncorr: {0}, nLLuncorr: {1}".format(
            h2uncorr, nLLuncorr))
    else:
        h2uncorr, nLLuncorr = np.nan, np.nan

    # --- a2: best mixing of G and E (only searched when not supplied) ---
    if a2 is not None:
        h2corr, e2, nLLcorr, h2corr_raw = np.nan, np.nan, np.nan, np.nan
    else:
        logging.info("Find best mixing for G_kernel and E_kernel")
        ge_model = _bias_only_lmm(K0=G_kernel.val, K1=E_kernel.val, a2=0.5)
        if just_testing:
            h2corr, e2, a2, nLLcorr, h2corr_raw = 0, 0, .5, 0, 0
        else:
            ge_fit = ge_model.findA2()
            h2, a2, nLLcorr = ge_fit["h2"], ge_fit["a2"], ge_fit["nLL"]
            h2corr = h2 * (1 - a2)
            e2 = h2 * a2
            h2corr_raw = h2
        logging.info(
            "G plus E mixture: h2corr: {0}, e2: {1}, a2: {2}, nLLcorr: {3} (h2corr_raw:{4})"
            .format(h2corr, e2, a2, nLLcorr, h2corr_raw))

    # --- a2_gxe2: best mixing of the G+E kernel and the GxE kernel ---
    if do_gxe2:
        # G+E is the a2-weighted combination of two standardized kernels, so
        # it needs no further standardization.
        mixed_val = (1 - a2) * G_kernel.val + a2 * E_kernel.val
        GplusE_kernel = KernelData(iid=G_kernel.iid,
                                   val=mixed_val,
                                   name="{0} G + {1} E".format(1 - a2, a2))

        logging.info("Find best mixing for GxE and GplusE_kernel")

        # '*' is element-wise multiplication here.
        product_val = G_kernel.val * E_kernel.val
        if permute_times_index >= 0:
            # Shuffle the values but not the iids; doing both would cancel out.
            np.random.seed(
                (permute_times_seed + permute_times_index) % 4294967295)
            shuffled = np.arange(G_kernel.iid_count)
            np.random.shuffle(shuffled)
            product_val = pstutil.sub_matrix(product_val, shuffled, shuffled)

        GxE_kernel = KernelData(iid=G_kernel.iid, val=product_val, name="GxE")
        GxE_kernel = GxE_kernel.standardize()

        gxe_model = _bias_only_lmm(K0=GplusE_kernel.val, K1=GxE_kernel.val, a2=0.5)
        if just_testing:
            gxe2, a2_gxe2, nLL_gxe2 = 0, .5, 0
        else:
            gxe_fit = gxe_model.findA2()
            gxe2, a2_gxe2, nLL_gxe2 = gxe_fit["h2"], gxe_fit["a2"], gxe_fit["nLL"]
            gxe2 *= a2_gxe2
        logging.info(
            "G+E plus GxE mixture: gxe2: {0}, a2_gxe2: {1}, nLL_gxe2: {2}".
            format(gxe2, a2_gxe2, nLL_gxe2))
    else:
        gxe2, a2_gxe2, nLL_gxe2 = np.nan, np.nan, np.nan

    # --- Collect the fitted values together with an echo of the settings ---
    ret = {
        "h2uncorr": h2uncorr, "nLLuncorr": nLLuncorr,
        "h2corr": h2corr, "h2corr_raw": h2corr_raw,
        "e2": e2, "a2": a2, "nLLcorr": nLLcorr,
        "gxe2": gxe2, "a2_gxe2": a2_gxe2, "nLL_gxe2": nLL_gxe2,
        "alpha": alpha, "alpha_power": alpha_power,
        "phen": np.array(pheno.sid, dtype='str')[0],
        "jackknife_index": jackknife_index,
        "jackknife_count": jackknife_count,
        "jackknife_seed": jackknife_seed,
        "permute_plus_index": permute_plus_index,
        "permute_plus_count": permute_plus_count,
        "permute_plus_seed": permute_plus_seed,
        "permute_times_index": permute_times_index,
        "permute_times_count": permute_times_count,
        "permute_times_seed": permute_times_seed,
    }

    logging.info("run_line: {0}".format(ret))
    return ret
Exemplo n.º 48
0
def loadPheno(bed, phenoFile, missingPhenotype='-9', keepDict=False):
    """
    Load one phenotype from *phenoFile* and align it with *bed*.

    The phenotype is read with ``phenoUtils.loadOnePhen`` (using
    *missingPhenotype* as the missing-value marker), checked against *bed*
    with ``checkIntersection``, and then both are passed through
    ``pstutil.intersect_apply``.

    Returns ``(bed, pheno)`` after intersection. ``pheno`` is the whole loaded
    object when *keepDict* is true, otherwise only its ``'vals'`` entry.
    """
    phenoDict = phenoUtils.loadOnePhen(phenoFile,
                                       missing=missingPhenotype,
                                       vectorize=True)
    checkIntersection(bed, phenoDict, 'phenotypes')
    bed, phenoDict = pstutil.intersect_apply([bed, phenoDict])
    if keepDict:
        return bed, phenoDict
    return bed, phenoDict['vals']
Exemplo n.º 49
0
    def score(self,
              X=None,
              y=None,
              K0_whole_test=None,
              K1_whole_test=None,
              iid_if_none=None,
              return_mse_too=False,
              return_per_iid=False,
              count_A1=None):
        """
        Method for calculating the negative log likelihood of testing examples.
        If the examples in X,y,  K0_whole_test, K1_whole_test are not the same, they will be reordered and intersected.

        :param X: testing covariate information, optional: 
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type X: a PySnpTools `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ (such as `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__) or string.

        :param y: testing phenotype:
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type y: a PySnpTools `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ (such as `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__) or string.

        :param K0_whole_test: A similarity matrix from all the examples to the test examples. Alternatively,
               the test SNPs needed to construct such a similarity matrix.
               Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__. If you give a string, can be the name of a PLINK-formated Bed file.
               Can be PySnpTools `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`__. If you give a string it can be the name of a `KernelNpz <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelnpz>`__ file.
        :type K0_whole_test: `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string or `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`__

        :param K1_whole_test: A second similarity matrix from all the examples to the test examples. Alternatively,
               the test SNPs needed to construct such a similarity matrix.
               Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__. If you give a string, can be the name of a PLINK-formated Bed file.
               Can be PySnpTools `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`__. If you give a string it can be the name of a `KernelNpz <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelnpz>`__ file.
        :type K1_whole_test: `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string or `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`__

        :param iid_if_none: Examples to predict for if no X, K0_whole_test, K1_whole_test is provided.
        :type iid_if_none: an ndarray of two strings

        :param return_mse_too: If true, will also return the squared error. Note that it is computed as the
             *sum* (not the mean) of the squared differences between the actual and the predicted values.
             Not implemented together with ``return_per_iid``.
        :type return_mse_too: bool

        :param return_per_iid: If true, instead of one joint value, returns one negative log likelihood per example,
             each computed from that example's predicted mean and its diagonal entry of the predicted covariance.
        :type return_per_iid: bool

        :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
             alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
        :type count_A1: bool

        :rtype: a float of the negative log likelihood and, optionally, a float of the squared error.
             When ``return_per_iid`` is true, a `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__
             with one row per example and the single column 'nLL'.

        :raises Exception: if both ``return_per_iid`` and ``return_mse_too`` are true.
        """
        mean0, covar0 = self.predict(K0_whole_test=K0_whole_test,
                                     K1_whole_test=K1_whole_test,
                                     X=X,
                                     iid_if_none=iid_if_none,
                                     count_A1=count_A1)
        y = _pheno_fixup(y, iid_if_none=covar0.iid, count_A1=count_A1)
        # Make the predictions and the actual phenotype agree on iids and order.
        mean, covar, y = intersect_apply([mean0, covar0, y])
        mean = mean.read(order='A', view_ok=True).val
        covar = covar.read(order='A', view_ok=True).val
        y_actual = y.read().val
        if not return_per_iid:
            var = multivariate_normal(mean=mean.reshape(-1), cov=covar)
            # Use logpdf rather than log(pdf): with more than a few examples the
            # joint density underflows to 0.0 and -log(0) would be reported as inf.
            nll = -var.logpdf(y_actual.reshape(-1))
            if not return_mse_too:
                return nll
            else:
                mse = ((y_actual - mean)**2).sum()
                return nll, mse
        else:
            if not return_mse_too:
                result = SnpData(iid=y.iid,
                                 sid=['nLL'],
                                 val=np.empty((y.iid_count, 1)),
                                 name="nLL")
                for iid_index in range(y.iid_count):
                    # Marginal distribution of one example: its own mean and variance.
                    var = multivariate_normal(mean=mean[iid_index],
                                              cov=covar[iid_index, iid_index])
                    nll = -var.logpdf(y_actual[iid_index])
                    result.val[iid_index, 0] = nll
                return result
            else:
                raise Exception("need code for mse_too")
Exemplo n.º 50
0
# Tutorial excerpt (Python 2 syntax): writing SnpData to disk, then aligning
# two readers on their shared iids, fitting a least-squares predictor and
# applying it to cases that were not used in the fit.
# NOTE(review): snpdata1010, Bed, Pheno and np are not defined in this excerpt;
# they must already be in scope from earlier in the script.
from pysnptools.snpreader import SnpHdf5
SnpHdf5.write("deleteme1010.snp.hdf5", snpdata1010)

# Summary: Every format has its own SnpReader class
#        Table: Pheno, SnpNpz, SnpHdf5
#    That SnpReader has a static write method for SnpData


# Topic: Intersecting iids
# What if we have two data sources with slightly different iids in a different order?
snpreader = Bed("all.bed")
# [::-2,:] walks the first axis (presumably the iid axis) backwards from the
# last row and keeps every second row, so the two readers no longer line up.
phenoreader = Pheno("pheno_10_causals.txt")[::-2,:]
# Python 2 print statement: shows the iid counts and the iids of both readers.
print snpreader.iid_count, phenoreader.iid_count, snpreader.iid, phenoreader.iid
# Create intersected and reordered readers with intersect_apply.
import pysnptools.util as pstutil
snpreader_i,phenoreader_i  = pstutil.intersect_apply([snpreader,phenoreader])
# After intersect_apply both readers must list the same iids in the same order.
assert np.array_equal(snpreader_i.iid,phenoreader_i.iid)
snpdata_i = snpreader_i.read()
phenodata_i = phenoreader_i.read()

# Least-squares fit of the phenotype on the SNP values; [0] selects the
# solution coefficients from the tuple returned by lstsq.
bs = np.linalg.lstsq(snpdata_i.val, phenodata_i.val,rcond=-1)[0] # usually would add a 1's column
predicted = snpdata_i.val.dot(bs)
import matplotlib.pyplot as plt
# Scatter of actual versus predicted phenotype for the cases used in the fit.
plt.plot(phenodata_i.val, predicted, '.', markersize=10)
# plt.show()  # Easy to 'predict' the 250 seen cases with 5000 variables.
# How does it predict unseen cases?
# [-2::-2,:] starts from the second-to-last row instead, i.e. the rows that the
# [::-2,:] slice above skipped.
phenoreader_unseen = Pheno("pheno_10_causals.txt")[-2::-2,:]
snpreader_u,phenoreader_u  = pstutil.intersect_apply([snpreader,phenoreader_unseen])
snpdata_u = snpreader_u.read()
phenodata_u = phenoreader_u.read()
# Apply the coefficients fitted on the seen cases to the unseen cases.
predicted_u = snpdata_u.val.dot(bs)
def work_item(arg_tuple):
    """
    Fit the G-only, G+E and (G+E)+GxE mixed models for a single phenotype.

    :param arg_tuple: a 13-item tuple, in this order:

        * ``pheno`` -- phenotype reader; rows whose first value is NaN are dropped.
        * ``G_kernel`` -- kernel reader used as the "G" similarity.
        * ``spatial_coor``, ``spatial_iid`` -- coordinates handed to
          ``spatial_similarity`` and the iids that go with them.
        * ``alpha``, ``alpha_power`` -- forwarded to ``spatial_similarity``
          (the latter as ``power``) and echoed in the result.
        * ``(jackknife_index, jackknife_count, jackknife_seed)`` -- a negative
          index turns jackknifing off.
        * ``(permute_plus_index, permute_plus_count, permute_plus_seed)`` -- a
          negative index turns the E permutation off.
        * ``(permute_times_index, permute_times_count, permute_times_seed)`` --
          a negative index turns the GxE permutation off.
        * ``just_testing`` -- if true, no optimisation is run and placeholder
          values (0 or .5) are reported instead.
        * ``do_uncorr`` -- if false, the G-only fit is skipped (NaN reported).
        * ``do_gxe2`` -- if false, the GxE fit is skipped (NaN reported).
        * ``a2`` -- fixed mixing weight between G and E, or None to search for it.

    :rtype: dict with the fitted quantities plus an echo of the settings.
    """
    (pheno, G_kernel, spatial_coor, spatial_iid, alpha, alpha_power,
     jackknife_args, permute_plus_args, permute_times_args,
     just_testing, do_uncorr, do_gxe2, a2) = arg_tuple
    jackknife_index, jackknife_count, jackknife_seed = jackknife_args
    permute_plus_index, permute_plus_count, permute_plus_seed = permute_plus_args
    permute_times_index, permute_times_count, permute_times_seed = permute_times_args

    # --- Drop individuals with a missing phenotype ---
    pheno = pheno.read()
    first_column = pheno.val[:, 0]
    has_value = first_column == first_column  # NaN never equals itself
    pheno = pheno[has_value, :]

    # --- Environment: spatial similarity wrapped as a KernelData ---
    E_kernel = KernelData(iid=spatial_iid,
                          val=spatial_similarity(spatial_coor, alpha, power=alpha_power))

    # --- Align everything on the shared iids ---
    from pysnptools.util import intersect_apply
    G_kernel, E_kernel, pheno = intersect_apply([G_kernel, E_kernel, pheno])

    # --- Jackknife: keep only the training side of the requested fold ---
    if jackknife_index >= 0:
        assert jackknife_count <= G_kernel.iid_count, "expect the number of groups to be less than the number of iids"
        assert jackknife_index < jackknife_count, "expect the jackknife index to be less than the count"
        folds = cross_validation.KFold(n=G_kernel.iid_count,
                                       n_folds=jackknife_count,
                                       shuffle=True,
                                       random_state=jackknife_seed % 4294967295)
        keep_index, _ = _nth(folds, jackknife_index)
        pheno = pheno[keep_index, :]
        G_kernel = G_kernel[keep_index]
        E_kernel = E_kernel[keep_index]

    # --- "Plus" permutation: shuffle E's values but keep its iids, because
    #     shuffling both would cancel out. The index is folded into the seed. ---
    if permute_plus_index >= 0:
        np.random.seed((permute_plus_seed + permute_plus_index) % 4294967295)
        shuffled = np.arange(G_kernel.iid_count)
        np.random.shuffle(shuffled)
        permuted_E = E_kernel[shuffled].read()
        E_kernel = KernelData(iid=E_kernel.iid,
                              val=permuted_E.val,
                              name="permutation {0}".format(permute_plus_index))

    # Now that the final set of iids is known, standardize.
    pheno = pheno.read().standardize()        # defaults to Unit standardize
    G_kernel = G_kernel.read().standardize()  # defaults to DiagKtoN standardize
    E_kernel = E_kernel.read().standardize()  # defaults to DiagKtoN standardize

    def _bias_only_lmm(**kernel_args):
        # LMM over the given kernel(s) with a single all-ones covariate column.
        model = LMM()
        model.setK(**kernel_args)
        model.setX(np.ones([G_kernel.iid_count, 1]))
        model.sety(pheno.val[:, 0])
        return model

    # --- h2uncorr: best mixing of pure noise and G ---
    if do_uncorr:
        logging.info("Find best h2 for G_kernel")
        g_model = _bias_only_lmm(K0=G_kernel.val)
        if just_testing:
            h2uncorr, nLLuncorr = 0, 0
        else:
            g_fit = g_model.findH2()
            h2uncorr, nLLuncorr = g_fit["h2"], g_fit["nLL"]
        logging.info("just G: h2uncorr: {0}, nLLuncorr: {1}".format(h2uncorr, nLLuncorr))
    else:
        h2uncorr, nLLuncorr = np.nan, np.nan

    # --- a2: best mixing of G and E (only searched when not supplied) ---
    if a2 is not None:
        h2corr, e2, nLLcorr = np.nan, np.nan, np.nan
    else:
        logging.info("Find best mixing for G_kernel and E_kernel")
        ge_model = _bias_only_lmm(K0=G_kernel.val, K1=E_kernel.val, a2=0.5)
        if just_testing:
            h2corr, e2, a2, nLLcorr = 0, 0, .5, 0
        else:
            ge_fit = ge_model.findA2()
            h2, a2, nLLcorr = ge_fit["h2"], ge_fit["a2"], ge_fit["nLL"]
            h2corr = h2 * (1 - a2)
            e2 = h2 * a2
        logging.info("G plus E mixture: h2corr: {0}, e2: {1}, a2: {2}, nLLcorr: {3}".format(h2corr, e2, a2, nLLcorr))

    # --- a2_gxe2: best mixing of the G+E kernel and the GxE kernel ---
    if do_gxe2:
        # G+E is the a2-weighted combination of two standardized kernels, so
        # it needs no further standardization.
        mixed_val = (1 - a2) * G_kernel.val + a2 * E_kernel.val
        GplusE_kernel = KernelData(iid=G_kernel.iid,
                                   val=mixed_val,
                                   name="{0} G + {1} E".format(1 - a2, a2))

        logging.info("Find best mixing for GxE and GplusE_kernel")

        # '*' is element-wise multiplication here.
        product_val = G_kernel.val * E_kernel.val
        if permute_times_index >= 0:
            # Shuffle the values but not the iids; doing both would cancel out.
            np.random.seed((permute_times_seed + permute_times_index) % 4294967295)
            shuffled = np.arange(G_kernel.iid_count)
            np.random.shuffle(shuffled)
            product_val = pstutil.sub_matrix(product_val, shuffled, shuffled)

        GxE_kernel = KernelData(iid=G_kernel.iid, val=product_val, name="GxE")
        GxE_kernel = GxE_kernel.standardize()

        gxe_model = _bias_only_lmm(K0=GplusE_kernel.val, K1=GxE_kernel.val, a2=0.5)
        if just_testing:
            gxe2, a2_gxe2, nLL_gxe2 = 0, .5, 0
        else:
            gxe_fit = gxe_model.findA2()
            gxe2, a2_gxe2, nLL_gxe2 = gxe_fit["h2"], gxe_fit["a2"], gxe_fit["nLL"]
            gxe2 *= a2_gxe2
        logging.info("G+E plus GxE mixture: gxe2: {0}, a2_gxe2: {1}, nLL_gxe2: {2}".format(gxe2, a2_gxe2, nLL_gxe2))
    else:
        gxe2, a2_gxe2, nLL_gxe2 = np.nan, np.nan, np.nan

    # --- Collect the fitted values together with an echo of the settings ---
    ret = {
        "h2uncorr": h2uncorr,
        "nLLuncorr": nLLuncorr,
        "h2corr": h2corr,
        "e2": e2,
        "a2": a2,
        "nLLcorr": nLLcorr,
        "gxe2": gxe2,
        "a2_gxe2": a2_gxe2,
        "nLL_gxe2": nLL_gxe2,
        "alpha": alpha,
        "alpha_power": alpha_power,
        "phen": pheno.sid[0],
        "jackknife_index": jackknife_index,
        "jackknife_count": jackknife_count,
        "jackknife_seed": jackknife_seed,
        "permute_plus_index": permute_plus_index,
        "permute_plus_count": permute_plus_count,
        "permute_plus_seed": permute_plus_seed,
        "permute_times_index": permute_times_index,
        "permute_times_count": permute_times_count,
        "permute_times_seed": permute_times_seed,
    }

    logging.info("run_line: {0}".format(ret))
    return ret
Exemplo n.º 52
0
def single_snp_linreg(test_snps, pheno, covar=None, max_output_len=None, output_file_name=None, GB_goal=None, runner=None):
    """
    Function performing single SNP GWAS using linear regression. Will reorder and intersect IIDs as needed.

    :param test_snps: SNPs to test. Can be any :class:`.SnpReader`. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type test_snps: a :class:`.SnpReader` or a string

    :param pheno: A single phenotype: Can be any :class:`.SnpReader`, for example, :class:`.Pheno` or :class:`.SnpData`.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           Any IIDs with missing values will be removed.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type pheno: a :class:`.SnpReader` or a string

    :param covar: covariate information, optional: Can be any :class:`.SnpReader`, for example, :class:`.Pheno` or :class:`.SnpData`.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type covar: a :class:`.SnpReader` or a string


    :param max_output_len: Maximum number of Pvalues to return. Default to None, which means 'Return all'.
    :type max_output_len: number
    
    :param output_file_name: Name of file to write results to, optional. If not given, no output file will be created.
    :type output_file_name: file name

    :param GB_goal: gigabytes of memory the run should use, optional. If not given, will read the test_snps in blocks of size iid_count,
        which is memory efficient with little overhead on computation time.
    :type GB_goal: number

    :param runner: a runner, optional: Tells how to run locally, multi-processor, or on a cluster.
        If not given, the function is run locally.
    :type runner: a runner.

    :rtype: Pandas dataframe with one row per test SNP, sorted by ascending "PValue".
        Columns are "sid_index", "SNP", "Chr", "GenDist", "ChrPos" and "PValue".


    :Example:

    >>> import logging
    >>> import numpy as np
    >>> from fastlmm.association import single_snp_linreg
    >>> from pysnptools.snpreader import Bed
    >>> logging.basicConfig(level=logging.INFO)
    >>> pheno_fn = "../feature_selection/examples/toydata.phe"
    >>> results_dataframe = single_snp_linreg(test_snps="../feature_selection/examples/toydata.5chrom", pheno=pheno_fn)
    >>> print results_dataframe.iloc[0].SNP,round(results_dataframe.iloc[0].PValue,7),len(results_dataframe)
    null_576 1e-07 10000


    """
    assert test_snps is not None, "test_snps must be given as input"
    test_snps = _snps_fixup(test_snps)
    pheno = _pheno_fixup(pheno).read()
    assert pheno.sid_count == 1, "Expect pheno to be just one variable"
    # Drop IIDs whose phenotype is missing (NaN is never equal to itself).
    pheno = pheno[(pheno.val==pheno.val)[:,0],:]
    covar = _pheno_fixup(covar, iid_if_none=pheno.iid)
    test_snps, pheno, covar  = pstutil.intersect_apply([test_snps, pheno, covar])
    logging.debug("# of iids now {0}".format(test_snps.iid_count))

    _, _, block_size = _set_block_size(test_snps, None, 0, GB_goal, force_full_rank=False, force_low_rank=False)

    #!!!what about missing data in covar, in test_snps, in y
    # Append a column of ones (the bias term) to the covariates.
    covar = np.c_[covar.read(view_ok=True,order='A').val,np.ones((test_snps.iid_count, 1))]  #view_ok because np.c_ will allocation new memory
    y =  pheno.read(view_ok=True,order='A').val #view_ok because this code already did a fresh read to look for any missing values

    def mapper(start):
        # Test one block of SNPs, beginning at column index 'start'.
        snp_index = np.arange(start,min(start+block_size,test_snps.sid_count))
        x = test_snps[:,start:start+block_size].read().standardize().val
        _,pval_in = lin_reg.f_regression_cov_alt(x,y,covar)
        pval_in = pval_in.reshape(-1)

        if max_output_len is None:
            return pval_in,snp_index
        else: #We only need to return the top max_output_len results
            sort_index = np.argsort(pval_in)[:max_output_len]
            return pval_in[sort_index],snp_index[sort_index]

    def reducer(pval_and_snp_index_sequence):
        # Merge the per-block results, sort by p-value and build the output table.
        pval_list = []
        snp_index_list = []
        for pval, snp_index in pval_and_snp_index_sequence:
            pval_list.append(pval)
            snp_index_list.append(snp_index)
        pval = np.concatenate(pval_list)
        snp_index = np.concatenate(snp_index_list)
        sort_index = np.argsort(pval)
        if max_output_len is not None:
            sort_index = sort_index[:max_output_len]
        index = snp_index[sort_index]

        dataframe = pd.DataFrame(
            index=np.arange(len(index)),
            columns=('sid_index', 'SNP', 'Chr', 'GenDist', 'ChrPos', 'PValue')
            )
        #!!Is this the only way to set types in a dataframe?
        # The builtin float is used because the np.float alias (which was just
        # the builtin float) was removed from NumPy in version 1.24.
        for column in ('sid_index', 'Chr', 'GenDist', 'ChrPos', 'PValue'):
            dataframe[column] = dataframe[column].astype(float)

        dataframe['sid_index'] = index
        dataframe['SNP'] = test_snps.sid[index]
        dataframe['Chr'] = test_snps.pos[index,0]
        dataframe['GenDist'] = test_snps.pos[index,1]
        dataframe['ChrPos'] = test_snps.pos[index,2]
        dataframe['PValue'] = pval[sort_index]

        if output_file_name is not None:
            dataframe.to_csv(output_file_name, sep="\t", index=False)

        return dataframe

    # 'range' (rather than the Python-2-only 'xrange') yields the start index of
    # every SNP block on both Python 2 and Python 3.
    dataframe = map_reduce(range(0,test_snps.sid_count,block_size),
                           mapper=mapper,
                           reducer=reducer,
                           input_files=[test_snps,pheno,covar],
                           output_files=[output_file_name],
                           name = "single_snp_linreg",
                           runner=runner)
    return dataframe
Exemplo n.º 53
0
def loadCovars(bed, covarFile):
    """
    Load covariates from *covarFile* and align them with *bed*.

    The covariates are read with ``phenoUtils.loadPhen``, checked against
    *bed* with ``checkIntersection`` (``checkSuperSet=True``) and passed,
    together with *bed*, through ``pstutil.intersect_apply``.

    Returns the ``'vals'`` entry of the intersected covariates; the
    intersected *bed* is discarded.
    """
    loaded = phenoUtils.loadPhen(covarFile)
    checkIntersection(bed, loaded, 'covariates', checkSuperSet=True)
    _discarded_bed, aligned = pstutil.intersect_apply([bed, loaded])
    return aligned['vals']
Exemplo n.º 54
0
def single_snp(test_snps, pheno, K0=None,
                 K1=None, mixing=None,
                 covar=None, covar_by_chrom=None, leave_out_one_chrom=True, output_file_name=None, h2=None, log_delta=None,
                 cache_file = None, GB_goal=None, interact_with_snp=None, force_full_rank=False, force_low_rank=False, G0=None, G1=None, runner=None):
    """
    Function performing single SNP GWAS using cross validation over the chromosomes and REML. Will reorder and intersect IIDs as needed.
    (For backwards compatibility, you may use 'leave_out_one_chrom=False' to skip cross validation, but that is not recommended.)

    :param test_snps: SNPs to test. Can be any :class:`.SnpReader`. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type test_snps: a :class:`.SnpReader` or a string

    :param pheno: A single phenotype: Can be any :class:`.SnpReader`, for example, :class:`.Pheno` or :class:`.SnpData`.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           Any IIDs with missing values will be removed.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type pheno: a :class:`.SnpReader` or a string

    :param K0: SNPs from which to create a similarity matrix. If not given, will use G0 and, failing that, test_snps.
           Can be any :class:`.SnpReader`. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (When leave_out_one_chrom is False, can be a :class:`.KernelReader` or a :class:`.KernelNpz`-formatted file name.)
    :type K0: :class:`.SnpReader` or a string (or :class:`.KernelReader`)

    :param K1: SNPs from which to create a second similarity matrix, optional. (Also, see 'mixing').
           Can be any :class:`.SnpReader`. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (When leave_out_one_chrom is False, can be a :class:`.KernelReader` or a :class:`.KernelNpz`-formatted file name.)
    :type K1: :class:`.SnpReader` or a string (or :class:`.KernelReader`)

    :param mixing: Weight between 0.0 (inclusive, default) and 1.0 (inclusive) given to K1 relative to K0.
            If you give no mixing number and a K1 is given, the best weight will be learned.
    :type mixing: number

    :param covar: covariate information, optional: Can be any :class:`.SnpReader`, for example, :class:`.Pheno` or :class:`.SnpData`.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type covar: a :class:`.SnpReader` or a string

    :param covar_by_chrom: per-chromosome covariate information, optional. A dictionary whose values are
           combined with 'covar' for each chromosome. Must be None when leave_out_one_chrom is False.
    :type covar_by_chrom: a dictionary keyed by chromosome

    :param leave_out_one_chrom: Perform single SNP GWAS via cross validation over the chromosomes. Default to True.
           (Warning: setting False can cause proximal contamination.)
    :type leave_out_one_chrom: boolean
    

    :param output_file_name: Name of file to write results to, optional. If not given, no output file will be created.
           The file is written tab-separated.
    :type output_file_name: file name

    :param h2: A parameter to LMM learning, optional
            If not given will search for best value.
            If mixing is unspecified, then h2 must also be unspecified.
    :type h2: number

    :param log_delta: a re-parameterization of h2 provided for backwards compatibility. h2 is 1./(exp(log_delta)+1)
    :type log_delta: number

    :param cache_file: Name of  file to read or write cached precomputation values to, optional.
                If not given, no cache file will be used.
                If given and file does not exist, will write precomputation values to file.
                If given and file does exist, will read precomputation values from file.
                The file contains the U and S matrix from the decomposition of the training matrix. It is in Python's np.savez (\*.npz) format.
                Calls using the same cache file should have the same 'K0' and 'K1'
                If given and the file does exist then K0 and K1 need not be given.
                (Only passed on when leave_out_one_chrom is False; the per-chromosome runs are given no cache file.)
    :type cache_file: file name

    :param GB_goal: gigabytes of memory the run should use, optional. If not given, will read the test_snps in blocks the same size as the kernel,
        which is memory efficient with little overhead on computation time.
    :type GB_goal: number

    :param interact_with_snp: index of a covariate to perform an interaction test with. 
            Allows for interaction testing (interact_with_snp x snp will be tested)
            default: None

    :param force_full_rank: Even if kernels are defined with fewer SNPs than IIDs, create an explicit iid_count x iid_count kernel. Cannot be True if force_low_rank is True.
    :type force_full_rank: Boolean

    :param force_low_rank: Even if kernels are defined with fewer IIDs than SNPs, create a low-rank iid_count x sid_count kernel. Cannot be True if force_full_rank is True.
    :type force_low_rank: Boolean

    :param G0: Same as K0. Provided for backwards compatibility. Cannot be given if K0 is given.
    :type G0: :class:`.SnpReader` or a string (or :class:`.KernelReader`)

    :param G1: Same as K1. Provided for backwards compatibility. Cannot be given if K1 is given.
    :type G1: :class:`.SnpReader` or a string (or :class:`.KernelReader`)

    :param runner: a runner, optional: Tells how to run locally, multi-processor, or on a cluster.
        If not given, the function is run locally.
    :type runner: a runner.

    :rtype: Pandas dataframe with one row per test SNP. Columns include "PValue"



    :Example:

    >>> import logging
    >>> import numpy as np
    >>> from fastlmm.association import single_snp
    >>> from pysnptools.snpreader import Bed
    >>> logging.basicConfig(level=logging.INFO)
    >>> pheno_fn = "../feature_selection/examples/toydata.phe"
    >>> results_dataframe = single_snp(test_snps="../feature_selection/examples/toydata.5chrom", pheno=pheno_fn)
    >>> print results_dataframe.iloc[0].SNP,round(results_dataframe.iloc[0].PValue,7),len(results_dataframe)
    null_576 1e-07 10000


    """
    # Start time; only used for the "Runtime" log line emitted by reducer_closure below.
    t0 = time.time()
    if force_full_rank and force_low_rank:
        raise Exception("Can't force both full rank and low rank")

    # Normalize the inputs (strings, legacy dictionaries, readers) via the fixup helpers.
    assert test_snps is not None, "test_snps must be given as input"
    test_snps = _snps_fixup(test_snps)
    pheno = _pheno_fixup(pheno).read()
    assert pheno.sid_count == 1, "Expect pheno to be just one variable"
    # NaN is the only value not equal to itself, so this keeps just the rows
    # whose (single) phenotype value is not missing.
    pheno = pheno[(pheno.val==pheno.val)[:,0],:]
    covar = _pheno_fixup(covar, iid_if_none=pheno.iid)

    if not leave_out_one_chrom:
        # Single pass over all test SNPs with one pair of kernels (no cross validation).
        assert covar_by_chrom is None, "When 'leave_out_one_chrom' is False, 'covar_by_chrom' must be None"
        # K0 falls back to the legacy G0 argument and finally to the test SNPs themselves;
        # K1 falls back to the legacy G1 argument.
        K0 = _kernel_fixup(K0 or G0 or test_snps, iid_if_none=test_snps.iid, standardizer=Unit())
        K1 = _kernel_fixup(K1 or G1, iid_if_none=test_snps.iid, standardizer=Unit())
        # Restrict every input to the shared individuals, in a common order.
        K0, K1, test_snps, pheno, covar  = pstutil.intersect_apply([K0, K1, test_snps, pheno, covar])
        logging.debug("# of iids now {0}".format(K0.iid_count))
        K0, K1, block_size = _set_block_size(K0, K1, mixing, GB_goal, force_full_rank, force_low_rank)

        frame =  _internal_single(K0=K0, test_snps=test_snps, pheno=pheno,
                                    covar=covar, K1=K1,
                                    mixing=mixing, h2=h2, log_delta=log_delta,
                                    cache_file = cache_file, force_full_rank=force_full_rank,force_low_rank=force_low_rank,
                                    output_file_name=output_file_name,block_size=block_size, interact_with_snp=interact_with_snp,
                                    runner=runner)
        # Sanity check that the result covers every test SNP index in [0, sid_count).
        sid_index_range = IntRangeSet(frame['sid_index'])
        assert sid_index_range == (0,test_snps.sid_count), "Some SNP rows are missing from the output"
    else:
        # Cross validation over chromosomes: one map_reduce work item per chromosome.
        chrom_list = list(set(test_snps.pos[:,0])) # find the set of all chroms mentioned in test_snps, the main testing data

        # Everything the distributed work items may need to read.
        # NOTE(review): list + dict.values() only works where values() returns a list (Python 2) -- confirm before porting.
        input_files = [test_snps, pheno, K0, G0, K1, G1, covar] + ([] if covar_by_chrom is None else covar_by_chrom.values())

        def nested_closure(chrom):
            # Mapper: test only the SNPs located on this chromosome.
            test_snps_chrom = test_snps[:,test_snps.pos[:,0]==chrom]
            covar_chrom = _create_covar_chrom(covar, covar_by_chrom, chrom)

            # Kernels specific to this chromosome.
            # NOTE(review): presumably built from the SNPs that are NOT on 'chrom' -- confirm in _K_per_chrom.
            K0_chrom = _K_per_chrom(K0 or G0 or test_snps, chrom, test_snps.iid)
            K1_chrom = _K_per_chrom(K1 or G1, chrom, test_snps.iid)

            K0_chrom, K1_chrom, test_snps_chrom, pheno_chrom, covar_chrom  = pstutil.intersect_apply([K0_chrom, K1_chrom, test_snps_chrom, pheno, covar_chrom])
            logging.debug("# of iids now {0}".format(K0_chrom.iid_count))
            K0_chrom, K1_chrom, block_size = _set_block_size(K0_chrom, K1_chrom, mixing, GB_goal, force_full_rank, force_low_rank)

            # The per-chromosome run gets no cache file and no output file, and a Local() runner;
            # the caller's runner is applied to the outer map_reduce instead.
            distributable = _internal_single(K0=K0_chrom, test_snps=test_snps_chrom, pheno=pheno_chrom,
                                        covar=covar_chrom, K1=K1_chrom,
                                        mixing=mixing, h2=h2, log_delta=log_delta, cache_file=None,
                                        force_full_rank=force_full_rank,force_low_rank=force_low_rank,
                                        output_file_name=None, block_size=block_size, interact_with_snp=interact_with_snp,
                                        runner=Local())
            
            return distributable

        def reducer_closure(frame_sequence):
            # Reducer: merge the per-chromosome frames, order by ascending PValue
            # and renumber the rows 0..n-1.
            frame = pd.concat(frame_sequence)
            frame.sort_values(by="PValue", inplace=True)
            frame.index = np.arange(len(frame))
            if output_file_name is not None:
                frame.to_csv(output_file_name, sep="\t", index=False)
            # Summary statistics are logged only on this (leave_out_one_chrom) path.
            logging.info("PhenotypeName\t{0}".format(pheno.sid[0]))
            logging.info("SampleSize\t{0}".format(test_snps.iid_count))
            logging.info("SNPCount\t{0}".format(test_snps.sid_count))
            logging.info("Runtime\t{0}".format(time.time()-t0))

            return frame

        frame = map_reduce(chrom_list,
                   mapper = nested_closure,
                   reducer = reducer_closure,
                   input_files = input_files,
                   output_files = [output_file_name],
                   name = "single_snp (leave_out_one_chrom), out='{0}'".format(output_file_name),
                   runner = runner)

    return frame
Exemplo n.º 55
0
    def score(self, X=None, y=None, K0_whole_test=None, K1_whole_test=None, iid_if_none=None, return_mse_too=False, return_per_iid=False, count_A1=None):
        """
        Method for calculating the negative log likelihood of testing examples.
        If the examples in X,y,  K0_whole_test, K1_whole_test are not the same, they will be reordered and intersected.

        :param X: testing covariate information, optional: 
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type X: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

        :param y: testing phenotype:
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type y: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

        :param K0_whole_test: A similarity matrix from all the examples to the test examples. Alternatively,
               the test SNPs needed to construct such a similarity matrix.
               Can be any :class:`.SnpReader`. If you give a string, can be the name of a PLINK-formatted Bed file.
               Can be PySnpTools :class:`.KernelReader`. If you give a string it can be the name of a :class:`.KernelNpz` file.
        :type K0_whole_test: :class:`.SnpReader` or a string or :class:`.KernelReader`

        :param K1_whole_test: A second similarity matrix from all the examples to the test examples. Alternatively,
               the test SNPs needed to construct such a similarity matrix.
               Can be any :class:`.SnpReader`. If you give a string, can be the name of a PLINK-formatted Bed file.
               Can be PySnpTools :class:`.KernelReader`. If you give a string it can be the name of a :class:`.KernelNpz` file.
        :type K1_whole_test: :class:`.SnpReader` or a string or :class:`.KernelReader`

        :param iid_if_none: Examples to predict for if no X, K0_whole_test, K1_whole_test is provided.
        :type iid_if_none: an ndarray of two strings

        :param return_mse_too: If true, will also return the squared error. As implemented this is the
             *sum* of squared differences between the actual and predicted values (it is not divided by the count).
             Not supported together with return_per_iid.
        :type return_mse_too: bool

        :param return_per_iid: If true, returns one negative log likelihood per example (computed from that
             example's predicted mean and the matching diagonal entry of the predicted covariance) instead of
             a single joint value.
        :type return_per_iid: bool

        :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
             alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
        :type count_A1: bool

        :rtype: a float of the negative log likelihood and, optionally, a float of the squared error.
             When return_per_iid is true, a :class:`SnpData` with one row per example and a single 'nLL' column.

        :raises Exception: if both return_per_iid and return_mse_too are true (not implemented).
        """
        mean0, covar0 = self.predict(K0_whole_test=K0_whole_test,K1_whole_test=K1_whole_test,X=X,iid_if_none=iid_if_none,count_A1=count_A1)
        y = _pheno_fixup(y, iid_if_none=covar0.iid,count_A1=count_A1)
        # Align predictions and observed phenotype on the same examples, in the same order.
        mean, covar, y = intersect_apply([mean0, covar0, y])
        mean = mean.read(order='A',view_ok=True).val
        covar = covar.read(order='A',view_ok=True).val
        y_actual = y.read().val

        if return_per_iid:
            if return_mse_too:
                raise Exception("need code for mse_too")
            result = SnpData(iid=y.iid,sid=['nLL'],val=np.empty((y.iid_count,1)),name="nLL")
            for iid_index in range(y.iid_count):
                var = multivariate_normal(mean=mean[iid_index], cov=covar[iid_index,iid_index])
                # logpdf rather than log(pdf): the density can underflow to 0.0, which would give an infinite nLL.
                result.val[iid_index,0] = -var.logpdf(y_actual[iid_index])
            return result

        var = multivariate_normal(mean=mean.reshape(-1), cov=covar)
        # Work in log space: with many examples the joint density routinely underflows to 0.0,
        # and -log(0) would report the score as inf.
        nll = -var.logpdf(y_actual.reshape(-1))
        if not return_mse_too:
            return nll
        mse = ((y_actual-mean)**2).sum()
        return nll, mse
Exemplo n.º 56
0
def single_snp_leave_out_one_chrom(test_snps, pheno,
                 G1=None, mixing=0.0, #!!test mixing and G1
                 covar=None,covar_by_chrom=None,
                 h2=None, log_delta=None, output_file_name=None):
    """
    Function performing single SNP GWAS via cross validation over the chromosomes with REML

    :param test_snps: SNPs to test and to construct similarity matrix.
          If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
    :type test_snps: a :class:`.SnpReader` or a string

    :param pheno: A single phenotype: A 'pheno dictionary' contains an ndarray on the 'vals' key and a iid list on the 'iid' key.
      If you give a string, it should be the file name of a PLINK phenotype-formatted file.
    :type pheno: a 'pheno dictionary' or a string


    :param G1: SNPs from which to construct a second similarity matrix, optional. (Also, see 'mixing').
          If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
    :type G1: a :class:`.SnpReader` or a string

    :param mixing: Weight between 0.0 (inclusive, default) and 1.0 (inclusive) given to G1 relative to G0.
            If you give no mixing number, G0 will get all the weight and G1 will be ignored.
    :type mixing: number

    :param covar: covariate information, optional: A 'pheno dictionary' contains an ndarray on the 'vals' key and a iid list on the 'iid' key.
      If you give a string, it should be the file name of a PLINK phenotype-formatted file.
    :type covar: a 'pheno dictionary' or a string

    :param covar_by_chrom: covariate information, optional: A way to give different covariate information for each chromosome.
            It is a dictionary from chromosome number to a 'pheno dictionary' or a string
    :type covar_by_chrom: A dictionary from chromosome number to a 'pheno dictionary' or a string

    :param output_file_name: Name of file to write results to, optional. If not given, no output file will be created.
            The file is written tab-separated.
    :type output_file_name: file name

    :param h2: A parameter to LMM learning, optional
            If not given will search for best value.
            If mixing is unspecified, then h2 must also be unspecified.
    :type h2: number

    :param log_delta: a re-parameterization of h2 provided for backwards compatibility.
    :type log_delta: number


    :rtype: Pandas dataframe with one row per test SNP, sorted by ascending "PValue". Columns include "PValue"

    :Example:

    >>> import logging
    >>> import numpy as np
    >>> from fastlmm.association import single_snp_leave_out_one_chrom
    >>> from pysnptools.snpreader import Bed
    >>> logging.basicConfig(level=logging.INFO)
    >>> pheno_fn = "../feature_selection/examples/toydata.phe"
    >>> results_dataframe = single_snp_leave_out_one_chrom(test_snps="../feature_selection/examples/toydata.5chrom", pheno=pheno_fn, h2=.2)
    >>> print results_dataframe.iloc[0].SNP,round(results_dataframe.iloc[0].PValue,7),len(results_dataframe)
    null_576 1e-07 10000

    """
    t0 = time.time()
    # Normalize the inputs, then restrict them all to the shared individuals in a common order.
    test_snps = _snp_fixup(test_snps)
    G1 = _snp_fixup(G1, iid_source_if_none=test_snps)
    pheno = _pheno_fixup(pheno)
    covar = _pheno_fixup(covar, iid_source_if_none=pheno)
    test_snps, G1, pheno, covar = pstutil.intersect_apply([test_snps, G1, pheno, covar])
    G0_standardized = test_snps.read().standardize()
    G1_standardized = G1.read().standardize()

    # The chromosome of every SNP does not change inside the loop, so look it up once.
    G0_chrom_of_sid = G0_standardized.pos[:,0]
    G1_chrom_of_sid = G1_standardized.pos[:,0]

    chrom_set = set(G0_chrom_of_sid) # find the set of all chroms mentioned in G0_standardized, the main training data
    assert len(chrom_set) > 1, "single_leave_out_one_chrom requires more than one chromosome"
    frame_list = []
    for chrom in chrom_set:
        #!!is it OK to read (and standardize) G0_standardized and G1 over and over again, once for each chrom?
        G0_standardized_chrom = G0_standardized[:,G0_chrom_of_sid != chrom].read() # train on snps that don't match this chrom
        test_snps_chrom = G0_standardized[:,G0_chrom_of_sid == chrom].read() # test on snps that do match this chrom
        G1_standardized_chrom = G1_standardized[:,G1_chrom_of_sid != chrom].read() # train on snps that don't match the chrom
        covar_chrom = _create_covar_chrom(covar, covar_by_chrom, chrom)

        frame_chrom = _internal_single(G0_standardized=G0_standardized_chrom, test_snps=test_snps_chrom, pheno=pheno,
                                covar=covar_chrom, G1_standardized=G1_standardized_chrom, mixing=mixing,
                                h2=h2, log_delta=log_delta, cache_file=None)

        frame_list.append(frame_chrom)

    # Merge the per-chromosome results, order by ascending PValue and renumber the rows.
    frame = pd.concat(frame_list)
    # DataFrame.sort was removed from pandas; sort_values is the supported equivalent.
    frame.sort_values(by="PValue", inplace=True)
    frame.index = np.arange(len(frame))

    if output_file_name is not None:
        frame.to_csv(output_file_name, sep="\t", index=False)

    logging.info("PhenotypeName\t{0}".format(pheno['header']))
    logging.info("SampleSize\t{0}".format(test_snps.iid_count))
    logging.info("SNPCount\t{0}".format(test_snps.sid_count))
    logging.info("Runtime\t{0}".format(time.time()-t0))

    return frame
def single_snp_all_plus_select(test_snps, pheno, G=None, covar=None,
                 k_list = None,
                 n_folds=10, #1 is special and means test on train
                 seed = 0, output_file_name = None,
                 GB_goal=None, force_full_rank=False, force_low_rank=False, mixing=None, h2=None, do_plot=False, runner=None):
    """
    Function performing single SNP GWAS based on two kernels. The first kernel is based on all SNPs. The second kernel is a similarity matrix
    constructed of the top *k* SNPs where the SNPs are ordered via the PValue from :meth:`.single_snp` and *k* is determined via out-of-sample prediction.
    All work is done via 'leave_out_one_chrom', that one chromosome is tested and the kernels are constructed from the other chromosomes.
    Will reorder and intersect IIDs as needed.

    :param test_snps: SNPs to test. Can be any :class:`.SnpReader`. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type test_snps: a :class:`.SnpReader` or a string

    :param pheno: A single phenotype: Can be any :class:`.SnpReader`, for example, :class:`.Pheno` or :class:`.SnpData`.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           Any IIDs with missing values will be removed.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type pheno: a :class:`.SnpReader` or a string

    :param G: SNPs from which to create a similarity matrix of the top *k* SNPs. If not given, will use test_snps.
           Can be any :class:`.SnpReader`. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
    :type G: :class:`.SnpReader` or a string

    :param covar: covariate information, optional: Can be any :class:`.SnpReader`, for example, :class:`.Pheno` or :class:`.SnpData`.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type covar: a :class:`.SnpReader` or a string

    :param k_list: Values of *k* (in addition to 0) to test. Default to [1,2,4,8,...8192].
    :type k_list: list of numbers

    :param n_folds: Number of folds of cross validation to use for out-of-sample evaluation of various values of *k*. Default to 10.
    :type n_folds: number
    
    :param seed: (optional) Random seed used to generate permutations for lrt G0 fitting.
    :type seed: number

    :param output_file_name: Name of file to write results to, optional. If not given, no output file will be created.
    :type output_file_name: file name

    :param GB_goal: gigabytes of memory the run should use, optional. If not given, will read the test_snps in blocks the same size as the kernel,
        which is memory efficient with little overhead on computation time.
    :type GB_goal: number

    :param force_full_rank: Even if kernels are defined with fewer SNPs than IIDs, create an explicit iid_count x iid_count kernel. Cannot be True if force_low_rank is True.
    :type force_full_rank: Boolean

    :param force_low_rank: Even if kernels are defined with fewer IIDs than SNPs, create a low-rank iid_count x sid_count kernel. Cannot be True if force_full_rank is True.
    :type force_low_rank: Boolean

    :param mixing: A parameter to LMM learning telling how to combine the two kernels, optional
            If not given will search for best value.
    :type mixing: number

    :param h2: A parameter to LMM learning that tells how much weight to give the K's vs. the identity matrix, optional
            If not given will search for best value.
    :type h2: number

    :param do_plot: If true, will plot, for each chrom, the negative loglikelihood vs k.
    :type do_plot: boolean


    :param runner: a runner, optional: Tells how to run locally, multi-processor, or on a cluster.
        If not given, the function is run locally.
    :type runner: a runner.

    :rtype: Pandas dataframe with one row per test SNP. Columns include "PValue"


    :Example:

    >>> import logging
    >>> import numpy as np
    >>> from fastlmm.association import single_snp_all_plus_select
    >>> from pysnptools.snpreader import Bed
    >>> from fastlmm.util.runner import LocalMultiProc
    >>> logging.basicConfig(level=logging.INFO)
    >>> pheno_fn = "../feature_selection/examples/toydata.phe"
    >>> snps = Bed("../feature_selection/examples/toydata.5chrom.bed")[:,::100] #To make example faster, run on only 1/100th of the data
    >>> chrom5_snps = snps[:,snps.pos[:,0]==5] # Test on only chrom5
    >>> results_dataframe = single_snp_all_plus_select(test_snps=chrom5_snps,G=snps,pheno=pheno_fn,GB_goal=2,runner=LocalMultiProc(20,mkl_num_threads=5)) #Run multiproc
    >>> print results_dataframe.iloc[0].SNP,round(results_dataframe.iloc[0].PValue,7),len(results_dataframe)
    null_9800 0.0793397 4

    """

    #=================================================
    # Start of definition of inner functions
    #=================================================
    def _best_snps_for_each_chrom(chrom_list, input_files, runner, G, n_folds, seed, pheno, covar, force_full_rank, force_low_rank, mixing, h2, k_list, GB_goal):
        #logging.info("Doing GWAS_1K for each chrom and fold. Work_count={0}".format(len(chrom_list)*(n_folds+1)))

        max_k = int(max(k_list))
        assert np.array_equal(G.iid,pheno.iid) and np.array_equal(G.iid,covar.iid), "real assert"

        def mapper_find_best_given_chrom(test_chr):
            G_for_chrom = _K_per_chrom(G, test_chr, G.iid).snpreader
    
            def mapper_gather_lots(i_fold_and_pair):
                i_fold, (train_idx, test_idx) = i_fold_and_pair
                logging.info("Working on GWAS_1K and k search, chrom={0}, i_fold={1}".format(test_chr, i_fold))

                G_train = G_for_chrom[train_idx,:]

                #Precompute whole x whole standardized on train
                from fastlmm.association.single_snp import _internal_determine_block_size, _block_size_from_GB_goal
                min_count = _internal_determine_block_size(G_for_chrom, None, None, force_full_rank, force_low_rank)
                block_size = _block_size_from_GB_goal(GB_goal, G_for_chrom.iid_count, min_count)
                K_whole_unittrain = _SnpWholeWithTrain(whole=G_for_chrom,train_idx=train_idx, standardizer=Unit(), block_size=block_size).read()

                assert np.array_equal(K_whole_unittrain.iid,G_for_chrom.iid),"real assert"
                K_train = K_whole_unittrain[train_idx]
                    
                single_snp_result = single_snp(test_snps=G_train, K0=K_train, pheno=pheno, #iid intersection means when can give the whole covariate and pheno
                             covar=covar, leave_out_one_chrom=False,
                             GB_goal=GB_goal,  force_full_rank=force_full_rank, force_low_rank=force_low_rank, mixing=mixing, h2=h2)

                is_all = (i_fold == n_folds) if n_folds > 1 else True

                k_list_in =  [0] + [int(k) for k in k_list if 0 < k and k < len(single_snp_result)]

                if is_all:
                    top_snps = list(single_snp_result.SNP[:max_k])
                else:
                    top_snps = None

                if i_fold == n_folds:
                    k_index_to_nLL = None
                else:
                    k_index_to_nLL = []
                    for k in k_list_in:
                        top_k = G_for_chrom[:,G_for_chrom.sid_to_index(single_snp_result.SNP[:k])]
                        logging.info("Working on chr={0}, i_fold={1}, and K_{2}".format(test_chr,i_fold,k))

                        top_k_train = top_k[train_idx,:] if k > 0 else None
                        fastlmm = FastLMM(force_full_rank=force_full_rank, force_low_rank=force_low_rank,GB_goal=GB_goal)
                        fastlmm.fit(K0_train=K_train, K1_train=top_k_train, X=covar, y=pheno,mixing=mixing,h2=h2) #iid intersection means when can give the whole covariate and pheno
    
                        top_k_test = top_k[test_idx,:] if k > 0 else None
                        K0_whole_test = K_whole_unittrain[:,test_idx]
                        nLL = fastlmm.score(K0_whole_test=K0_whole_test,K1_whole_test=top_k_test,X=covar,y=pheno) #iid intersection means when can give the whole covariate and pheno
                        k_index_to_nLL.append(nLL)

                if i_fold > 0:
                    k_list_in = None
    
                return k_list_in, top_snps, k_index_to_nLL

            def reducer_find_best(top_snps_and_k_index_to_nLL_sequence):
                #Starts fold_index+all -> k_index -> nll
                #Need:  k_index -> sum(fold_index -> nll)

                k_index_to_sum_nll = None
                top_snps_all = None
                k_list_in_all = None
                for i_fold, (k_list_in, top_snps, k_index_to_nLL) in enumerate(top_snps_and_k_index_to_nLL_sequence):
                    if k_list_in is not None:
                        assert k_list_in_all is None, "real assert"
                        k_list_in_all = k_list_in
                        k_index_to_sum_nll = np.zeros(len(k_list_in))

                    if top_snps is not None:
                        assert top_snps_all is None, "real assert"
                        top_snps_all = top_snps

                    if k_index_to_nLL is not None:
                        assert i_fold < n_folds or n_folds == 1, "real assert"
                        for k_index, nLL in enumerate(k_index_to_nLL):
                            k_index_to_sum_nll[k_index] += nLL

                #find best # top_snps
                best_k = k_list_in_all[np.argmin(k_index_to_sum_nll)]
                logging.info("For chrom={0}, best_k={1}".format(test_chr,best_k))
                if do_plot: _nll_plot(k_list_in_all, k_index_to_sum_nll)

                #Return the top snps from all
                result = top_snps_all[:best_k]
                return result


            i_fold_index_to_top_snps_and_k_index_to_nLL = map_reduce(
                    _kfold(G_for_chrom.iid_count, n_folds, seed, end_with_all=True),
                    mapper=mapper_gather_lots,
                    reducer=reducer_find_best)
            return i_fold_index_to_top_snps_and_k_index_to_nLL

        chrom_index_to_best_sid = map_reduce(
                chrom_list,
                nested=mapper_find_best_given_chrom,
                input_files=input_files,
                name="best snps for each chrom",
                runner=runner)
        return chrom_index_to_best_sid


    def _gwas_2k_via_loo_chrom(test_snps, chrom_list, input_files, runner, G, chrom_index_to_best_sid, pheno, covar, force_full_rank, force_low_rank, mixing, h2, output_file_name, GB_goal):
        """
        Run a two-kernel ``single_snp`` scan for every chromosome in ``chrom_list`` and merge the results.

        For each chromosome, the test SNPs whose first ``pos`` column equals that chromosome are tested with
        ``K0`` set to ``_K_per_chrom(G, chrom, G.iid).snpreader`` (presumably G without that chromosome -- confirm
        against ``_K_per_chrom``) and ``K1`` set to the columns of that reader named by the matching entry of
        ``chrom_index_to_best_sid``. ``chrom_index_to_best_sid`` is indexed by position in ``chrom_list``.

        The per-chromosome frames are concatenated, sorted by "PValue" and re-indexed from 0. When
        ``output_file_name`` is not None the combined frame is also written there as a tab-separated file.

        :rtype: the combined frame returned by ``map_reduce`` through the reducer
        """
        logging.info("Doing GWAS_2K for each chrom. Work_count={0}".format(len(chrom_list)))

        def mapper_single_snp_2K_given_chrom(test_chr):
            # One unit of work: scan only the test SNPs that sit on ``test_chr``.
            logging.info("Working on chr={0}".format(test_chr))
            on_this_chrom = test_snps.pos[:,0]==test_chr
            test_snps_chrom = test_snps[:,on_this_chrom]
            G_for_chrom = _K_per_chrom(G, test_chr, G.iid).snpreader

            # The selected SNP ids are stored per position in chrom_list, not per chromosome value.
            best_sid = chrom_index_to_best_sid[chrom_list.index(test_chr)]
            best_sid_index = G_for_chrom.sid_to_index(best_sid)

            return single_snp(
                test_snps=test_snps_chrom,
                K0=G_for_chrom,
                K1=G_for_chrom[:,best_sid_index],
                pheno=pheno,
                covar=covar,
                leave_out_one_chrom=False,
                GB_goal=GB_goal,
                force_full_rank=force_full_rank,
                force_low_rank=force_low_rank,
                mixing=mixing,
                h2=h2)

        def reducer_closure(frame_sequence): #!!!very similar code in single_snp
            # Stack the per-chromosome results and rank them by p-value.
            combined = pd.concat(frame_sequence).sort_values(by="PValue")
            combined.index = np.arange(len(combined))

            if output_file_name is not None:
                combined.to_csv(output_file_name, sep="\t", index=False)

            logging.info("PhenotypeName\t{0}".format(pheno.sid[0]))
            logging.info("SampleSize\t{0}".format(G.iid_count))
            logging.info("SNPCount\t{0}".format(G.sid_count))
            return combined

        return map_reduce(
            chrom_list,
            mapper=mapper_single_snp_2K_given_chrom,
            reducer=reducer_closure,
            input_files=input_files,
            name="single_snp with two K's for all chroms",
            runner=runner)

    #=================================================
    # End of definition of inner functions
    #=================================================

    #!!!code similar to single_snp
    if force_full_rank and force_low_rank:
        raise Exception("Can't force both full rank and low rank")
    if k_list is None:
        k_list = np.logspace(start=0, stop=13, num=14, base=2)

    assert test_snps is not None, "test_snps must be given as input"
    test_snps = _snps_fixup(test_snps)
    G = _snps_fixup(G or test_snps)
    pheno = _pheno_fixup(pheno).read()
    assert pheno.sid_count == 1, "Expect pheno to be just one variable"
    pheno = pheno[(pheno.val==pheno.val)[:,0],:]
    covar = _pheno_fixup(covar, iid_if_none=pheno.iid)
    chrom_list = list(set(test_snps.pos[:,0])) # find the set of all chroms mentioned in test_snps, the main testing data
    G, test_snps, pheno, covar  = pstutil.intersect_apply([G, test_snps, pheno, covar])
    common_input_files = [test_snps, G, pheno, covar]

    chrom_index_to_best_sid = _best_snps_for_each_chrom(chrom_list, common_input_files, runner, G, n_folds, seed, pheno, covar, force_full_rank, force_low_rank, mixing, h2, k_list, GB_goal)

    frame = _gwas_2k_via_loo_chrom(test_snps, chrom_list, common_input_files, runner, G, chrom_index_to_best_sid, pheno, covar, force_full_rank, force_low_rank, mixing, h2, output_file_name, GB_goal)

    return frame
Exemplo n.º 58
0
    def fit(self, X=None, y=None, K0_train=None, K1_train=None, h2=None, mixing=None,count_A1=None):
        """
        Method for training a linear regression predictor. If the examples in X and y are not the same, they will be reordered and intersected.

        The covariates are standardized with ``self.covariate_standardizer``, a constant column of 1's is appended,
        and the weights are found by ordinary least squares (``np.linalg.lstsq``).

        :param X: training covariate information, optional: 
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
        :type X: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

        :param y: training phenotype:
          If you give a string, it should be the file name of a PLINK phenotype-formatted file.
          Must contain exactly one variable.
        :type y: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

        :param K0_train: Must be None. Represents the identity similarity matrix.
        :type K0_train: None

        :param K1_train: Must be None. Represents the identity similarity matrix.
        :type K1_train: None

        :param h2: Ignored. Optional.
        :type h2: number

        :param mixing: Ignored. Optional.
        :type mixing: number

        :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
             alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
        :type count_A1: bool

        :raises AssertionError: if ``K0_train`` or ``K1_train`` is given, if ``y`` is None, or if ``y`` has more than one variable.

        On success the following attributes are set: ``beta`` (least-squares weights, the last one belonging to the
        appended constant column), ``ssres`` (sum of squared residuals), ``sstot`` (total sum of squares of ``y``
        around its mean), ``covar_unit_trained``, ``iid_count``, ``covar_sid``, ``pheno_sid`` and ``is_fitted``.

        :rtype: self, the fitted Linear Regression predictor
        """
        assert K0_train is None # could also accept that ID or no snps
        assert K1_train is None # could also accept that ID or no snps

        assert y is not None, "y must be given"

        y = _pheno_fixup(y,count_A1=count_A1)
        assert y.sid_count == 1, "Expect y to be just one variable"
        X = _pheno_fixup(X, iid_if_none=y.iid,count_A1=count_A1)

        X, y  = intersect_apply([X, y])
        y = y.read()
        X, covar_unit_trained = X.read().standardize(self.covariate_standardizer,return_trained=True)

        # add a column of 1's to cov to increase DOF of model (and accuracy) by allowing a constant offset
        X = SnpData(iid=X.iid,
                    sid=FastLMM._new_snp_name(X),
                    val=np.c_[X.val,np.ones((X.iid_count,1))])

        target = y.val[:,0]
        lsqSol = np.linalg.lstsq(X.val, target,rcond=-1)
        bs = lsqSol[0]         #weights
        residuals = lsqSol[1]  #squared residuals, as a (1,) array, or empty (see below)

        if residuals.size == 1:
            ssres = float(residuals[0])
        else:
            # lstsq reports no residual when the design matrix is rank deficient or has
            # no more rows than columns, so compute the sum of squares directly.
            leftover = target - X.val.dot(bs)
            ssres = float(leftover.dot(leftover))

        self.beta = bs
        self.ssres = ssres
        self.sstot = ((y.val-y.val.mean())**2).sum()
        self.covar_unit_trained = covar_unit_trained
        self.iid_count = X.iid_count
        self.covar_sid = X.sid
        self.pheno_sid = y.sid
        # Only flag the model as fitted once every step above has succeeded.
        self.is_fitted = True
        return self