Example #1
def set_snps0(SNPs0, sample_size, i_exclude=None, forcefullrank=False, blocksize=10000):
    '''
    In the full-rank case, loads the SNPs in blocks and constructs the kernel.
    In the low-rank case, loads all SNPs into memory.
    '''
    if SNPs0 is None:
        return None, None
    if "K" in SNPs0:
        K0 = SNPs0["K"]
        G0 = None
    elif "data" in SNPs0:
        K0 = None
        G0 = SNPs0["data"]["snps"]
    else:        
        #full rank
        if len(SNPs0["snp_set"]) > sample_size or forcefullrank:  # N = Y.shape[0]
            SNPs0["K"] = psd.build_kernel_blocked(snpreader=SNPs0["reader"], snp_idx=SNPs0["snp_set"].to_index, blocksize=blocksize, allowlowrank=forcefullrank)
            K0 = SNPs0["K"]
            G0 = None
        else:
            #low rank            
            K0 = None
            SNPs0["data"] = SNPs0["snp_set"].read()
            SNPs0["data"]["snps"] = up.standardize(SNPs0["data"]["snps"])
            G0 = SNPs0["data"]["snps"]

    #lrt_up should never do exclusion, because set_snps0 should only get called once, in run_once,
    #without exclusion. So this is only for the score test and lrt.
    if i_exclude is not None:
        if K0 is not None:
            #Also note that in the full-rank case with exclusion, for score, one could in principle use low-rank updates
            #to make this faster when the number of excluded SNPs is small: it would be cubic in num_excluded for each of the
            #num_inner*num_outer iterations, versus now, where it is cubic in N, but only once in the outer loop
            K_up = psd.build_kernel_blocked(snpreader=SNPs0["reader"], snp_idx=np.array(SNPs0["snp_set"].to_index)[i_exclude], blocksize=blocksize,allowlowrank=forcefullrank)
            K0 = K0 - K_up
        elif G0 is not None:
            G0 = G0[:,~i_exclude]                        
        num_snps = SNPs0["num_snps"] - i_exclude.sum()
    else:
        num_snps = SNPs0["num_snps"]
    #intersect data?
        
    #normalize:
    if K0 is not None:
        K0 = K0 / num_snps  # alternative: K0.diagonal().mean()
    elif G0 is not None:
        G0 = G0 / np.sqrt(num_snps)  # alternative: np.sqrt((G0*G0).mean()), the sqrt of the mean of the diagonal of K=GG^T; "*" means pointwise multiplication
    return G0, K0
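The two normalization branches at the end are consistent with each other: since K = G G^T, dividing K0 by num_snps is equivalent to dividing G0 by sqrt(num_snps) before forming the kernel. A minimal standalone numpy sketch (toy data, independent of fastlmm) checking this equivalence:

import numpy as np

rng = np.random.RandomState(0)
G = rng.randn(6, 4)                    # toy data: 6 individuals x 4 SNPs
num_snps = G.shape[1]

K_norm = G.dot(G.T) / num_snps         # full-rank path: normalize the kernel
G_norm = G / np.sqrt(num_snps)         # low-rank path: normalize the factor

assert np.allclose(K_norm, G_norm.dot(G_norm.T))   # both paths agree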
Example #2
def computePC(file, filepath=None, numpc=[5]):
    if filepath is not None:
        fn = os.path.join(filepath, file)
    else:
        fn = file
    if isinstance(numpc, (int, float)):
        numpc = [numpc]
    alt_snpreader = Bed(fn)
    print("computing K")
    K = dist.build_kernel_blocked(fn, alt_snpreader=alt_snpreader)
    print("computing the eigenvalue decomposition of K")
    [s_all, u_all] = LA.eigh(K)
    # eigh returns eigenvalues in ascending order; reverse so the
    # leading principal components come first
    s_all = s_all[::-1]
    u_all = u_all[:, ::-1]
    for numpcs in numpc:
        print("saving %i PCs from %s" % (numpcs, fn))

        s = s_all[0:numpcs]
        u = u_all[:, 0:numpcs]
        # two id columns (the original iids) followed by the eigenvectors
        outu = np.zeros((u_all.shape[0], numpcs + 2), dtype="|S20")
        outu[:, 0:2] = alt_snpreader.original_iids
        outu[:, 2:] = u
        fnout = getEigvecs_fn(fn, numpcs)

        np.savetxt(fnout, outu, fmt="%s", delimiter="\t")
        fnout = "%s_pc%i.vals" % (fn, numpcs)
        np.savetxt(fnout, s, fmt="%.5f", delimiter="\t")
    return s_all, u_all
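The [::-1] reversals above are needed because numpy.linalg.eigh returns eigenvalues in ascending order. A standalone toy check (random data, independent of fastlmm):

import numpy as np
import numpy.linalg as LA

rng = np.random.RandomState(0)
X = rng.randn(10, 3)
K = X.dot(X.T)                  # small symmetric PSD matrix, standing in for the kernel

s, u = LA.eigh(K)               # eigenvalues come back in ascending order
s, u = s[::-1], u[:, ::-1]      # reverse so s[0], u[:, 0] is the leading component

assert np.all(np.diff(s) <= 0)                        # now sorted in descending order
assert np.allclose(K.dot(u[:, 0]), s[0] * u[:, 0])    # still a valid eigenpair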
Example #3
def create_feature_selection_distributable(snp_reader,
                                           phen_fn,
                                           pc_fn,
                                           num_pcs_kernel,
                                           output_prefix,
                                           cov_fn=None,
                                           include_all=True):

    import logging
    import numpy as np
    from fastlmm.feature_selection import FeatureSelectionStrategy
    import fastlmm.feature_selection.PerformSelectionDistributable as psd

    # set up parameters
    num_folds = 10
    random_state = 42
    num_snps_in_memory = 1000000

    ##############################
    num_steps_delta = 7
    num_steps_k = 7
    num_steps_mix = 7

    # log_2 space and all SNPs
    k_values = [
        int(k)
        for k in np.logspace(0, 10, base=2, num=num_steps_k, endpoint=True)
    ]
    if include_all:
        k_values.append(snp_reader.sid_count)
    delta_values = np.logspace(-5,
                               10,
                               endpoint=True,
                               num=num_steps_delta,
                               base=np.exp(1))

    if pc_fn is None:
        assert num_pcs_kernel == 0
        logging.info(
            "feature selection: no PCs specified, disabling loop over mixing parameter"
        )

    strategy = "insample_cv"
    select_by_ll = True

    # go!
    feature_selector = FeatureSelectionStrategy(
        snp_reader,
        phen_fn,
        num_folds,
        random_state=random_state,
        num_snps_in_memory=num_snps_in_memory,
        interpolate_delta=False,
        cov_fn=cov_fn)
    perform_selection_distributable = psd.PerformSelectionDistributable(
        feature_selector, k_values, delta_values, strategy, output_prefix,
        select_by_ll)

    return perform_selection_distributable
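For reference, the grids built above are small enough to inspect directly; this standalone snippet (numpy only) reproduces them:

import numpy as np

num_steps_k, num_steps_delta = 7, 7
k_values = [int(k) for k in np.logspace(0, 10, base=2, num=num_steps_k, endpoint=True)]
delta_values = np.logspace(-5, 10, endpoint=True, num=num_steps_delta, base=np.exp(1))

print(k_values)      # [1, 3, 10, 32, 101, 322, 1024]  (log_2-spaced from 2**0 to 2**10)
print(delta_values)  # exp([-5, -2.5, 0, 2.5, 5, 7.5, 10]), i.e. ~0.0067 up to ~22026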
Example #4
    def perform_selection(self, k_values, delta_values, strategy="lmm_full_cv", output_prefix=None, select_by_ll=False, runner=Local(), penalty=0.0, create_pdf=True):
        """Perform feature selection

        k_values : array-like, shape = [n_steps_k]
            Array of k values to test

        delta_values : array-like, shape = [n_steps_delta]
            Array of delta values to test

        strategy : {'lmm_full_cv', 'insample_cv'}
            Strategy to perform feature selection:

            - 'lmm_full_cv' perform cross-validation over grid of k and delta using LMM
              
            - 'insample_cv' perform cross-validation over grid of k, estimate delta in sample
              using maximum likelihood.

        output_prefix : str, optional, default=None
            Prefix for output files

        select_by_ll : bool, default=False
            If set to True, the negative log-likelihood will be used to select the best k and delta


        Returns
        -------
        best_k : int
            best subset size k

        best_delta : float
            best regularization parameter delta for ridge regression

        best_obj : float
            best objective at the optimum (MSE by default; nLL if the select_by_ll flag is set)

        best_snps : list[str]
            list of ids of best snps (univariate selection done on whole data set using best_k, best_delta)

        """
        with patch.dict('os.environ', {'ARRAY_MODULE': 'numpy'}):

            self.biggest_k = max(k_values)
        
            if strategy not in ("lmm_full_cv", "insample_cv"):
                raise ValueError("strategies other than lmm_full_cv and insample_cv are experimental!")

            perform_selection_distributable = psd.PerformSelectionDistributable(self, k_values, delta_values, strategy, output_prefix, select_by_ll, penalty=penalty, create_pdf=create_pdf)
            result = runner.run(perform_selection_distributable)
            return result
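A hypothetical end-to-end usage sketch assembled from the pieces above; the Bed import path, the file names, and the fold count are assumptions, not taken from the source:

import numpy as np
from pysnptools.snpreader import Bed   # assumed import path for the Bed reader seen in Example #2
from fastlmm.feature_selection import FeatureSelectionStrategy

snp_reader = Bed("mydata")             # hypothetical bed/bim/fam file prefix
fss = FeatureSelectionStrategy(snp_reader, "pheno.txt", 10, random_state=42, interpolate_delta=False)

k_values = [0, 1, 5, 10, 100, 500]                          # subset sizes to test
delta_values = np.logspace(-3, 3, num=5, base=np.exp(1))    # regularization grid
best_k, best_delta, best_obj, best_snps = fss.perform_selection(
    k_values, delta_values, strategy="insample_cv", select_by_ll=True, create_pdf=False)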
Example #5
    def blocking(self, snpreader, cov_fn=None, num_pcs=0, output_prefix=None, strategy="lmm_full_cv"):
        """
        Compare six different cases.

        To control memory use, we've introduced a parameter called "num_snps_in_memory", which defaults to 100000.
        Here are the interesting cases to consider (choose num_snps_in_memory accordingly); a small decision-rule
        sketch of these regimes follows after this test:

        1) num_snps_in_memory > total_num_snps

           In this case, the same code as before should be
           executed (except the kernel matrix on all SNPs is now cached).

        2) num_snps_in_memory < total_num_snps and
           num_snps_in_memory > k (excluding all_snps)

           Here, the linear regression will be blocked,
           while the data for cross-validation is cached,
           saving time for loading and re-indexing.

        3) num_snps_in_memory < total_num_snps and
           num_snps_in_memory < k (excluding all_snps)

           Finally, both operations - linear regression
           and building the kernel - will be blocked.

        4,5,6) Same as #1,2,3, but with a pheno file that has extra iids and whose iids are shuffled.
        """

        # set up grid
        ##############################
        num_steps_delta = 5
        num_folds = 2

        # roughly log-spaced k values; the largest (10000) stands in for "all SNPs"
        k_values = [0, 1, 5, 10, 100, 500, 700, 10000]
        delta_values = np.logspace(-3, 3, endpoint=True, num=num_steps_delta, base=np.exp(1))
        
        random_state = 42


        # case 1
        fss_1 = FeatureSelectionStrategy(snpreader, self.pheno_fn, num_folds, cov_fn=cov_fn, random_state=random_state, num_pcs=num_pcs, interpolate_delta=True, num_snps_in_memory=20000,count_A1=False)
        best_k_1, best_delta_1, best_obj_1, best_snps_1 = fss_1.perform_selection(k_values, delta_values, output_prefix=output_prefix, select_by_ll=True, strategy=strategy,create_pdf=False)

        # some misc testing
        from fastlmm.feature_selection import PerformSelectionDistributable as psd
        perform_selection_distributable = psd.PerformSelectionDistributable(fss_1, k_values, delta_values, strategy, output_prefix, select_by_ll=True, penalty=0.0, create_pdf=False)
        self.assertEqual(perform_selection_distributable.work_count, 3)
        # smoke-test the string representations
        s = perform_selection_distributable.tempdirectory
        s = str(perform_selection_distributable)
        s = "%r" % perform_selection_distributable
        from fastlmm.feature_selection.feature_selection_cv import GClass
        s = "%r" % GClass.factory(snpreader, 1000000, Unit(), 50, count_A1=False)
        #!!TODO: make a test for each break point.


        # case 2
        fss_2 = FeatureSelectionStrategy(snpreader, self.pheno_fn, num_folds, cov_fn=cov_fn, random_state=random_state, num_pcs=num_pcs, interpolate_delta=True, num_snps_in_memory=5000,count_A1=False)
        best_k_2, best_delta_2, best_obj_2, best_snps_2 = fss_2.perform_selection(k_values, delta_values, output_prefix=output_prefix, select_by_ll=True, strategy=strategy,create_pdf=False)

        # case 3
        fss_3 = FeatureSelectionStrategy(snpreader, self.pheno_fn, num_folds, cov_fn=cov_fn, random_state=random_state, num_pcs=num_pcs, interpolate_delta=True, num_snps_in_memory=600,count_A1=False)
        best_k_3, best_delta_3, best_obj_3, best_snps_3 = fss_3.perform_selection(k_values, delta_values, output_prefix=output_prefix, select_by_ll=True, strategy=strategy,create_pdf=False)

        # case 4
        fss_4 = FeatureSelectionStrategy(snpreader, self.pheno_shuffleplus_fn, num_folds, cov_fn=cov_fn, random_state=random_state, num_pcs=num_pcs, interpolate_delta=True, num_snps_in_memory=20000,count_A1=False)
        best_k_4, best_delta_4, best_obj_4, best_snps_4 = fss_4.perform_selection(k_values, delta_values, output_prefix=output_prefix, select_by_ll=True, strategy=strategy,create_pdf=False)

        # case 5
        fss_5 = FeatureSelectionStrategy(snpreader, self.pheno_shuffleplus_fn, num_folds, cov_fn=cov_fn, random_state=random_state, num_pcs=num_pcs, interpolate_delta=True, num_snps_in_memory=5000,count_A1=False)
        best_k_5, best_delta_5, best_obj_5, best_snps_5 = fss_5.perform_selection(k_values, delta_values, output_prefix=output_prefix, select_by_ll=True, strategy=strategy,create_pdf=False)

        # case 6
        fss_6 = FeatureSelectionStrategy(snpreader, self.pheno_shuffleplus_fn, num_folds, cov_fn=cov_fn, random_state=random_state, num_pcs=num_pcs, interpolate_delta=True, num_snps_in_memory=600,count_A1=False)
        best_k_6, best_delta_6, best_obj_6, best_snps_6 = fss_6.perform_selection(k_values, delta_values, output_prefix=output_prefix, select_by_ll=True, strategy=strategy,create_pdf=False)

        self.assertEqual(int(best_k_1), int(best_k_2))
        self.assertEqual(int(best_k_1), int(best_k_3))
        #self.assertEqual(int(best_k_1), int(best_k_4))
        #self.assertEqual(int(best_k_1), int(best_k_5))
        #self.assertEqual(int(best_k_1), int(best_k_6))
        self.assertAlmostEqual(best_obj_1, best_obj_2)
        self.assertAlmostEqual(best_obj_1, best_obj_3)
        #self.assertAlmostEqual(best_obj_1, best_obj_4)
        self.assertAlmostEqual(best_obj_4, best_obj_5)
        self.assertAlmostEqual(best_obj_4, best_obj_6)

        if strategy != "insample_cv":
            self.assertAlmostEqual(best_delta_1, best_delta_2)
            self.assertAlmostEqual(best_delta_1, best_delta_3)
            #self.assertAlmostEqual(best_delta_1, best_delta_4)
            self.assertAlmostEqual(best_delta_4, best_delta_5)
            self.assertAlmostEqual(best_delta_4, best_delta_6)
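The three memory regimes described in the blocking docstring reduce to a small decision rule. This is an illustrative sketch only (plain Python, not fastlmm code; the total_num_snps value in the checks is assumed):

def blocking_regime(num_snps_in_memory, total_num_snps, k):
    # Case 1: everything fits in memory; no blocking, the full kernel is cached.
    if num_snps_in_memory > total_num_snps:
        return "no blocking"
    # Case 2: the top-k SNPs fit but the full set does not; only the linear
    # regression is blocked, while the cross-validation data stays cached.
    if num_snps_in_memory > k:
        return "blocked linear regression, cached CV data"
    # Case 3: not even the top-k SNPs fit; both the linear regression and the
    # kernel construction are blocked.
    return "blocked linear regression and blocked kernel"

# mirrors cases 1-3 above, assuming ~10000 total SNPs and k = 700
assert blocking_regime(20000, 10000, 700) == "no blocking"
assert blocking_regime(5000, 10000, 700) == "blocked linear regression, cached CV data"
assert blocking_regime(600, 10000, 700) == "blocked linear regression and blocked kernel"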