Example #1
File: test.py Project: hyacz/PySnpTools
    def test_diagKtoN(self):
        """
        make sure standardization on SNPs results in sum(diag(K))=N
        """

        np.random.seed(42)
        m = np.random.random((100, 1000))
        from pysnptools.standardizer import DiagKtoN
        s = DiagKtoN()
        s.standardize(m)
        K = m.dot(m.T)
        sum_diag = np.sum(np.diag(K))

        np.testing.assert_almost_equal(100, sum_diag)
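DiagKtoN's effect in this test can be reproduced by hand: since trace(m·mᵀ) equals the sum of the squared entries of m, scaling m by sqrt(N / trace) forces sum(diag(K)) = N. A minimal sketch of that computation (the explicit scale factor is an inference from the test's assertion, not a quote of the library's internals):

import numpy as np

np.random.seed(42)
m = np.random.random((100, 1000))

# trace(m.dot(m.T)) == (m ** 2).sum(), so scaling by sqrt(N / trace)
# makes the kernel's diagonal sum to N == 100.
N = m.shape[0]
m_scaled = m * np.sqrt(N / (m ** 2).sum())

K = m_scaled.dot(m_scaled.T)
np.testing.assert_almost_equal(np.sum(np.diag(K)), N)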
Example #3
File: test.py Project: hyacz/PySnpTools
    def test_some_std(self):
        k0 = self.snpdata.read_kernel(standardizer=Unit()).val
        from pysnptools.kernelreader import SnpKernel
        k1 = self.snpdata.read_kernel(standardizer=Unit())
        np.testing.assert_array_almost_equal(k0, k1.val, decimal=10)

        from pysnptools.snpreader import SnpData
        snpdata2 = SnpData(iid=self.snpdata.iid,
                           sid=self.snpdata.sid,
                           pos=self.snpdata.pos,
                           val=np.array(self.snpdata.val))
        s = str(snpdata2)
        snpdata2.standardize()
        s = str(snpdata2)

        snpreader = Bed(self.currentFolder + "/examples/toydata",
                        count_A1=False)
        k2 = snpreader.read_kernel(standardizer=Unit(), block_size=500).val
        np.testing.assert_array_almost_equal(k0, k2, decimal=10)

        from pysnptools.standardizer.identity import Identity
        from pysnptools.standardizer.diag_K_to_N import DiagKtoN
        for dtype in [sp.float64, sp.float32]:
            for std in [Unit(), Beta(1, 25), Identity(), DiagKtoN()]:
                s = str(std)
                np.random.seed(0)
                x = np.array(np.random.randint(3, size=[60, 100]), dtype=dtype)
                x2 = x[:, ::2]
                x2b = np.array(x2)
                #LATER what's this about? It doesn't do non-contiguous?
                #assert not x2.flags['C_CONTIGUOUS'] and not x2.flags['F_CONTIGUOUS'] #set up to test non contiguous
                #assert x2b.flags['C_CONTIGUOUS'] or x2b.flags['F_CONTIGUOUS'] #set up to test non contiguous
                #a,b = std.standardize(x2b),std.standardize(x2)
                #np.testing.assert_array_almost_equal(a,b)
        logging.info("done")
Example #4
 def __init__(self,
              GB_goal=None,
              force_full_rank=False,
              force_low_rank=False,
              snp_standardizer=Unit(),
              covariate_standardizer=Unit(),
              kernel_standardizer=DiagKtoN()):
     self.GB_goal = GB_goal
     self.force_full_rank = force_full_rank
     self.force_low_rank = force_low_rank
     self.snp_standardizer = snp_standardizer
     self.covariate_standardizer = covariate_standardizer
     self.kernel_standardizer = kernel_standardizer
     self.is_fitted = False
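This constructor matches the FastLMM estimator's signature from the fastlmm package; a hypothetical instantiation under that assumption (the import path and the GB_goal value are illustrative):

from pysnptools.standardizer import Unit, DiagKtoN
from fastlmm.inference import FastLMM  # assumption: the snippet above is FastLMM.__init__

est = FastLMM(GB_goal=2,                       # target working-memory budget, in GB
              force_low_rank=True,             # force the low-rank code path
              snp_standardizer=Unit(),         # per-SNP unit standardization
              kernel_standardizer=DiagKtoN())  # rescale kernel so sum(diag(K)) == N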
Example #5
def _internal_single(K0, test_snps, pheno, covar, K1,
                 mixing, h2, log_delta,
                 cache_file, force_full_rank, force_low_rank,
                 output_file_name, block_size, interact_with_snp, runner):
    assert K0 is not None, "real assert"
    assert K1 is not None, "real assert"
    assert block_size is not None, "real assert"
    assert mixing is None or 0.0 <= mixing <= 1.0
    if force_full_rank and force_low_rank:
        raise Exception("Can't force both full rank and low rank")

    assert h2 is None or log_delta is None, "if h2 is specified, log_delta may not be specified"
    if log_delta is not None:
        h2 = 1.0/(np.exp(log_delta)+1)

    covar = np.c_[covar.read(view_ok=True, order='A').val, np.ones((test_snps.iid_count, 1))]  #view_ok because np.c_ will allocate new memory

    y = pheno.read(view_ok=True, order='A').val  #view_ok because this code already did a fresh read to look for any missing values

    if cache_file is not None and os.path.exists(cache_file):
        lmm = lmm_cov(X=covar, Y=y, G=None, K=None)
        with np.load(cache_file) as data: #!! similar code in epistasis
            lmm.U = data['arr_0']
            lmm.S = data['arr_1']
            h2 = data['arr_2'][0]
            mixing = data['arr_2'][1]
    else:
        K, h2, mixer = _Mixer.combine_the_best_way(K0, K1, covar, y, mixing, h2, force_full_rank=force_full_rank, force_low_rank=force_low_rank, kernel_standardizer=DiagKtoN())
        mixing = mixer.mixing

        if mixer.do_g:
            lmm = lmm_cov(X=covar, Y=y, K=None, G=K.snpreader.val, inplace=True)
        else:
            #print(covar.sum(),y.sum(),K.val.sum(),covar[0],y[0],K.val[0,0])
            lmm = lmm_cov(X=covar, Y=y, K=K.val, G=None, inplace=True)

        if h2 is None:
            result = lmm.findH2()
            h2 = result['h2']
        logging.info("h2={0}".format(h2))

        if cache_file is not None and not os.path.exists(cache_file):
            pstutil.create_directory_if_necessary(cache_file)
            lmm.getSU()
            np.savez(cache_file, lmm.U, lmm.S, np.array([h2, mixing]))  #using np.savez instead of pickle because it seems to be faster to read and write

    if interact_with_snp is not None:
        logging.info("interaction with %i" % interact_with_snp)
        assert 0 <= interact_with_snp and interact_with_snp < covar.shape[1]-1, "interact_with_snp is out of range"
        interact = covar[:,interact_with_snp].copy()
        interact -= interact.mean()
        interact /= interact.std()
    else:
        interact = None

    work_count = -(test_snps.sid_count // -block_size) #Find the work count based on batch size (rounding up)

    # We define three closures, that is, functions defined inside a function so that the inner functions have access to the local variables of the outer function.
    def debatch_closure(work_index):
        return test_snps.sid_count * work_index // work_count

    def mapper_closure(work_index):
        if work_count > 1: logging.info("single_snp: Working on snp block {0} of {1}".format(work_index,work_count))
        do_work_time = time.time()
        start = debatch_closure(work_index)
        end = debatch_closure(work_index+1)

        snps_read = test_snps[:,start:end].read().standardize()
        if interact_with_snp is not None:
            variables_to_test = snps_read.val * interact[:,np.newaxis]
        else:
            variables_to_test = snps_read.val
        res = lmm.nLLeval(h2=h2, dof=None, scale=1.0, penalty=0.0, snps=variables_to_test)

        beta = res['beta']
        
        chi2stats = beta*beta/res['variance_beta']
        #p_values = stats.chi2.sf(chi2stats,1)[:,0]
        assert test_snps.iid_count == lmm.U.shape[0]
        p_values = stats.f.sf(chi2stats, 1, lmm.U.shape[0] - (lmm.linreg.D + 1))[:, 0]  #note that lmm.U.shape[0] is the number of individuals

        dataframe = _create_dataframe(snps_read.sid_count)
        dataframe['sid_index'] = np.arange(start,end)
        dataframe['SNP'] = snps_read.sid
        dataframe['Chr'] = snps_read.pos[:,0]
        dataframe['GenDist'] = snps_read.pos[:,1]
        dataframe['ChrPos'] = snps_read.pos[:,2] 
        dataframe['PValue'] = p_values
        dataframe['SnpWeight'] = beta[:,0]
        dataframe['SnpWeightSE'] = np.sqrt(res['variance_beta'][:,0])
        dataframe['SnpFractVarExpl'] = np.sqrt(res['fraction_variance_explained_beta'][:,0])
        dataframe['Mixing'] = np.zeros((snps_read.sid_count)) + mixing
        dataframe['Nullh2'] = np.zeros((snps_read.sid_count)) + h2

        logging.info("time={0}".format(time.time()-do_work_time))

        #logging.info(dataframe)
        return dataframe

    def reducer_closure(result_sequence):
        if output_file_name is not None:
            create_directory_if_necessary(output_file_name)

        frame = pd.concat(result_sequence)
        frame.sort_values(by="PValue", inplace=True)
        frame.index = np.arange(len(frame))

        if output_file_name is not None:
            frame.to_csv(output_file_name, sep="\t", index=False)

        return frame

    frame = map_reduce(xrange(work_count),
                       mapper=mapper_closure,reducer=reducer_closure,
                       input_files=[test_snps],output_files=[output_file_name],
                       name="single_snp(output_file={0})".format(output_file_name),
                       runner=runner)
    return frame
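The h2/log_delta handling at the top of _internal_single implies a one-to-one mapping between the two parameterizations; a small worked check of the conversion and its inverse (the function names are illustrative):

import numpy as np

def log_delta_to_h2(log_delta):
    # as in the code above: h2 = 1 / (exp(log_delta) + 1)
    return 1.0 / (np.exp(log_delta) + 1)

def h2_to_log_delta(h2):
    # inverse: exp(log_delta) = (1 - h2) / h2
    return np.log((1.0 - h2) / h2)

assert np.isclose(log_delta_to_h2(0.0), 0.5)   # log_delta = 0 <-> h2 = 0.5
assert np.isclose(h2_to_log_delta(0.5), 0.0)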
Example #6
    f_handle.write("\n")


#for i in range(1, 5):

for i in range(1, res.shape[0]+1):
    ro.globalenv['i'] = i
    #keep_list2 = ro.r('keep_list2<-c(snp_list[i], keep_list)')
    #keep_list2 = ro.r('keep_list2<-c(snp_list2[snp_list2[,1]==snp_list2[snp_list2[,3]==snp_list[i],][1],3], keep_list)')
    keep_list2 = ro.r('keep_list2<-c(snp_list[i], colnames(X_data)[which(colnames(X_data)==snp_list[i])+1],colnames(X_data)[which(colnames(X_data)==snp_list[i])-1], keep_list)')
    G1 = np.array(ro.r('XX<-as.matrix(X_data[,colnames(X_data)%in%keep_list2])/(sqrt(length(keep_list2)))'))
    
    #norm_factor = 1./np.sqrt((G1**2).sum() / float(G1.shape[0]))
    #G1_standardized_val = norm_factor * G1
    from pysnptools.standardizer import DiagKtoN
    G1_standardized_val = DiagKtoN(G1.shape[0]).standardize(G1)
    #G1_standardized_val = G1
    
    lmmB = lmm
    W = G1_standardized_val.copy()
    UGup,UUGup = lmmB.rotate(W)
    i_up = np.zeros((W.shape[1]), dtype=np.bool)
    i_G1 = np.ones((W.shape[1]), dtype=np.bool)
    result = lmmB.findH2_2K(nGridH2=10, minH2=0.0, maxH2=0.99999, i_up=i_up, i_G1=i_G1, UW=UGup, UUW=UUGup)
    m2 = result['nLL'][0]*-1
    if result['h2'] > -1:
        h2 = result['h2']
    else:
        h2 = result['h2'][0]
    if result['h2_1'] > -1:
        h2_1 = result['h2_1']
Example #7
def compute_core(input_tuple):
    """
    Leave-two-chromosome-out evaluation scheme:
    Chr1: no causals, used for T1-error evaluation
    Chr2: has causals, not conditioned on, used for power evaluation
    Rest: has causals, conditioned on
    
      T1   Pow  [     cond     ] 
    ===== ===== ===== .... =====
            x x   x x      xx
    
    """

    methods, snp_fn, eigen_fn, num_causal, num_pcs, seed, sim_id = input_tuple

    # partially load bed file
    from pysnptools.snpreader import Bed
    snp_reader = Bed(snp_fn)

    # determine indices for generation and evaluation
    ##################################################################
    chr1_idx, chr2_idx, rest_idx = split_data_helper.split_chr1_chr2_rest(
        snp_reader.pos)

    causal_candidates_idx = np.concatenate((chr2_idx, rest_idx))
    # only compute t1-error (condition on all chr with causals on them)
    #causal_candidates_idx = rest_idx
    test_idx = np.concatenate((chr1_idx, chr2_idx))

    if seed is not None:
        np.random.seed(int(seed % sys.maxint))

    causal_idx = np.random.permutation(causal_candidates_idx)[0:num_causal]

    # generate phenotype
    ###################################################################
    genetic_var = 0.5
    noise_var = 0.5

    y = generate_phenotype(
        Bed(snp_fn).read(order='C').standardize(), causal_idx, genetic_var,
        noise_var)
    y.flags.writeable = False

    ############### only alter part until here --> modularize this

    # load pcs
    ###################################################################
    logging.info("loading eigendecomp from file %s" % eigen_fn)
    eig_dec = load(eigen_fn)
    G_pc = eig_dec["pcs"]
    G_pc.flags.writeable = False

    G_pc_ = G_pc[:, 0:num_pcs]
    G_pc_norm = DiagKtoN(G_pc_.shape[0]).standardize(G_pc_.copy())
    G_pc_norm.flags.writeable = False

    # run feature selection
    #########################################################

    # generate pheno data structure
    pheno = {"iid": snp_reader.iid, "vals": y, "header": []}
    covar = {"iid": snp_reader.iid, "vals": G_pc_norm, "header": []}

    # subset readers
    G0 = snp_reader[:, rest_idx]
    test_snps = snp_reader[:, test_idx]

    result = {}
    fs_result = {}

    # additional methods can be defined and included in the benchmark
    for method_function in methods:
        result_, fs_result_ = method_function(test_snps, pheno, G0, covar)
        result.update(result_)
        fs_result.update(fs_result_)

    # save indices
    indices = {
        "causal_idx": causal_idx,
        "chr1_idx": chr1_idx,
        "chr2_idx": chr2_idx,
        "input_tuple": input_tuple,
        "fs_result": fs_result
    }
    #test_idx

    return result, indices
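compute_core delegates the chromosome split to split_data_helper.split_chr1_chr2_rest, which is not shown. A plausible sketch, assuming pos[:, 0] holds the chromosome number (as in the dataframes of the earlier examples) and that "Chr1"/"Chr2" in the docstring are the two lowest-numbered chromosomes present:

import numpy as np

def split_chr1_chr2_rest(pos):
    chrs = np.unique(pos[:, 0])
    chr1, chr2 = chrs[0], chrs[1]
    chr1_idx = np.flatnonzero(pos[:, 0] == chr1)  # no causals: T1-error evaluation
    chr2_idx = np.flatnonzero(pos[:, 0] == chr2)  # causals, not conditioned on: power
    rest_idx = np.flatnonzero(pos[:, 0] > chr2)   # causals, conditioned on
    return chr1_idx, chr2_idx, rest_idx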
Example #8
def _internal_single(
        G0_standardized,
        test_snps,
        pheno,
        covar,
        G1_standardized,
        mixing,  #!!test mixing and G1
        h2,
        log_delta,
        cache_file,
        interact_with_snp=None):

    assert h2 is None or log_delta is None, "if h2 is specified, log_delta may not be specified"
    if log_delta is not None:
        h2 = 1.0 / (np.exp(log_delta) + 1)

    covar = np.hstack((covar['vals'], np.ones(
        (test_snps.iid_count, 1))))  #We always add 1's to the end.
    y = pheno['vals']

    from pysnptools.standardizer import DiagKtoN

    assert mixing is None or 0.0 <= mixing <= 1.0

    if cache_file is not None and os.path.exists(cache_file):
        lmm = fastLMM(X=covar, Y=y, G=None, K=None)
        with np.load(cache_file) as data:  #!! similar code in epistasis
            lmm.U = data['arr_0']
            lmm.S = data['arr_1']
    else:
        # combine the two kernels (normalize each so that sum(diag(K)) = N)
        G0_standardized_val = DiagKtoN(
            G0_standardized.val.shape[0]).standardize(G0_standardized.val)
        G1_standardized_val = DiagKtoN(
            G1_standardized.val.shape[0]).standardize(G1_standardized.val)

        if mixing == 0.0 or G1_standardized.sid_count == 0:
            G = G0_standardized.val
        elif mixing == 1.0 or G0_standardized.sid_count == 0:
            G = G1_standardized.val
        else:
            G = np.empty(
                (G0_standardized.iid_count,
                 G0_standardized.sid_count + G1_standardized.sid_count))
            if mixing is None:
                mixing, h2 = _find_mixing(G, covar, G0_standardized_val,
                                          G1_standardized_val, h2, y)
            _mix(G, G0_standardized_val, G1_standardized_val, mixing)

        #TODO: make sure low-rank case is handled correctly
        lmm = fastLMM(X=covar, Y=y, G=G, K=None, inplace=True)

    if h2 is None:
        result = lmm.findH2()
        h2 = result['h2']
    logging.info("h2={0}".format(h2))

    snps_read = test_snps.read().standardize()

    if interact_with_snp is not None:
        print "interaction with %i" % interact_with_snp
        interact = covar[:, interact_with_snp]
        interact -= interact.mean()
        interact /= interact.std()
        variables_to_test = snps_read.val * interact[:, np.newaxis]
    else:
        variables_to_test = snps_read.val
    res = lmm.nLLeval(h2=h2,
                      dof=None,
                      scale=1.0,
                      penalty=0.0,
                      snps=variables_to_test)

    if cache_file is not None and not os.path.exists(cache_file):
        pstutil.create_directory_if_necessary(cache_file)
        np.savez(cache_file, lmm.U, lmm.S)  #using np.savez instead of pickle because it seems to be faster to read and write

    beta = res['beta']

    chi2stats = beta * beta / res['variance_beta']
    #p_values = stats.chi2.sf(chi2stats,1)[:,0]
    if G0_standardized is not None:
        assert G.shape[0] == lmm.U.shape[0]
    p_values = stats.f.sf(chi2stats, 1, lmm.U.shape[0] - 3)[:, 0]  #note that lmm.U.shape[0] is the number of individuals and 3 is the number of fixed effects (covariates + SNP)

    items = [('SNP', snps_read.sid), ('Chr', snps_read.pos[:, 0]),
             ('GenDist', snps_read.pos[:, 1]), ('ChrPos', snps_read.pos[:, 2]),
             ('PValue', p_values), ('SnpWeight', beta[:, 0]),
             ('SnpWeightSE', np.sqrt(res['variance_beta'][:, 0])),
             ('SnpFractVarExpl',
              np.sqrt(res['fraction_variance_explained_beta'][:, 0])),
             ('Nullh2', np.zeros((snps_read.sid_count)) + h2)]
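    # pd.DataFrame.from_items is the older pandas API (deprecated and later removed);
    # on current pandas, pd.DataFrame(dict(items)) with this insertion order is equivalent.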
    frame = pd.DataFrame.from_items(items)

    return frame
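The kernel-mixing helpers _find_mixing and _mix are called but not shown; given how G is preallocated above, _mix presumably fills G with both standardized SNP sets scaled so that the resulting kernel is a convex combination of K0 and K1. A sketch under that assumption:

import numpy as np

def _mix(G, G0_standardized_val, G1_standardized_val, mixing):
    # Fill the preallocated G so that
    # G.dot(G.T) == (1 - mixing) * K0 + mixing * K1.
    sid0 = G0_standardized_val.shape[1]
    G[:, :sid0] = np.sqrt(1.0 - mixing) * G0_standardized_val
    G[:, sid0:] = np.sqrt(mixing) * G1_standardized_val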