Exemplo n.º 1
0
def generate_test_data_w_sum_stats(h2=0.5, n=100000, n_sample=100, m=50000, model='gaussian',
                                   p=1.0, conseq_r2=0, m_ld_chunk_size=100):
    """
    Simulate effect estimates together with a matching test data set.

    A sample LD matrix is requested from ``genotypes.get_sample_D`` and passed,
    along with all simulation parameters, to ``simulate_beta_hats``.  Test
    genotypes are then simulated with the same ``conseq_r2`` and
    ``m_ld_chunk_size`` settings, and a test phenotype is assembled from them
    using the ``'betas'`` entry of the simulated dictionary.

    Returns the dictionary produced by ``simulate_beta_hats`` with two extra
    entries:
      'test_snps' -- the simulated test genotypes
      'test_phen' -- genetic component (rescaled to sample variance ``h2``)
                     plus Gaussian noise (rescaled to sample variance
                     ``1 - h2``)
    """
    # LD settings that are shared by the simulation calls below.
    ld_settings = {'conseq_r2': conseq_r2, 'm_ld_chunk_size': m_ld_chunk_size}

    # Sample LD matrix.
    # NOTE(review): the leading 200 is presumably the LD sample size -- confirm
    # against genotypes.get_sample_D.
    ld_sample = genotypes.get_sample_D(200, conseq_r2=conseq_r2, m=m_ld_chunk_size)

    # Effect estimates (and the true betas used further down).
    sim = simulate_beta_hats(h2=h2, n=n, n_sample=n_sample, m=m, model=model, p=p,
                             D_sample=ld_sample, **ld_settings)

    # Test genotypes.
    geno_test = genotypes.simulate_genotypes_w_ld(n_sample=n_sample, m=m, **ld_settings)
    sim['test_snps'] = geno_test

    # Environmental noise, rescaled so its sample variance is exactly 1 - h2.
    env_var = 1.0 - h2
    noise = stats.norm.rvs(0, sp.sqrt(env_var), size=n_sample)
    noise = sp.sqrt(env_var / sp.var(noise)) * noise

    # Genetic component, rescaled so its sample variance is exactly h2.
    genetic = sp.dot(geno_test.T, sim['betas'])
    genetic = sp.sqrt(h2 / sp.var(genetic)) * genetic

    sim['test_phen'] = genetic + noise
    return sim
Exemplo n.º 2
0
def generate_test_data_w_sum_stats(h2=0.5,
                                   n=100000,
                                   n_sample=100,
                                   m=50000,
                                   model='gaussian',
                                   p=1.0,
                                   conseq_r2=0,
                                   m_ld_chunk_size=100):
    """
    Build simulated summary statistics plus test genotypes and phenotypes.

    Steps, in order:
      1. obtain a sample LD matrix from ``genotypes.get_sample_D``;
      2. call ``simulate_beta_hats`` with that matrix and the given
         parameters;
      3. simulate test genotypes with ``genotypes.simulate_genotypes_w_ld``;
      4. form a test phenotype as (genetic part with sample variance ``h2``)
         + (Gaussian noise with sample variance ``1 - h2``), where the
         genetic part uses the ``'betas'`` entry returned in step 2.

    The dictionary from step 2 is returned with ``'test_snps'`` and
    ``'test_phen'`` added.
    """

    def _with_variance(values, target_var):
        # Rescale ``values`` so that their sample variance equals target_var.
        return sp.sqrt(target_var / sp.var(values)) * values

    # Step 1: LD sample matrix.
    # NOTE(review): meaning of the hard-coded 200 is not visible here --
    # presumably a sample size; verify in genotypes.get_sample_D.
    ld_matrix = genotypes.get_sample_D(200,
                                       conseq_r2=conseq_r2,
                                       m=m_ld_chunk_size)

    # Step 2: simulated effect estimates.
    out = simulate_beta_hats(h2=h2,
                             n=n,
                             n_sample=n_sample,
                             m=m,
                             model=model,
                             p=p,
                             conseq_r2=conseq_r2,
                             m_ld_chunk_size=m_ld_chunk_size,
                             D_sample=ld_matrix)

    # Step 3: test genotypes.
    out['test_snps'] = genotypes.simulate_genotypes_w_ld(
        n_sample=n_sample,
        m=m,
        conseq_r2=conseq_r2,
        m_ld_chunk_size=m_ld_chunk_size)

    # Step 4: test phenotype (noise is drawn before the genetic part is
    # computed, matching the order of random draws expected by callers).
    noise = _with_variance(
        stats.norm.rvs(0, sp.sqrt(1.0 - h2), size=n_sample), 1.0 - h2)
    genetic = _with_variance(sp.dot(out['test_snps'].T, out['betas']), h2)
    out['test_phen'] = genetic + noise
    return out
Exemplo n.º 3
0
def simulate_traits(n=1000, m=100, hdf5_file_prefix=None, hdf5_group=None,
                    num_traits=1000, h2=0.5, effect_prior='gaussian', p=1.0,
                    conseq_ld=0, overwrite_hdf5=False, test_n=1000, simulate_validation_traits=True):
    """
    Simulate traits:
    First simulate SNPs, then simulate the traits.

    One genotype matrix is simulated and reused for all traits.  For each of
    the ``num_traits`` traits:
      * effect sizes are drawn from the chosen prior for the first
        ``round(m * p)`` SNPs (all ``m`` SNPs when ``p == 1.0``), the rest are
        set to zero, and the vector is rescaled to variance ``h2 / m``;
      * the phenotype is the genetic part (rescaled to sample variance ``h2``)
        plus Gaussian noise (rescaled to sample variance ``1 - h2``);
      * marginal effect estimates are computed as ``phen . snps / n``.

    Parameters:
      n, m               -- number of individuals / SNPs to simulate.
      hdf5_file_prefix   -- if given, results are written to
                            '<prefix>_p_<p>.hdf5'.
      hdf5_group         -- used for storage only when no file prefix is given.
      num_traits         -- number of traits to simulate.
      h2                 -- target heritability of each trait.
      effect_prior       -- 'gaussian' or 'laplace'.
      p                  -- fraction of SNPs with non-zero effect.
      conseq_ld          -- forwarded as ``ld`` to the genotype simulation.
      overwrite_hdf5     -- remove an existing output file before writing.
      test_n, simulate_validation_traits -- accepted but not used here.

    Returns the list of simulated phenotype arrays.

    Raises ValueError if ``effect_prior`` is not a supported prior.
    """
    print("Using %d SNPs to simulate traits for %d individuals." % (m, n))

    genotype_dict = genotypes.simulate_genotypes_w_ld(n=n, m=m, ld=conseq_ld, return_ne=False, ld_window_size=0)
    snps = genotype_dict['X']
    betas_list = []
    betas_marg_list = []
    phen_list = []
    for i in range(num_traits):
        # Number of SNPs that receive a non-zero effect.
        num_causal = m if p == 1.0 else int(round(m * p))

        if effect_prior == 'gaussian':
            causal_betas = stats.norm.rvs(0, sp.sqrt(h2 / num_causal), size=num_causal)
        elif effect_prior == 'laplace':
            causal_betas = stats.laplace.rvs(scale=sp.sqrt(h2 / (2 * num_causal)), size=num_causal)
        else:
            # Previously an unknown prior surfaced as an UnboundLocalError on
            # ``betas`` (or silently reused the previous trait's betas).
            raise ValueError("Unknown effect_prior %r (expected 'gaussian' or 'laplace')" % (effect_prior,))

        if p == 1.0:
            betas = causal_betas
        else:
            # Pad with zeros for the SNPs without effect.
            betas = sp.concatenate((causal_betas, sp.zeros(m - num_causal, dtype=float)))

        # Rescale so that var(betas) == h2 / m.
        betas_var = sp.var(betas)
        beta_scalar = sp.sqrt(h2 / (m * betas_var))
        betas = betas * beta_scalar
        betas_list.append(betas)

        # Noise and genetic part are each rescaled to hit the target variances exactly.
        phen_noise = stats.norm.rvs(0, sp.sqrt(1.0 - h2), size=n)
        phen_noise = sp.sqrt((1.0 - h2) / sp.var(phen_noise)) * phen_noise
        genetic_part = sp.dot(snps, betas)
        genetic_part = sp.sqrt(h2 / sp.var(genetic_part)) * genetic_part
        train_phen = genetic_part + phen_noise
        print('Herit: %s' % (sp.var(genetic_part) / sp.var(train_phen),))
        phen_list.append(train_phen)

        # Marginal effect estimates.
        betas_marg = (1. / n) * sp.dot(train_phen, snps)
        betas_marg_list.append(betas_marg)

        sys.stdout.write('\b\b\b\b\b\b\b%0.1f%%' % (100.0 * (float(i) / num_traits)))
        sys.stdout.flush()

    datasets = (('phenotypes', phen_list),
                ('betas', betas_list),
                ('betas_marg', betas_marg_list))

    if hdf5_file_prefix is not None:
        hdf5_file = '%s_p_%0.4f.hdf5' % (hdf5_file_prefix, p)
        if os.path.isfile(hdf5_file):
            print('File already exists.')
            if overwrite_hdf5:
                print('Overwriting %s' % hdf5_file)
                os.remove(hdf5_file)
            else:
                # NOTE(review): create_dataset below will presumably fail if
                # the existing file already holds these dataset names.
                print('Attempting to continue.')

        # Explicit append mode (newer h5py versions default to read-only) and a
        # context manager so the file is always flushed and closed.
        with h5py.File(hdf5_file, 'a') as h5f:
            for name, data in datasets:
                h5f.create_dataset(name, data=data, compression='gzip')
    elif hdf5_group is not None:
        for name, data in datasets:
            hdf5_group.create_dataset(name, data=data, compression='gzip')
    else:
        print('Warning: No storage file given!')
    print('.')
    print("Done simulating data.")
    return phen_list
Exemplo n.º 4
0
def simulate_plink_train_test_datasets(num_traits=1, n_sample=1000, p=0.001, m=10000, h2=0.1, adj_r2=0.9, m_ld_chunk_size=100,
                            effect_prior='gaussian', out_prefix='/Users/bjarnivilhjalmsson/data/tmp/LDpred_data'):
    """
    Simulate genotypes and traits, split them into test/training sets, and
    write Plink-style files plus GWAS summary statistics for each trait.

    The first ``int(n_sample / 5.0)`` individuals (columns of the genotype
    matrix) form the test set; the remaining individuals form the training
    set.  For every trait index ``t`` three outputs are written, all derived
    from ``out_prefix`` and ``p``:
      '<out_prefix>_p<p>_train_<t>'   -- training data via write_fake_plink_file
      '<out_prefix>_p<p>_test_<t>'    -- test data via write_fake_plink_file
      '<out_prefix>_p<p>_ss_<t>.txt'  -- summary statistics computed on the
                                         training set

    Nothing is returned.
    """
    # First simulate SNPs (w LD)
    snps = gt.simulate_genotypes_w_ld(n_sample=n_sample, m=m, conseq_r2=adj_r2, m_ld_chunk_size=m_ld_chunk_size, diploid=True, verbose=True)
    positions = range(m)

    # Debug peek at a few rows.  Only rows that exist are shown, so a small
    # simulation (fewer than 201 rows) no longer dies with an IndexError.
    print(' '.join(str(snps[row]) for row in (0, 100, 200) if row < len(snps)))

    # Simulate traits
    phen_dict = pt.simulate_traits_w_snps(snps, num_traits=num_traits, p=p, m=m, h2=h2, effect_prior=effect_prior, verbose=True,
                                          liability_thres=None)

    # Partition into training and test data (individuals are on axis 1)
    part_i = int(n_sample / 5.0)
    train_snps = snps[:, part_i:]
    test_snps = snps[:, :part_i]

    # Write out Plink files
    for t_i in range(num_traits):
        train_plink_prefix = '%s_p%0.3f_train_%d' % (out_prefix, p, t_i)
        test_plink_prefix = '%s_p%0.3f_test_%d' % (out_prefix, p, t_i)

        trait_phens = phen_dict['phenotypes'][t_i]
        train_phens = trait_phens[part_i:]
        test_phens = trait_phens[:part_i]
        write_fake_plink_file(train_snps, train_plink_prefix, positions, phenotypes=train_phens)
        print('Done w Training file')
        write_fake_plink_file(test_snps, test_plink_prefix, positions, phenotypes=test_phens)
        print('Done w Testing file')

    # Conduct GWAS, and write out results.
    print('Normalizing genotypes')
    snps_stds = sp.std(train_snps, axis=1)
    snps_means = sp.mean(train_snps, axis=1)
    # Column vectors so they broadcast across individuals.
    snps_stds.shape = (len(snps_stds), 1)
    snps_means.shape = (len(snps_means), 1)
    # NOTE(review): a SNP that is constant within the training set has zero
    # std and presumably yields inf/nan here -- confirm whether that can occur.
    norm_train_snps = (train_snps - snps_means) / snps_stds

    # Example of the summary statistics format written below:
    #   chr     pos     ref     alt     reffrq  info    rs       pval    effalt
    #   chr1    1020428 C       T       0.85083 0.98732 rs6687776    0.0587  -0.0100048507289348
    #   chr1    1020496 G       A       0.85073 0.98751 rs6678318    0.1287  -0.00826075392985992
    ss_header = 'chr     pos     ref     alt     reffrq  info    rs       pval    effalt\n'

    for t_i in range(num_traits):
        ss_filename = '%s_p%0.3f_ss_%d.txt' % (out_prefix, p, t_i)
        train_phens = phen_dict['phenotypes'][t_i][part_i:]
        # NOTE(review): the phenotypes are used as-is (the original comment said
        # "Normalize phenotypes" but no normalization is done).  The F-statistic
        # below presumably assumes beta_hats are correlations, i.e. phenotypes
        # standardized within the training set -- confirm.
        n_training = len(train_phens)
        beta_hats = sp.dot(norm_train_snps, train_phens) / n_training
        b2s = beta_hats ** 2
        f_stats = (n_training - 2) * b2s / (1 - b2s)
        pvals = stats.f.sf(f_stats, 1, n_training - 2)
        print('Median p-value is %0.3f, and mean p-value is %0.3f' % (sp.median(pvals), sp.mean(pvals)))

        with open(ss_filename, 'w') as f:
            f.write(ss_header)
            # The running index doubles as position and SNP id.
            for i, (eff, pval) in enumerate(it.izip(beta_hats, pvals)):
                f.write('chr1    %d    A    G    0.5    1    sid_%d    %0.6e    %0.6e\n' % (i, i, pval, eff))
Exemplo n.º 5
0
def simulate_traits(n=1000,
                    m=100,
                    hdf5_file_prefix=None,
                    hdf5_group=None,
                    num_traits=1000,
                    h2=0.5,
                    effect_prior='gaussian',
                    p=1.0,
                    conseq_ld=0,
                    overwrite_hdf5=False,
                    test_n=1000,
                    simulate_validation_traits=True):
    """
    Simulate traits:
    First simulate SNPs, then simulate the traits.

    A single genotype matrix is simulated once; ``num_traits`` traits are then
    generated from it.  Per trait, effect sizes are drawn from
    ``effect_prior`` ('gaussian' or 'laplace') for ``round(m * p)`` SNPs (all
    SNPs when ``p == 1.0``), padded with zeros and rescaled to variance
    ``h2 / m``.  The phenotype is the sum of a genetic part with sample
    variance ``h2`` and Gaussian noise with sample variance ``1 - h2``.
    Marginal effects are ``phen . snps / n``.

    Storage: when ``hdf5_file_prefix`` is given, the datasets 'phenotypes',
    'betas' and 'betas_marg' are written (gzip-compressed) to
    '<prefix>_p_<p>.hdf5'; otherwise, when ``hdf5_group`` is given, they are
    created in that group; otherwise a warning is printed.  An existing file
    is removed first only if ``overwrite_hdf5`` is true.

    ``test_n`` and ``simulate_validation_traits`` are accepted but unused.

    Returns the list of simulated phenotype arrays.

    Raises ValueError for an unsupported ``effect_prior``.
    """

    def _store(target):
        # Write the three result datasets into an HDF5 file or group.
        target.create_dataset('phenotypes', data=phen_list, compression='gzip')
        target.create_dataset('betas', data=betas_list, compression='gzip')
        target.create_dataset('betas_marg',
                              data=betas_marg_list,
                              compression='gzip')

    print("Using %d SNPs to simulate traits for %d individuals." % (m, n))

    genotype_dict = genotypes.simulate_genotypes_w_ld(n=n,
                                                      m=m,
                                                      ld=conseq_ld,
                                                      return_ne=False,
                                                      ld_window_size=0)
    snps = genotype_dict['X']
    betas_list = []
    betas_marg_list = []
    phen_list = []
    for i in range(num_traits):
        # Number of SNPs drawn from the prior; the rest get zero effect.
        num_causal = m if p == 1.0 else int(round(m * p))

        if effect_prior == 'gaussian':
            causal_betas = stats.norm.rvs(0,
                                          sp.sqrt(h2 / num_causal),
                                          size=num_causal)
        elif effect_prior == 'laplace':
            causal_betas = stats.laplace.rvs(
                scale=sp.sqrt(h2 / (2 * num_causal)), size=num_causal)
        else:
            # Fail clearly instead of hitting an unbound (or stale) ``betas``.
            raise ValueError(
                "Unknown effect_prior %r (expected 'gaussian' or 'laplace')" %
                (effect_prior,))

        if p == 1.0:
            betas = causal_betas
        else:
            betas = sp.concatenate(
                (causal_betas, sp.zeros(m - num_causal, dtype=float)))

        # Rescale so that var(betas) == h2 / m.
        betas_var = sp.var(betas)
        beta_scalar = sp.sqrt(h2 / (m * betas_var))
        betas = betas * beta_scalar
        betas_list.append(betas)

        # Force the noise and genetic variances to their exact targets.
        phen_noise = stats.norm.rvs(0, sp.sqrt(1.0 - h2), size=n)
        phen_noise = sp.sqrt((1.0 - h2) / sp.var(phen_noise)) * phen_noise
        genetic_part = sp.dot(snps, betas)
        genetic_part = sp.sqrt(h2 / sp.var(genetic_part)) * genetic_part
        train_phen = genetic_part + phen_noise
        print('Herit: %s' % (sp.var(genetic_part) / sp.var(train_phen),))
        phen_list.append(train_phen)

        # Marginal effect estimates.
        betas_marg = (1. / n) * sp.dot(train_phen, snps)
        betas_marg_list.append(betas_marg)

        sys.stdout.write('\b\b\b\b\b\b\b%0.1f%%' % (100.0 *
                                                    (float(i) / num_traits)))
        sys.stdout.flush()

    if hdf5_file_prefix is not None:
        hdf5_file = '%s_p_%0.4f.hdf5' % (hdf5_file_prefix, p)
        if os.path.isfile(hdf5_file):
            print('File already exists.')
            if overwrite_hdf5:
                print('Overwriting %s' % hdf5_file)
                os.remove(hdf5_file)
            else:
                # NOTE(review): writing into an existing file presumably fails
                # if the dataset names are already present -- confirm.
                print('Attempting to continue.')

        # Open explicitly in append mode (recent h5py defaults to read-only)
        # and close the handle deterministically, even on error.
        with h5py.File(hdf5_file, 'a') as h5f:
            _store(h5f)
    elif hdf5_group is not None:
        _store(hdf5_group)
    else:
        print('Warning: No storage file given!')
    print('.')
    print("Done simulating data.")
    return phen_list