예제 #1
0
파일: mvp.py 프로젝트: bvilhjal/phensim
def _test_():
    singleton_snps = genotypes.simulate_k_tons(n=500, m=1000)
    doubleton_snps = genotypes.simulate_k_tons(k=2, n=500, m=1000)
    common_snps = genotypes.simulate_common_genotypes(500, 1000) 
    
    snps = sp.vstack([common_snps, singleton_snps, doubleton_snps])
    print snps
    snps = snps.T
    snps = (snps - sp.mean(snps, 0)) / sp.std(snps, 0)
    snps = snps.T
    print snps, snps.shape
    file_prefix = os.environ['HOME'] + '/tmp/test'
    phen_list = phenotypes.simulate_traits_w_snps_to_hdf5(snps, hdf5_file_prefix=file_prefix,
                                           num_traits=30, p=0.1)
    
    singletons_thres = []
    doubletons_thres = []
    common_thres = []
    for i, y in enumerate(phen_list['phenotypes']):
        
        K = kinship.calc_ibd_kinship(snps)
        K = kinship.scale_k(K)
        lmm = lm.LinearMixedModel(y)
        lmm.add_random_effect(K)
        r1 = lmm.get_REML()
        print 'pseudo_heritability:', r1['pseudo_heritability']

        ex_res = lm.emmax(snps, y, K)
        plt.figure()
        plt.hist(y, 50)
        plt.savefig('%s_%d_phen.png' % (file_prefix, i))
        plt.clf()
        
        
        agr.plot_simple_qqplots_pvals('%s_%d' % (file_prefix, i),
                                      [ex_res['ps'][:1000], ex_res['ps'][1000:2000], ex_res['ps'][2000:]],
                                      result_labels=['Common SNPs', 'Singletons', 'Doubletons'],
                                      line_colors=['b', 'r', 'y'],
                                      num_dots=200, max_neg_log_val=3)
        
        # Cholesky permutations..
        res = lm.emmax_perm_test(singleton_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        singletons_thres.append(res['threshold_05'][0])
        res = lm.emmax_perm_test(doubleton_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        doubletons_thres.append(res['threshold_05'][0])
        res = lm.emmax_perm_test(common_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        common_thres.append(res['threshold_05'][0])
        
        #ATT permutations (Implement)
        
        #PC permutations (Implement)
        

    print sp.mean(singletons_thres), sp.std(singletons_thres)
    print sp.mean(doubletons_thres), sp.std(doubletons_thres)
    print sp.mean(common_thres), sp.std(common_thres)
예제 #2
0
파일: mvp.py 프로젝트: theboocock/phensim
def _test_scz_():
    # Load Schizophrenia data

    singleton_snps = genotypes.simulate_k_tons(n=500, m=1000)
    doubleton_snps = genotypes.simulate_k_tons(k=2, n=500, m=1000)
    common_snps = genotypes.simulate_common_genotypes(500, 1000)

    snps = sp.vstack([common_snps, singleton_snps, doubleton_snps])
    test_snps = sp.vstack([singleton_snps, doubleton_snps])
    print snps
    phen_list = phenotypes.simulate_traits(
        snps, hdf5_file_prefix='/home/bv25/tmp/test', num_traits=30, p=1.0)

    singletons_thres = []
    doubletons_thres = []
    common_thres = []
    for i, y in enumerate(phen_list):

        K = kinship.calc_ibd_kinship(snps)
        K = kinship.scale_k(K)
        lmm = lm.LinearMixedModel(y)
        lmm.add_random_effect(K)
        r1 = lmm.get_REML()
        print 'pseudo_heritability:', r1['pseudo_heritability']

        ex_res = lm.emmax(snps, y, K)
        plt.figure()
        plt.hist(y, 50)
        plt.savefig('/home/bv25/tmp/test_%d_phen.png' % i)
        plt.clf()
        agr.plot_simple_qqplots_pvals('/home/bv25/tmp/test_%d' % i, [
            ex_res['ps'][:1000], ex_res['ps'][1000:2000], ex_res['ps'][2000:]
        ],
                                      result_labels=[
                                          'Common SNPs', 'Singletons',
                                          'Doubletons'
                                      ],
                                      line_colors=['b', 'r', 'y'],
                                      num_dots=200,
                                      max_neg_log_val=3)

        # Now permutations..
        res = lm.emmax_perm_test(singleton_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        singletons_thres.append(res['threshold_05'][0])
        res = lm.emmax_perm_test(doubleton_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        doubletons_thres.append(res['threshold_05'][0])
        res = lm.emmax_perm_test(common_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        common_thres.append(res['threshold_05'][0])
    print sp.mean(singletons_thres), sp.std(singletons_thres)
    print sp.mean(doubletons_thres), sp.std(doubletons_thres)
    print sp.mean(common_thres), sp.std(common_thres)
예제 #3
0
파일: mvp.py 프로젝트: bvilhjal/phensim
def _test_scz_():
    # Load Schizophrenia data
    
    singleton_snps = genotypes.simulate_k_tons(n=500, m=1000)
    doubleton_snps = genotypes.simulate_k_tons(k=2, n=500, m=1000)
    common_snps = genotypes.simulate_common_genotypes(500, 1000) 
    
    snps = sp.vstack([common_snps, singleton_snps, doubleton_snps])
    test_snps = sp.vstack([singleton_snps, doubleton_snps])
    print snps
    phen_list = phenotypes.simulate_traits(snps, hdf5_file_prefix='/home/bv25/tmp/test', num_traits=30, p=1.0)
    
    singletons_thres = []
    doubletons_thres = []
    common_thres = []
    for i, y in enumerate(phen_list):
        
        K = kinship.calc_ibd_kinship(snps)
        K = kinship.scale_k(K)
        lmm = lm.LinearMixedModel(y)
        lmm.add_random_effect(K)
        r1 = lmm.get_REML()
        print 'pseudo_heritability:', r1['pseudo_heritability']

        ex_res = lm.emmax(snps, y, K)
        plt.figure()
        plt.hist(y, 50)
        plt.savefig('/home/bv25/tmp/test_%d_phen.png' % i)
        plt.clf()
        agr.plot_simple_qqplots_pvals('/home/bv25/tmp/test_%d' % i,
                                      [ex_res['ps'][:1000], ex_res['ps'][1000:2000], ex_res['ps'][2000:]],
                                      result_labels=['Common SNPs', 'Singletons', 'Doubletons'],
                                      line_colors=['b', 'r', 'y'],
                                      num_dots=200, max_neg_log_val=3)
        
        # Now permutations..
        res = lm.emmax_perm_test(singleton_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        singletons_thres.append(res['threshold_05'][0])
        res = lm.emmax_perm_test(doubleton_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        doubletons_thres.append(res['threshold_05'][0])
        res = lm.emmax_perm_test(common_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        common_thres.append(res['threshold_05'][0])
    print sp.mean(singletons_thres), sp.std(singletons_thres)
    print sp.mean(doubletons_thres), sp.std(doubletons_thres)
    print sp.mean(common_thres), sp.std(common_thres)
예제 #4
0
def map_phenotype(p_i, phed, mapping_method, trans_method, p_dict):
    import copy
    phed = copy.deepcopy(phed)
    phenotype_name = phed.get_name(p_i)
    phen_is_binary = phed.is_binary(p_i)
    if trans_method == 'most_normal':
        trans_method, shapiro_pval = phed.most_normal_transformation(p_i, perform_trans=False)
    file_prefix = _get_file_prefix_(p_dict['run_id'], p_i, phed.get_name(p_i),
                mapping_method, trans_method, p_dict['remove_outliers'], p_dict['with_replicates'],
                p_dict['call_method_id'])
    result_name = "%s_%s_%s" % (phenotype_name, mapping_method, trans_method)
    emmax_perm_threshold = None
    k = None

    res = None
    #Check whether result already exists.
    if p_dict['use_existing_results']:
        if p_dict['region_plots']:
            sd = _get_genotype_data_(p_dict)
            num_outliers = prepare_data(sd, phed, p_i, trans_method, p_dict['remove_outliers'], p_dict['with_replicates'])
            if p_dict['remove_outliers']:
                assert num_outliers != 0, "No outliers were removed, so it makes no sense to go on and perform GWA."

            snps = sd.getSnps()
        else:
            snps = None

        print "\nChecking for existing results."
        result_file = file_prefix + ".pvals"
        if os.path.isfile(result_file):
            res = gwaResults.Result(result_file=result_file, name=result_name, snps=snps)
            pvals = True
        else:
            result_file = file_prefix + ".scores"
            if os.path.isfile(result_file):
                res = gwaResults.Result(result_file=result_file, name=result_name, snps=snps)
                pvals = False
        if res:
            print "Found existing results.. (%s)" % (result_file)

        sys.stdout.flush()


    #Loading candidate genes
    cand_genes = None
    if p_dict['cand_genes_file']:
        cand_genes, tair_ids = gwaResults.load_cand_genes_file(p_dict['cand_genes_file'])
    else:
        cand_genes = None
        tair_ids = None

    if not res: #If results weren't found in a file... then do GWA.
        #Loading data
        sd = _get_genotype_data_(p_dict)
        num_outliers, n_filtered_snps = prepare_data(sd, phed, p_i, trans_method, p_dict['remove_outliers'],
                                                     p_dict['with_replicates'])

        #Do we need to calculate the K-matrix?
        if mapping_method in ['emma', 'emmax', 'emmax_anova', 'emmax_step', 'loc_glob_mm']:
            #Load genotype file (in binary format)
            sys.stdout.write("Retrieving the Kinship matrix K.\n")
            sys.stdout.flush()
            if p_dict['kinship_file']:   #Kinship file was supplied..
                print 'Loading supplied kinship file: %s' % p_dict['kinship_file']
                k = kinship.load_kinship_from_file(p_dict['kinship_file'], sd.accessions)
            else:
                print 'Loading kinship file.'
                if p_dict['data_file'] != None:
                    if p_dict['kinship_type'] == 'ibs':
                        k = sd.get_ibs_kinship_matrix()
                    elif p_dict['kinship_type'] == 'ibd':
                        k = sd.get_ibd_kinship_matrix()
                else:
                    k = kinship.get_kinship(call_method_id=p_dict['call_method_id'], data_format=p_dict['data_format'],
                                            method=p_dict['kinship_type'], n_removed_snps=n_filtered_snps,
                                            remain_accessions=sd.accessions)
            sys.stdout.flush()
            sys.stdout.write("Done!\n")

        if p_dict['remove_outliers']:
            if num_outliers == 0: print "No outliers were removed!"

        phen_vals = phed.get_values(p_i)

        if p_dict['local_gwas']: #Filter SNPs, etc..
            sd = snpsdata.SNPsDataSet([sd.get_region_snpsd(*p_dict['local_gwas'])],
                        [p_dict['local_gwas'][0]], data_format=sd.data_format)
        snps = sd.getSnps()


        sys.stdout.write("Finished loading and handling data!\n")

        print "Plotting a histogram"
        p_her = None
        hist_file_prefix = _get_file_prefix_(p_dict['run_id'], p_i, phenotype_name, trans_method,
                        p_dict['remove_outliers'], p_dict['with_replicates'],
                        p_dict['call_method_id'])
        hist_png_file = hist_file_prefix + "_hist.png"
        if k is not None:
            p_her = phed.get_pseudo_heritability(p_i, k)['pseudo_heritability']
            p_her_pval = phed.get_pseudo_heritability(p_i, k)['pval']
            phed.plot_histogram(p_i, png_file=hist_png_file, p_her=p_her, p_her_pval=p_her_pval)
        else:
            phed.plot_histogram(p_i, png_file=hist_png_file)


        print "Applying %s to data." % (mapping_method)
        sys.stdout.flush()
        kwargs = {}
        additional_columns = []
        if "kw" == mapping_method:

            if phen_is_binary:
                warnings.warn("Warning, applying KW to a binary phenotype")

            kw_res = util.kruskal_wallis(snps, phen_vals)
            pvals = kw_res['ps']
            kwargs['statistics'] = kw_res['ds']
            additional_columns.append('statistics')


        elif "ft" == mapping_method:
            raise NotImplementedError
#            pvals, or_est = run_fet(snps, phen_vals)
#            kwargs['odds_ratio_est'] = or_est
#            additional_columns.append('odds_ratio_est')

        else:  #Parametric tests below:        

            if mapping_method in ['emma', 'emmax', 'emmax_perm', 'emmax_step', 'emmax_anova', 'loc_glob_mm']:
                r = lm.mm_lrt_test(phen_vals, k)
                if r['pval'] > 0.05:
                    print "Performing EMMA, even though a mixed model does not fit the data significantly better"
                    print 'p-value: %0.3f' % r['pval']
                else:
                    print 'The mixed model fits the data significantly better than the simple linear model.'
                    print 'p-value: %f' % r['pval']

            if mapping_method in ['loc_glob_mm']:
                res_dict = lm.local_vs_global_mm_scan(phen_vals, sd, file_prefix=file_prefix,
                            global_k=k, window_size=p_dict['loc_glob_ws'],
                            jump_size=p_dict['loc_glob_ws'] / 2,
                            kinship_method=p_dict['kinship_type'])
                res_file_name = file_prefix + '.csv'
                _write_res_dict_to_file_(res_file_name, res_dict)
                return
            elif mapping_method in ['emma']:
                res = lm.emma(snps, phen_vals, k)
            elif mapping_method in ['emmax']:
                if p_dict['emmax_perm']:
                    perm_sd = _get_genotype_data_(p_dict)
                    num_outliers = prepare_data(perm_sd, phed, p_i, 'none', 0, p_dict['with_replicates'])
                    perm_sd.filter_mac_snps(p_dict['mac_threshold'])
                    t_snps = perm_sd.getSnps()
                    t_phen_vals = phed.get_values(p_i)
                    res = lm.emmax_perm_test(t_snps, t_phen_vals, k, p_dict['emmax_perm'])
                    emmax_perm_threshold = res['threshold_05'][0]
                    import pylab
                    hist_res = pylab.hist(-sp.log10(res['min_ps']), alpha=0.6)
                    threshold = -sp.log10(emmax_perm_threshold)
                    b_threshold = -sp.log10(1.0 / (len(t_snps) * 20.0))
                    pylab.vlines(threshold, 0, max(hist_res[0]), color='g')
                    pylab.vlines(b_threshold, 0, max(hist_res[0]), color='r')
                    pylab.savefig(file_prefix + 'perm_%d_min_pval_hist.png' % (p_dict['emmax_perm']),
                        format='png')
                if p_dict['with_replicates']:
                    #Get values, with ecotypes, construct Z and do GWAM
                    phen_vals = phed.get_values(p_i)
                    Z = phed.get_incidence_matrix(p_i)
                    res = lm.emmax(snps, phen_vals, k, Z=Z, with_betas=p_dict['with_betas'],
                            emma_num=p_dict['emmax_emma_num'])
                else:
                    res = lm.emmax(snps, phen_vals, k, with_betas=p_dict['with_betas'],
                            emma_num=p_dict['emmax_emma_num'])

            elif mapping_method in ['emmax_step']:
                sd.filter_mac_snps(p_dict['mac_threshold'])
                local = False
                if p_dict['local_gwas']:
                    local = True
                    file_prefix += '_' + '_'.join(map(str, p_dict['local_gwas']))
                res = lm.emmax_step_wise(phen_vals, k, sd=sd, num_steps=p_dict['num_steps'],
                            file_prefix=file_prefix, local=local, cand_gene_list=cand_genes,
                            save_pvals=p_dict['save_stepw_pvals'],
                            emma_num=p_dict['emmax_emma_num'])
                print 'Step-wise EMMAX finished!'
                return
            elif mapping_method in ['lm_step']:
                sd.filter_mac_snps(p_dict['mac_threshold'])
                local = False
                if p_dict['local_gwas']:
                    local = True
                    file_prefix += '_' + '_'.join(map(str, p_dict['local_gwas']))
                res = lm.lm_step_wise(phen_vals, sd=sd, num_steps=p_dict['num_steps'],
                            file_prefix=file_prefix, local=local, cand_gene_list=cand_genes,
                            save_pvals=p_dict['save_stepw_pvals'])
                print 'Step-wise LM finished!'
                return
            elif mapping_method in ['lm']:
                res = lm.linear_model(snps, phen_vals)
            elif mapping_method in ['emmax_anova']:
                res = lm.emmax_anova(snps, phen_vals, k)
            elif mapping_method in ['lm_anova']:
                res = lm.anova(snps, phen_vals)
            else:
                print "Mapping method", mapping_method, 'was not found.'
                return

            if mapping_method in ['lm', 'emma', 'emmax']:
                kwargs['genotype_var_perc'] = res['var_perc']
                additional_columns.append('genotype_var_perc')
                if p_dict['with_betas'] or mapping_method in ['emma' ]:
                    betas = map(list, zip(*res['betas']))
                    kwargs['beta0'] = betas[0]
                    additional_columns.append('beta0')
                    if len(betas) > 1:
                        kwargs['beta1'] = betas[1]
                        additional_columns.append('beta1')
                pvals = res['ps']
                sys.stdout.write("Done!\n")
                sys.stdout.flush()

            if mapping_method in ['lm_anova', 'emmax_anova']:
                kwargs['genotype_var_perc'] = res['var_perc']
                pvals = res['ps']
                sys.stdout.write("Done!\n")
                sys.stdout.flush()


#        print 'Calculating SNP-phenotype correlations.'
#        kwargs['correlations'] = calc_correlations(snps, phen_vals)
#        additional_columns.append('correlations')
        print 'Writing result to file.'
        res = gwaResults.Result(scores=pvals.tolist(), snps_data=sd, name=result_name, **kwargs)
        if mapping_method in ["kw", "ft", "emma", 'lm', "emmax", 'emmax_anova', 'lm_anova']:
            result_file = file_prefix + ".pvals"
        else:
            result_file = file_prefix + ".scores"
        res.write_to_file(result_file, additional_columns, max_fraction=p_dict['pvalue_filter'])

    #add results to DB..

    if p_dict['add_to_db']:
        print 'Adding results to DB.'
        if p_dict['with_db_ids']:
            db_pid = p_i
        else:
            db_pid = phed.get_db_pid(p_i)

        import results_2_db as rdb

        short_name = 'cm%d_pid%d_%s_%s_%s_%d_%s' % (p_dict['call_method_id'], db_pid, phenotype_name,
                            mapping_method, trans_method, p_dict['remove_outliers'],
                            str(p_dict['with_replicates']))
        tm_id = transformation_method_dict[trans_method]
        try:
            rdb.add_results_to_db(result_file, short_name, p_dict['call_method_id'], db_pid,
                        analysis_methods_dict[mapping_method],
                        tm_id, remove_outliers=p_dict['remove_outliers'])
        except Exception, err_str:
            print 'Failed inserting results into DB!'
            print err_str
예제 #5
0
파일: mvp.py 프로젝트: theboocock/phensim
def _test_():
    singleton_snps = genotypes.simulate_k_tons(n=500, m=1000)
    doubleton_snps = genotypes.simulate_k_tons(k=2, n=500, m=1000)
    common_snps = genotypes.simulate_common_genotypes(500, 1000)

    snps = sp.vstack([common_snps, singleton_snps, doubleton_snps])
    print snps
    snps = snps.T
    snps = (snps - sp.mean(snps, 0)) / sp.std(snps, 0)
    snps = snps.T
    print snps, snps.shape
    file_prefix = os.environ['HOME'] + '/tmp/test'
    phen_list = phenotypes.simulate_traits_w_snps_to_hdf5(
        snps, hdf5_file_prefix=file_prefix, num_traits=30, p=0.1)

    singletons_thres = []
    doubletons_thres = []
    common_thres = []
    for i, y in enumerate(phen_list['phenotypes']):

        K = kinship.calc_ibd_kinship(snps)
        K = kinship.scale_k(K)
        lmm = lm.LinearMixedModel(y)
        lmm.add_random_effect(K)
        r1 = lmm.get_REML()
        print 'pseudo_heritability:', r1['pseudo_heritability']

        ex_res = lm.emmax(snps, y, K)
        plt.figure()
        plt.hist(y, 50)
        plt.savefig('%s_%d_phen.png' % (file_prefix, i))
        plt.clf()

        agr.plot_simple_qqplots_pvals('%s_%d' % (file_prefix, i), [
            ex_res['ps'][:1000], ex_res['ps'][1000:2000], ex_res['ps'][2000:]
        ],
                                      result_labels=[
                                          'Common SNPs', 'Singletons',
                                          'Doubletons'
                                      ],
                                      line_colors=['b', 'r', 'y'],
                                      num_dots=200,
                                      max_neg_log_val=3)

        # Cholesky permutations..
        res = lm.emmax_perm_test(singleton_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        singletons_thres.append(res['threshold_05'][0])
        res = lm.emmax_perm_test(doubleton_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        doubletons_thres.append(res['threshold_05'][0])
        res = lm.emmax_perm_test(common_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        common_thres.append(res['threshold_05'][0])

        #ATT permutations (Implement)

        #PC permutations (Implement)

    print sp.mean(singletons_thres), sp.std(singletons_thres)
    print sp.mean(doubletons_thres), sp.std(doubletons_thres)
    print sp.mean(common_thres), sp.std(common_thres)