def leave_k_out_blup(num_cvs=20, genotype_file='/Users/bjarnivilhjalmsson/data/cegs_lehmann/', k_thres=0.5):
    """
    Cross-validate gBLUP predictions for the CEGS Drosophila traits.

    For every phenotype/environment combination the individuals are randomly
    partitioned into folds.  For each fold a linear mixed model with the IBD
    kinship as random effect is fitted by REML on the remaining individuals,
    and the estimated pseudo-heritability is used to predict the (mean
    centred) values of the held-out individuals.  Observed and predicted
    values are pooled over folds, and their correlation, its square and the
    average pseudo-heritability are stored in an HDF5 file, one group per
    phenotype and one sub-group per environment.

    NOTE(review): a later function with the same name is defined further down
    in this file; at import time that definition replaces this one.

    Parameters:
        num_cvs: Target number of folds.  The fold size is
            ``n // num_cvs + 1``, so the actual number of folds can be
            smaller than ``num_cvs``.
        genotype_file: Not used by this function; kept for interface
            compatibility.
        k_thres: Forwarded to
            ``hdf5_data.coordinate_cegs_genotype_phenotype``.

    Returns:
        None.  The results file is created from scratch (mode ``'w'``), so
        re-running with the same ``num_cvs``/``k_thres`` replaces the
        previous results.

    Raises:
        AssertionError: If the SNP matrix or the kinship contains NaNs.
    """
    import h5py
    import hdf5_data
    import kinship
    import linear_models as lm
    import scipy as sp

    phen_dict = hdf5_data.parse_cegs_drosophila_phenotypes()
    phenotypes = ['Protein', 'Sugar', 'Triglyceride', 'weight']
    envs = ['mated', 'virgin']

    res_dict = {}
    for phenotype in phenotypes:
        env_dict = {}
        for env in envs:
            # Single-argument print(...) behaves identically as a Python 2
            # print statement and as a Python 3 function call.
            print('%s %s' % (phenotype, env))

            # Load data..
            d = hdf5_data.coordinate_cegs_genotype_phenotype(
                phen_dict, phenotype, env, k_thres=k_thres)
            Y_means = d['Y_means']
            snps = d['snps']
            # numpy >= 1.13 refuses negative() on boolean arrays, so the NaN
            # checks are expressed with .any() instead.
            assert not sp.isnan(snps).any(), 'WTF?'
            K = kinship.calc_ibd_kinship(snps)
            print('\nKinship calculated')
            assert not sp.isnan(K).any(), 'WTF?'

            # Create the CV sets of training and validation data.
            n = len(Y_means)
            # Explicit floor division keeps the fold size an integer.
            cv_chunk_size = int(n // num_cvs + 1)
            ordering = sp.random.permutation(n)
            indices = sp.arange(n)

            obs_ys = []
            pred_ys = []
            p_herits = []
            for start_i in range(0, n, cv_chunk_size):
                end_i = min(n, start_i + cv_chunk_size)
                # Boolean masks: held-out individuals keep their original
                # (un-permuted) order in both val_Y and K_cross.
                validation_filter = sp.in1d(indices, ordering[start_i:end_i])
                training_filter = ~validation_filter

                train_Y = Y_means[training_filter]
                val_Y = Y_means[validation_filter]

                # Kinship among training individuals, and between validation
                # (rows) and training (columns) individuals.
                K_train = K[training_filter, :][:, training_filter]
                K_cross = K[validation_filter, :][:, training_filter]

                # Do gBLUP
                lmm = lm.LinearMixedModel(train_Y)
                lmm.add_random_effect(K_train)
                r1 = lmm.get_REML()

                # Now the BLUP, computed on the mean-centred training
                # phenotypes.  The training mean is not added back to the
                # predictions.
                y_mean = sp.mean(lmm.Y)
                Y = lmm.Y - y_mean
                p_herit = r1['pseudo_heritability']
                p_herits.append(p_herit)
                M = sp.mat(p_herit * sp.mat(K_train) +
                           (1 - p_herit) * sp.eye(K_train.shape[0]))
                u_mean_pred = sp.array(K_cross * (M.I * Y)).flatten()

                obs_ys.extend(val_Y)
                pred_ys.extend(u_mean_pred)

            corr = sp.corrcoef(obs_ys, pred_ys)[1, 0]
            print('Correlation: %s' % (corr,))
            r2 = corr ** 2
            print('R2: %s' % (r2,))
            mean_herit = sp.mean(p_herits)
            print('Avg. heritability: %s' % (mean_herit,))
            env_dict[env] = {'R2': r2, 'obs_y': obs_ys, 'pred_y': pred_ys,
                             'corr': corr, 'avg_herit': mean_herit}

        res_dict[phenotype] = env_dict

    res_hdf5_file = '/Users/bjarnivilhjalmsson/data/tmp/leave_%d_BLUP_results_kthres_%0.1f.hdf5' % (
        num_cvs, k_thres)
    # Explicit 'w' mode: the default mode differs between h5py versions, and
    # appending to an existing results file would fail on create_group.  The
    # context manager closes the file even if writing raises.
    with h5py.File(res_hdf5_file, 'w') as h5f:
        for phenotype in phenotypes:
            phen_g = h5f.create_group(phenotype)
            for env in envs:
                d = res_dict[phenotype][env]
                env_g = phen_g.create_group(env)
                env_g.create_dataset('R2', data=[d['R2']])
                env_g.create_dataset('corr', data=[d['corr']])
                env_g.create_dataset('obs_y', data=d['obs_y'])
                env_g.create_dataset('pred_y', data=d['pred_y'])
                env_g.create_dataset('avg_herit', data=[d['avg_herit']])
def leave_k_out_blup(num_repeats=20, num_cvs=5, genotype_file='/Users/bjarnivilhjalmsson/data/cegs_lehmann/', k_thres=0.5):
    """
    Repeated cross-validation of gBLUP predictions for the CEGS Drosophila
    traits.

    The whole cross-validation is repeated ``num_repeats`` times.  In each
    repeat, for every phenotype/environment combination, the individuals are
    randomly partitioned into folds.  For each fold a linear mixed model with
    the IBD kinship as random effect is fitted by REML on the remaining
    individuals, and the estimated pseudo-heritability is used to predict the
    (mean centred) values of the held-out individuals.  Observed and
    predicted values are pooled over folds, and their correlation, its square
    and the average pseudo-heritability are stored in an HDF5 file under
    ``repl_<i>/<phenotype>/<env>``.

    Parameters:
        num_repeats: Number of times the cross-validation is repeated, each
            with a fresh random partition.
        num_cvs: Target number of folds.  The fold size is
            ``n // num_cvs + 1``, so the actual number of folds can be
            smaller than ``num_cvs``.
        genotype_file: Not used by this function; kept for interface
            compatibility.
        k_thres: Forwarded to
            ``hdf5_data.coordinate_cegs_genotype_phenotype``.

    Returns:
        None.  The results file is created from scratch (mode ``'w'``), so
        re-running with the same ``num_cvs``/``k_thres`` replaces the
        previous results.

    Raises:
        AssertionError: If the SNP matrix or the kinship contains NaNs.
    """
    import h5py
    import hdf5_data
    import kinship
    import linear_models as lm
    import scipy as sp

    phen_dict = hdf5_data.parse_cegs_drosophila_phenotypes()
    phenotypes = ['Protein', 'Sugar', 'Triglyceride', 'weight']
    envs = ['mated', 'virgin']

    rep_dict = {}
    for rep_i in range(num_repeats):
        res_dict = {}
        for phenotype in phenotypes:
            env_dict = {}
            for env in envs:
                # Single-argument print(...) behaves identically as a
                # Python 2 print statement and as a Python 3 function call.
                print('%s %s' % (phenotype, env))

                # Load data..
                # NOTE(review): loading and the kinship calculation do not
                # depend on rep_i and are redone in every repeat; if the
                # loader is deterministic they could be computed once per
                # phenotype/env -- confirm before hoisting.
                d = hdf5_data.coordinate_cegs_genotype_phenotype(
                    phen_dict, phenotype, env, k_thres=k_thres)
                Y_means = d['Y_means']
                snps = d['snps']
                # numpy >= 1.13 refuses negative() on boolean arrays, so the
                # NaN checks are expressed with .any() instead.
                assert not sp.isnan(snps).any(), 'WTF?'
                K = kinship.calc_ibd_kinship(snps)
                print('\nKinship calculated')
                assert not sp.isnan(K).any(), 'WTF?'

                # Create the CV sets of training and validation data.
                n = len(Y_means)
                # Explicit floor division keeps the fold size an integer.
                cv_chunk_size = int(n // num_cvs + 1)
                ordering = sp.random.permutation(n)
                indices = sp.arange(n)

                obs_ys = []
                pred_ys = []
                p_herits = []
                for start_i in range(0, n, cv_chunk_size):
                    end_i = min(n, start_i + cv_chunk_size)
                    # Boolean masks: held-out individuals keep their original
                    # (un-permuted) order in both val_Y and K_cross.
                    validation_filter = sp.in1d(
                        indices, ordering[start_i:end_i])
                    training_filter = ~validation_filter

                    train_Y = Y_means[training_filter]
                    val_Y = Y_means[validation_filter]

                    # Kinship among training individuals, and between
                    # validation (rows) and training (columns) individuals.
                    K_train = K[training_filter, :][:, training_filter]
                    K_cross = K[validation_filter, :][:, training_filter]

                    # Do gBLUP
                    lmm = lm.LinearMixedModel(train_Y)
                    lmm.add_random_effect(K_train)
                    r1 = lmm.get_REML()

                    # Now the BLUP, computed on the mean-centred training
                    # phenotypes.  The training mean is not added back to
                    # the predictions.
                    y_mean = sp.mean(lmm.Y)
                    Y = lmm.Y - y_mean
                    p_herit = r1['pseudo_heritability']
                    p_herits.append(p_herit)
                    M = sp.mat(p_herit * sp.mat(K_train) +
                               (1 - p_herit) * sp.eye(K_train.shape[0]))
                    u_mean_pred = sp.array(K_cross * (M.I * Y)).flatten()

                    obs_ys.extend(val_Y)
                    pred_ys.extend(u_mean_pred)

                corr = sp.corrcoef(obs_ys, pred_ys)[1, 0]
                print('Correlation: %s' % (corr,))
                r2 = corr ** 2
                print('R2: %s' % (r2,))
                mean_herit = sp.mean(p_herits)
                print('Avg. heritability: %s' % (mean_herit,))
                env_dict[env] = {'R2': r2, 'obs_y': obs_ys,
                                 'pred_y': pred_ys, 'corr': corr,
                                 'avg_herit': mean_herit}

            res_dict[phenotype] = env_dict
        rep_dict[rep_i] = res_dict

    res_hdf5_file = '/Users/bjarnivilhjalmsson/data/tmp/leave_%d_BLUP_results_kthres_%0.1f.hdf5' % (
        num_cvs, k_thres)
    # Explicit 'w' mode: the default mode differs between h5py versions, and
    # appending to an existing results file would fail on create_group.  The
    # context manager closes the file even if writing raises.
    with h5py.File(res_hdf5_file, 'w') as h5f:
        for rep_i in range(num_repeats):
            res_dict = rep_dict[rep_i]
            rep_g = h5f.create_group('repl_%d' % rep_i)
            for phenotype in phenotypes:
                phen_g = rep_g.create_group(phenotype)
                for env in envs:
                    d = res_dict[phenotype][env]
                    env_g = phen_g.create_group(env)
                    env_g.create_dataset('R2', data=[d['R2']])
                    env_g.create_dataset('corr', data=[d['corr']])
                    env_g.create_dataset('obs_y', data=d['obs_y'])
                    env_g.create_dataset('pred_y', data=d['pred_y'])
                    env_g.create_dataset('avg_herit', data=[d['avg_herit']])
def perform_cegs_gwas(kinship_type='ibd', phen_type='medians'):
    """
    Perform a simple MLM GWAS for the 8 traits

    The 8 traits are the 4 phenotypes in each of the 2 environments.  For
    each trait an EMMAX F-test scan is run with the chosen kinship as random
    effect, and a QQ-plot and a Manhattan plot (with a dashed line at the
    0.05 Bonferroni threshold) are written as files.

    NOTE(review): a later function with the same name is defined further down
    in this file; at import time that definition replaces this one.

    Parameters:
        kinship_type: 'ibs' or 'ibd'; selects ``kinship.calc_ibs_kinship``
            or ``kinship.calc_ibd_kinship``.
        phen_type: 'means' or 'medians'; selects the ``'Y_means'`` or
            ``'Y_medians'`` entry of the coordinated data.

    Returns:
        None.

    Raises:
        NotImplementedError: If ``kinship_type`` or ``phen_type`` is not one
            of the supported values.  This is checked before any data is
            loaded.
    """
    import hdf5_data
    import kinship
    import linear_models as lm
    import time
    import scipy as sp
    from matplotlib import pyplot as plt
    import analyze_gwas_results as agr

    # Fail fast, and with a real exception class: `raise NotImplemented`
    # raises a TypeError because NotImplemented is not an exception.
    if kinship_type not in ('ibs', 'ibd'):
        raise NotImplementedError(
            'Unsupported kinship_type: %r' % (kinship_type,))
    if phen_type not in ('means', 'medians'):
        raise NotImplementedError('Unsupported phen_type: %r' % (phen_type,))
    phen_key = 'Y_means' if phen_type == 'means' else 'Y_medians'

    phen_dict = hdf5_data.parse_cegs_drosophila_phenotypes()
    phenotypes = ['Protein', 'Sugar', 'Triglyceride', 'weight']
    envs = ['mated', 'virgin']

    for phenotype in phenotypes:
        for env in envs:
            # Single-argument print(...) behaves identically as a Python 2
            # print statement and as a Python 3 function call.
            print('%s %s' % (phenotype, env))
            s1 = time.time()
            d = hdf5_data.coordinate_cegs_genotype_phenotype(
                phen_dict, phenotype, env)

            print('Calculating kinship')
            if kinship_type == 'ibs':
                K = kinship.calc_ibs_kinship(d['snps'])
            else:
                K = kinship.calc_ibd_kinship(d['snps'])

            lmm = lm.LinearMixedModel(d[phen_key])
            lmm.add_random_effect(K)

            print('Running EMMAX')
            res = lmm.emmax_f_test(d['snps'], emma_num=1000)
            print('Mean p-value: %s' % (sp.mean(res['ps']),))

            secs = time.time() - s1
            if secs > 60:
                # Explicit floor division: whole minutes on any Python.
                mins = int(secs) // 60
                secs = secs - mins * 60
                print('Took %d mins and %f seconds.' % (mins, secs))
            else:
                print('Took %f seconds.' % (secs))

            # Now generating QQ-plots
            label_str = '%s_%s_%s_%s' % (
                kinship_type, phenotype, env, phen_type)
            agr.plot_simple_qqplots_pvals(
                '/Users/bjarnivilhjalmsson/data/tmp/cegs_qq_%s' % (label_str),
                [res['ps']], result_labels=[label_str],
                line_colors=['green'], num_dots=1000, title=None,
                max_neg_log_val=6)

            # Build the Manhattan plot coordinates: chromosomes are laid out
            # side by side along the x-axis, each shifted by the largest
            # position of the chromosomes before it.
            chromosomes = d['positions'][:, 0]
            positions = sp.array(d['positions'][:, 1], 'int32')
            unique_chroms = sp.unique(chromosomes)
            x_positions = []
            y_log_pvals = []
            colors = []
            x_shift = 0
            for chrom in unique_chroms:
                # Two colours, chosen by which list the chromosome name
                # belongs to.
                if chrom in ['2L', '2LHet', '3L', '3LHet', '4', 'X', 'XHet']:
                    colors.append('c')
                else:
                    # chrom in ['2R', '2RHet', '3R', '3RHet', 'U', 'Uextra']
                    colors.append('m')
                chrom_filter = sp.in1d(chromosomes, chrom)
                positions_slice = positions[chrom_filter]
                x_positions.append(positions_slice + x_shift)
                x_shift += positions_slice.max()
                log_ps_slice = -sp.log10(res['ps'][chrom_filter])
                y_log_pvals.append(log_ps_slice)

            # Bonferroni threshold at 0.05: -log10(0.05 / m).
            m = len(positions)
            log_bonf = -sp.log10(1 / (20.0 * m))
            print('%s %s' % (m, log_bonf))

            # Plot the Manhattan plot.  The figure is closed afterwards so
            # that figures do not pile up in pyplot across the traits.
            fig = plt.figure(figsize=(12, 4))
            try:
                plt.axes([0.03, 0.1, 0.95, 0.8])
                for xs, ys, color in zip(x_positions, y_log_pvals, colors):
                    plt.plot(xs, ys, c=color, ls='', marker='.')
                xmin, xmax = plt.xlim()
                plt.hlines(log_bonf, xmin, xmax, colors='k',
                           linestyles='--', alpha=0.5)
                plt.title('%s, %s' % (phenotype, env))
                plt.savefig(
                    '/Users/bjarnivilhjalmsson/data/tmp/cegs_gwas_%s_%s_%s_%s.png' %
                    (kinship_type, phenotype, env, phen_type))
            finally:
                plt.close(fig)
def perform_cegs_gwas(kinship_type='ibd', phen_type='medians'):
    """
    Perform a simple MLM GWAS for the 8 traits

    The 8 traits are the 4 phenotypes in each of the 2 environments.  For
    each trait an EMMAX F-test scan is run with the chosen kinship as random
    effect, and a QQ-plot and a Manhattan plot (with a dashed line at the
    0.05 Bonferroni threshold) are written as files.

    Parameters:
        kinship_type: 'ibs' or 'ibd'; selects ``kinship.calc_ibs_kinship``
            or ``kinship.calc_ibd_kinship``.
        phen_type: 'means' or 'medians'; selects the ``'Y_means'`` or
            ``'Y_medians'`` entry of the coordinated data.

    Returns:
        None.

    Raises:
        NotImplementedError: If ``kinship_type`` or ``phen_type`` is not one
            of the supported values.  This is checked before any data is
            loaded.
    """
    import hdf5_data
    import kinship
    import linear_models as lm
    import time
    import scipy as sp
    from matplotlib import pyplot as plt
    import analyze_gwas_results as agr

    # Validate up front so a bad argument does not cost a full data load and
    # kinship calculation before it is reported.
    if kinship_type not in ('ibs', 'ibd'):
        raise NotImplementedError(
            'Unsupported kinship_type: %r' % (kinship_type,))
    if phen_type not in ('means', 'medians'):
        raise NotImplementedError('Unsupported phen_type: %r' % (phen_type,))
    phen_key = 'Y_means' if phen_type == 'means' else 'Y_medians'

    phen_dict = hdf5_data.parse_cegs_drosophila_phenotypes()
    phenotypes = ['Protein', 'Sugar', 'Triglyceride', 'weight']
    envs = ['mated', 'virgin']

    for phenotype in phenotypes:
        for env in envs:
            # Single-argument print(...) behaves identically as a Python 2
            # print statement and as a Python 3 function call.
            print('%s %s' % (phenotype, env))
            s1 = time.time()
            d = hdf5_data.coordinate_cegs_genotype_phenotype(
                phen_dict, phenotype, env)

            print('Calculating kinship')
            if kinship_type == 'ibs':
                K = kinship.calc_ibs_kinship(d['snps'])
            else:
                K = kinship.calc_ibd_kinship(d['snps'])

            lmm = lm.LinearMixedModel(d[phen_key])
            lmm.add_random_effect(K)

            print('Running EMMAX')
            res = lmm.emmax_f_test(d['snps'], emma_num=1000)
            print('Mean p-value: %s' % (sp.mean(res['ps']),))

            secs = time.time() - s1
            if secs > 60:
                # Explicit floor division: whole minutes on any Python.
                mins = int(secs) // 60
                secs = secs - mins * 60
                print('Took %d mins and %f seconds.' % (mins, secs))
            else:
                print('Took %f seconds.' % (secs))

            # Now generating QQ-plots
            label_str = '%s_%s_%s_%s' % (
                kinship_type, phenotype, env, phen_type)
            agr.plot_simple_qqplots_pvals(
                '/Users/bjarnivilhjalmsson/data/tmp/cegs_qq_%s' % (label_str),
                [res['ps']], result_labels=[label_str],
                line_colors=['green'], num_dots=1000, title=None,
                max_neg_log_val=6)

            # Build the Manhattan plot coordinates: chromosomes are laid out
            # side by side along the x-axis, each shifted by the largest
            # position of the chromosomes before it.
            chromosomes = d['positions'][:, 0]
            positions = sp.array(d['positions'][:, 1], 'int32')
            unique_chroms = sp.unique(chromosomes)
            x_positions = []
            y_log_pvals = []
            colors = []
            x_shift = 0
            for chrom in unique_chroms:
                # Two colours, chosen by which list the chromosome name
                # belongs to.
                if chrom in ['2L', '2LHet', '3L', '3LHet', '4', 'X', 'XHet']:
                    colors.append('c')
                else:
                    # chrom in ['2R', '2RHet', '3R', '3RHet', 'U', 'Uextra']
                    colors.append('m')
                chrom_filter = sp.in1d(chromosomes, chrom)
                positions_slice = positions[chrom_filter]
                x_positions.append(positions_slice + x_shift)
                x_shift += positions_slice.max()
                log_ps_slice = -sp.log10(res['ps'][chrom_filter])
                y_log_pvals.append(log_ps_slice)

            # Bonferroni threshold at 0.05: -log10(0.05 / m).
            m = len(positions)
            log_bonf = -sp.log10(1 / (20.0 * m))
            print('%s %s' % (m, log_bonf))

            # Plot the Manhattan plot.  The figure is closed afterwards so
            # that figures do not pile up in pyplot across the traits.
            fig = plt.figure(figsize=(12, 4))
            try:
                plt.axes([0.03, 0.1, 0.95, 0.8])
                for xs, ys, color in zip(x_positions, y_log_pvals, colors):
                    plt.plot(xs, ys, c=color, ls='', marker='.')
                xmin, xmax = plt.xlim()
                plt.hlines(log_bonf, xmin, xmax, colors='k',
                           linestyles='--', alpha=0.5)
                plt.title('%s, %s' % (phenotype, env))
                plt.savefig(
                    '/Users/bjarnivilhjalmsson/data/tmp/cegs_gwas_%s_%s_%s_%s.png' %
                    (kinship_type, phenotype, env, phen_type))
            finally:
                plt.close(fig)