def main():
    """Command-line entry point: parse ``sys.argv`` and run one LDpred step.

    When the program is started without any option, an error line and the
    parser help are printed and the process exits with status 1.  Otherwise
    the parsed options are turned into a dict and handed to the routine
    selected by ``ldpred_action``.  The ``'all'`` action, like any action
    without a handler, currently does nothing.
    """
    if len(sys.argv) == 1:
        print('ERROR: No options provided.\n')
        parser.print_help(sys.stderr)
        sys.exit(1)

    p_dict = vars(parser.parse_args())
    if p_dict['debug']:
        print('Parsed parameters:')
        print(p_dict)

    # Handlers are wrapped in lambdas so that module attributes and option
    # keys are only looked up for the action that is actually requested.
    handlers = {
        'coord': lambda: coord_genotypes.main(p_dict),
        'gibbs': lambda: LDpred_gibbs.main(p_dict),
        'inf': lambda: LDpred_inf.main(p_dict),
        'p+t': lambda: LD_pruning_thres.main(p_dict),
        'score': lambda: validate.main(p_dict),
        'ldfile': lambda: ld.get_ld_dict(p_dict['cf'], p_dict['ldf'],
                                         p_dict['ldr'], wallaceld=True),
    }
    run_step = handlers.get(p_dict['ldpred_action'])
    if run_step is not None:
        run_step()
def main_with_args(args):
    """Run one LDpred step from an explicit argument list.

    Prints the title banner first.  With an empty ``args`` only the usage
    line and the description are printed and the function returns.  Otherwise
    ``args`` is parsed and the routine selected by ``ldpred_action`` is
    called with the options dict.  ``'all'`` is accepted but does nothing;
    any other unhandled action prints the help to stderr and exits with
    status 1.
    """
    print(title_string)
    if len(args) == 0:
        parser.print_usage()
        print(description_string)
        return

    p_dict = vars(parser.parse_args(args))
    if p_dict['debug']:
        print('Parsed parameters:')
        print(p_dict)

    # Lazily evaluated handlers: nothing is resolved until an action matches.
    handlers = {
        'coord': lambda: coord_genotypes.main(p_dict),
        'gibbs': lambda: LDpred_gibbs.main(p_dict),
        'inf': lambda: LDpred_inf.main(p_dict),
        'p+t': lambda: LD_pruning_thres.main(p_dict),
        'score': lambda: validate.main(p_dict),
    }
    action = p_dict['ldpred_action']
    run_step = handlers.get(action)
    if run_step is not None:
        run_step()
    elif action != 'all':
        parser.print_help(sys.stderr)
        sys.exit(1)
def main():
    """Parse the command line and run the LDpred step named by ``ldpred_action``.

    The parsed options are converted to a dict that is passed unchanged to
    the selected routine.  ``'all'`` and any action without a handler do
    nothing.
    """
    p_dict = vars(parser.parse_args())
    if p_dict['debug']:
        print('Parsed parameters:')
        print(p_dict)

    # Lazily evaluated handlers: nothing is resolved until an action matches.
    handlers = {
        'coord': lambda: coord_genotypes.main(p_dict),
        'gibbs': lambda: LDpred_gibbs.main(p_dict),
        'inf': lambda: LDpred_inf.main(p_dict),
        'p+t': lambda: LD_pruning_thres.main(p_dict),
        'score': lambda: validate.main(p_dict),
    }
    run_step = handlers.get(p_dict['ldpred_action'])
    if run_step is not None:
        run_step()
def main():
    """Print the banner, parse the command line and run one LDpred step.

    Started without any option, the parser help is written to stderr and the
    process exits with status 1.  Otherwise the options dict is passed to the
    routine selected by ``ldpred_action``; ``'all'`` and any action without a
    handler do nothing.
    """
    print(title_string)
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)

    p_dict = vars(parser.parse_args())
    if p_dict['debug']:
        print('Parsed parameters:')
        print(p_dict)

    # Lazily evaluated handlers: nothing is resolved until an action matches.
    handlers = {
        'coord': lambda: coord_genotypes.main(p_dict),
        'gibbs': lambda: LDpred_gibbs.main(p_dict),
        'inf': lambda: LDpred_inf.main(p_dict),
        'p+t': lambda: LD_pruning_thres.main(p_dict),
        'score': lambda: validate.main(p_dict),
    }
    run_step = handlers.get(p_dict['ldpred_action'])
    if run_step is not None:
        run_step()
def main():
    """Entry point: show the title, parse ``sys.argv`` and dispatch on the action.

    Without any command-line option the help text goes to stderr and the
    process exits with status 1.  The parsed options are handed, as a dict,
    to the routine matching ``ldpred_action``.  Nothing happens for ``'all'``
    or for an action that has no handler.
    """
    print(title_string)
    no_options_given = len(sys.argv) == 1
    if no_options_given:
        parser.print_help(sys.stderr)
        sys.exit(1)

    parameters = parser.parse_args()
    p_dict = vars(parameters)
    if p_dict['debug']:
        print('Parsed parameters:')
        print(p_dict)

    # Each entry defers its lookup until the matching action is requested.
    dispatch = {
        'coord': lambda: coord_genotypes.main(p_dict),
        'gibbs': lambda: LDpred_gibbs.main(p_dict),
        'inf': lambda: LDpred_inf.main(p_dict),
        'p+t': lambda: LD_pruning_thres.main(p_dict),
        'score': lambda: validate.main(p_dict),
    }
    selected = dispatch.get(p_dict['ldpred_action'])
    if selected is not None:
        selected()
def ldpred_genomewide(data_file=None, ld_radius=None, ld_dict=None,
                      out_file_prefix=None, summary_dict=None, ps=None, n=None,
                      h2=None, num_iter=None, verbose=False,
                      zero_jump_prob=0.05, burn_in=5):
    """
    Calculate LDpred for a genome

    For every chromosome present in the ``cord_data`` group of ``data_file``
    the LDpred-inf effects are computed once, then the Gibbs sampler is run
    for every fraction in ``ps``.

    Parameters:
        data_file: path of the HDF5 file; it must hold a ``cord_data`` group
            and may hold validation phenotypes under ``y``.
        ld_radius: LD radius forwarded to the samplers (the LDpred-inf window
            is ``2 * ld_radius``).
        ld_dict: dict with the keys ``ld_scores_dict``, ``chrom_ld_dict`` and
            ``chrom_ref_ld_mats`` and, optionally, ``chrom_ld_boundaries``.
        out_file_prefix: prefix of the weight files.  One
            ``<prefix>_LDpred_p<p>.txt`` file is written per fraction plus a
            single ``<prefix>_LDpred-inf.txt``.  When falsy no file is written.
        summary_dict: dict receiving the entries ``2.0`` (fractions used) and
            ``2.1`` (convergence flags); also passed to the heritability
            estimation.
        ps: iterable of fractions handed to the Gibbs sampler as ``p``.
        n: sample size handed to the heritability estimation and the samplers.
        h2: heritability handed to ``ld.get_chromosome_herits``.
        num_iter, burn_in, zero_jump_prob: forwarded to ``ldpred_gibbs``.
        verbose: print per-chromosome details and, when phenotypes are
            available, prediction accuracy, AUC and calibration slope.

    Returns:
        None.  Results are written to files, to stdout and to ``summary_dict``.
    """
    # The context manager guarantees the HDF5 file is closed, also on errors.
    with h5py.File(data_file, 'r') as df:
        has_phenotypes = 'y' in df
        if has_phenotypes:
            if verbose:
                print('Validation phenotypes found.')
            y = df['y'][...]  # Phenotype
            num_individs = len(y)

        ld_scores_dict = ld_dict['ld_scores_dict']
        chrom_ld_dict = ld_dict['chrom_ld_dict']
        chrom_ref_ld_mats = ld_dict['chrom_ref_ld_mats']

        print('Applying LDpred with LD radius: %d' % ld_radius)
        results_dict = {}
        cord_data_g = df['cord_data']

        # Calculating genome-wide heritability using LD score regression,
        # and partition heritability by chromosomes
        herit_dict = ld.get_chromosome_herits(
            cord_data_g, ld_scores_dict, n, h2=h2, debug=verbose,
            summary_dict=summary_dict)

        LDpred_inf_chrom_dict = {}
        print('Calculating LDpred-inf weights')
        for chrom_str in util.chromosomes_list:
            if chrom_str not in cord_data_g:
                continue
            print('Calculating SNP weights for Chromosome %s' %
                  ((chrom_str.split('_'))[1]))
            g = cord_data_g[chrom_str]

            # Filter monomorphic SNPs
            snp_stds = g['snp_stds_ref'][...]
            snp_stds = snp_stds.flatten()
            ok_snps_filter = snp_stds > 0
            pval_derived_betas = g['betas'][...]
            pval_derived_betas = pval_derived_betas[ok_snps_filter]
            h2_chrom = herit_dict[chrom_str]
            start_betas = LDpred_inf.ldpred_inf(
                pval_derived_betas, genotypes=None,
                reference_ld_mats=chrom_ref_ld_mats[chrom_str], h2=h2_chrom,
                n=n, ld_window_size=2 * ld_radius, verbose=False)
            LDpred_inf_chrom_dict[chrom_str] = start_betas

        convergence_report = {}
        for p in ps:
            convergence_report[p] = False
            print('Starting LDpred gibbs with f=%0.4f' % p)
            p_str = '%0.4f' % p
            results_dict[p_str] = {}

            if has_phenotypes:
                # Start every fraction from zero; otherwise the risk scores
                # of earlier fractions leak into the current one.
                risk_scores_pval_derived = sp.zeros(num_individs)

            if out_file_prefix:
                # Preparing output files
                raw_effect_sizes = []
                ldpred_effect_sizes = []
                ldpred_inf_effect_sizes = []
                out_sids = []
                chromosomes = []
                out_positions = []
                out_nts = []

            chrom_i = 0
            num_chrom = len(util.chromosomes_list)
            for chrom_str in util.chromosomes_list:
                chrom_i += 1
                if chrom_str not in cord_data_g:
                    continue
                g = cord_data_g[chrom_str]
                if verbose and has_phenotypes:
                    if 'raw_snps_val' in g:
                        raw_snps = g['raw_snps_val'][...]
                    else:
                        raw_snps = g['raw_snps_ref'][...]

                # Filter monomorphic SNPs
                snp_stds = g['snp_stds_ref'][...]
                snp_stds = snp_stds.flatten()
                pval_derived_betas = g['betas'][...]
                positions = g['positions'][...]
                sids = (g['sids'][...]).astype(util.sids_u_dtype)
                log_odds = g['log_odds'][...]
                nts = (g['nts'][...]).astype(util.nts_u_dtype)
                ok_snps_filter = snp_stds > 0
                if not sp.all(ok_snps_filter):
                    snp_stds = snp_stds[ok_snps_filter]
                    pval_derived_betas = pval_derived_betas[ok_snps_filter]
                    positions = positions[ok_snps_filter]
                    sids = sids[ok_snps_filter]
                    log_odds = log_odds[ok_snps_filter]
                    nts = nts[ok_snps_filter]
                    if verbose and has_phenotypes:
                        raw_snps = raw_snps[ok_snps_filter]

                if out_file_prefix:
                    chromosomes.extend([chrom_str] * len(pval_derived_betas))
                    out_positions.extend(positions)
                    out_sids.extend(sids)
                    raw_effect_sizes.extend(log_odds)
                    out_nts.extend(nts)

                h2_chrom = herit_dict[chrom_str]
                # None is the sampler's default, i.e. "use the LD radius".
                if 'chrom_ld_boundaries' in ld_dict:
                    ld_boundaries = ld_dict['chrom_ld_boundaries'][chrom_str]
                else:
                    ld_boundaries = None
                res_dict = ldpred_gibbs(
                    pval_derived_betas, h2=h2_chrom, n=n, p=p,
                    ld_radius=ld_radius, verbose=verbose, num_iter=num_iter,
                    burn_in=burn_in, ld_dict=chrom_ld_dict[chrom_str],
                    start_betas=LDpred_inf_chrom_dict[chrom_str],
                    ld_boundaries=ld_boundaries,
                    zero_jump_prob=zero_jump_prob)

                updated_betas = res_dict['betas']
                updated_inf_betas = res_dict['inf_betas']
                sum_sqr_effects = sp.sum(updated_betas**2)
                if sum_sqr_effects > herit_dict['gw_h2_ld_score_est']:
                    print('Sum of squared updated effects estimates seems too large: %0.4f'
                          % sum_sqr_effects)
                    print('This suggests that the Gibbs sampler did not convergence.')
                    convergence_report[p] = True

                if verbose:
                    print('Calculating SNP weights for Chromosome %s' %
                          ((chrom_str.split('_'))[1]))
                else:
                    # chrom_i is already 1-based at this point.
                    sys.stdout.write(
                        '\b\b\b\b\b\b\b%0.2f%%' %
                        (100.0 * (min(1, float(chrom_i) / num_chrom))))
                    sys.stdout.flush()

                updated_betas = updated_betas / (snp_stds.flatten())
                updated_inf_betas = updated_inf_betas / (snp_stds.flatten())
                if out_file_prefix:
                    ldpred_effect_sizes.extend(updated_betas)
                    ldpred_inf_effect_sizes.extend(updated_inf_betas)
                if verbose and has_phenotypes:
                    prs = sp.dot(updated_betas, raw_snps)
                    risk_scores_pval_derived += prs
                    corr = sp.corrcoef(y, prs)[0, 1]
                    r2 = corr**2
                    print('The R2 prediction accuracy of PRS using %s was: %0.4f'
                          % (chrom_str, r2))

            if verbose and has_phenotypes:
                num_indivs = len(y)
                results_dict[p_str]['y'] = y
                results_dict[p_str]['risk_scores_pd'] = risk_scores_pval_derived
                print('Prediction accuracy was assessed using %d individuals.'
                      % (num_indivs))

                corr = sp.corrcoef(y, risk_scores_pval_derived)[0, 1]
                r2 = corr**2
                results_dict[p_str]['r2_pd'] = r2
                print('The R2 prediction accuracy (observed scale) for the whole genome was: %0.4f (%0.6f)'
                      % (r2, ((1 - r2)**2) / num_indivs))

                if corr < 0:
                    risk_scores_pval_derived = -1 * risk_scores_pval_derived
                auc = util.calc_auc(y, risk_scores_pval_derived)
                print('AUC for the whole genome was: %0.4f' % auc)

                # Now calibration
                denominator = sp.dot(risk_scores_pval_derived.T,
                                     risk_scores_pval_derived)
                y_norm = (y - sp.mean(y)) / sp.std(y)
                numerator = sp.dot(risk_scores_pval_derived.T, y_norm)
                regression_slope = (numerator / denominator)
                print('The slope for predictions with P-value derived effects is: %0.4f'
                      % regression_slope)
                results_dict[p_str]['slope_pd'] = regression_slope

            if out_file_prefix:
                weights_out_file = '%s_LDpred_p%0.4e.txt' % (out_file_prefix, p)
                with open(weights_out_file, 'w') as f:
                    f.write('chrom pos sid nt1 nt2 raw_beta ldpred_beta\n')
                    for chrom, pos, sid, nt, raw_beta, ldpred_beta in zip(
                            chromosomes, out_positions, out_sids, out_nts,
                            raw_effect_sizes, ldpred_effect_sizes):
                        nt1, nt2 = nt[0], nt[1]
                        f.write('%s %d %s %s %s %0.4e %0.4e\n' %
                                (chrom, pos, sid, nt1, nt2, raw_beta,
                                 ldpred_beta))

        if out_file_prefix:
            # The LDpred-inf weights collected during the last fraction are
            # the ones written out.
            weights_out_file = '%s_LDpred-inf.txt' % (out_file_prefix)
            with open(weights_out_file, 'w') as f:
                f.write('chrom pos sid nt1 nt2 raw_beta ldpred_inf_beta \n')
                for chrom, pos, sid, nt, raw_beta, ldpred_inf_beta in zip(
                        chromosomes, out_positions, out_sids, out_nts,
                        raw_effect_sizes, ldpred_inf_effect_sizes):
                    nt1, nt2 = nt[0], nt[1]
                    f.write('%s %d %s %s %s %0.4e %0.4e\n' %
                            (chrom, pos, sid, nt1, nt2, raw_beta,
                             ldpred_inf_beta))

    summary_dict[2.0] = {
        'name': 'Gibbs sampler fractions used',
        'value': str(ps)
    }
    summary_dict[2.1] = {
        'name': 'Convergence issues (for each fraction)',
        'value': str(['Yes' if convergence_report[p] else 'No' for p in ps])
    }
def ldpred_gibbs(beta_hats, genotypes=None, start_betas=None, h2=None, n=1000,
                 ld_radius=100, num_iter=60, burn_in=10, p=None,
                 zero_jump_prob=0.05, tight_sampling=False, ld_dict=None,
                 reference_ld_mats=None, ld_boundaries=None, verbose=False):
    """
    LDpred (Gibbs Sampler)

    Parameters:
        beta_hats: marginal effect estimates, one per SNP.
        genotypes, reference_ld_mats: only forwarded to
            ``LDpred_inf.ldpred_inf`` when ``start_betas`` is not given.
        start_betas: initial effects.  When None they are computed with
            ``LDpred_inf.ldpred_inf``.  The values are copied before sampling.
        h2, n, p: heritability, sample size and fraction used to set up the
            shrinkage constants.
        ld_radius: number of SNPs on each side of the focal SNP that make up
            the local window when ``ld_boundaries`` is None.
        num_iter, burn_in: number of sweeps and number of initial sweeps that
            are left out of the average.
        zero_jump_prob, tight_sampling: control the factor ``alpha`` that
            scales the probability of sampling a non-zero effect.
        ld_dict: maps the SNP index to its local LD vector.
        ld_boundaries: optional per-SNP ``(start, stop)`` window indices.
        verbose: print progress and timing.

    Returns:
        dict with ``'betas'`` (posterior means averaged over the sweeps after
        burn-in) and ``'inf_betas'`` (the starting effects).
    """
    t0 = time.time()
    m = len(beta_hats)
    n = float(n)

    # If no starting values for effects were given, then use the
    # infinitesimal model starting values.  This must not depend on
    # `verbose`: only the messages do.
    if start_betas is None:
        if verbose:
            print('Initializing LDpred effects with posterior mean LDpred-inf effects.')
            print('Calculating LDpred-inf effects.')
        start_betas = LDpred_inf.ldpred_inf(
            beta_hats, genotypes=genotypes,
            reference_ld_mats=reference_ld_mats, h2=h2, n=n,
            ld_window_size=2 * ld_radius, verbose=False)
    curr_betas = sp.copy(start_betas)
    assert len(curr_betas) == m, \
        'Betas returned by LDpred_inf do not have the same length as expected.'
    curr_post_means = sp.zeros(m)
    avg_betas = sp.zeros(m)

    # Iterating over effect estimates in sequential order
    iter_order = sp.arange(m)

    # Setting up the marginal Bayes shrink
    Mp = m * p
    hdmp = (h2 / Mp)
    hdmpn = hdmp + 1.0 / n
    hdmp_hdmpn = (hdmp / hdmpn)
    c_const = (p / sp.sqrt(hdmpn))
    d_const = (1.0 - p) / (sp.sqrt(1.0 / n))

    for k in range(num_iter):  # Big iteration
        # Force an alpha shrink if estimates are way off compared to
        # heritability estimates. (Improves MCMC convergence.)
        h2_est = max(0.00001, sp.sum(curr_betas**2))
        if tight_sampling:
            alpha = min(1.0 - zero_jump_prob, 1.0 / h2_est,
                        (h2 + 1.0 / sp.sqrt(n)) / h2_est)
        else:
            alpha = 1.0 - zero_jump_prob

        rand_ps = sp.random.random(m)
        # NOTE(review): the second argument of norm.rvs is a scale (standard
        # deviation); the value passed here looks like a variance - confirm.
        rand_norms = stats.norm.rvs(0.0, (hdmp_hdmpn) * (1.0 / n), size=m)

        for i, snp_i in enumerate(iter_order):
            if ld_boundaries is None:
                start_i = max(0, snp_i - ld_radius)
                focal_i = min(ld_radius, snp_i)
                stop_i = min(m, snp_i + ld_radius + 1)
            else:
                start_i = ld_boundaries[snp_i][0]
                stop_i = ld_boundaries[snp_i][1]
                focal_i = snp_i - start_i

            # Local LD matrix
            D_i = ld_dict[snp_i]

            # Local (most recently updated) effect estimates.  This is a view
            # into curr_betas, so zeroing the focal entry also zeroes it there
            # until the new value is stored at the end of the iteration.
            local_betas = curr_betas[start_i:stop_i]

            # Calculate the local posterior mean, used when sampling.
            local_betas[focal_i] = 0.0
            res_beta_hat_i = beta_hats[snp_i] - sp.dot(D_i, local_betas)
            b2 = res_beta_hat_i**2

            d_const_b2_exp = d_const * sp.exp(-b2 * n / 2.0)
            if sp.isreal(d_const_b2_exp):
                numerator = c_const * sp.exp(-b2 / (2.0 * hdmpn))
                if sp.isreal(numerator):
                    if numerator == 0.0:
                        postp = 0.0
                    else:
                        postp = numerator / (numerator + d_const_b2_exp)
                        assert sp.isreal(postp), \
                            'The posterior mean is not a real number? Possibly due to problems with summary stats, LD estimates, or parameter settings.'
                else:
                    postp = 0.0
            else:
                postp = 1.0
            curr_post_means[snp_i] = hdmp_hdmpn * postp * res_beta_hat_i

            if rand_ps[i] < postp * alpha:
                # Sample from the posterior Gaussian dist.
                proposed_beta = rand_norms[i] + hdmp_hdmpn * res_beta_hat_i
            else:
                # Sample 0
                proposed_beta = 0.0

            curr_betas[snp_i] = proposed_beta  # UPDATE BETA

        if verbose:
            sys.stdout.write('\b\b\b\b\b\b\b%0.2f%%' %
                             (100.0 * (min(1, float(k + 1) / num_iter))))
            sys.stdout.flush()

        if k >= burn_in:
            # Averaging over the posterior means instead of samples.
            avg_betas += curr_post_means

    avg_betas = avg_betas / float(num_iter - burn_in)
    t1 = time.time()
    t = (t1 - t0)
    if verbose:
        print('\nTook %d minutes and %0.2f seconds' % (t / 60, t % 60))
    return {'betas': avg_betas, 'inf_betas': start_betas}
def ldpred_fast_genomewide(data_file=None, ld_radius=None, ld_dict=None,
                           out_file_prefix=None, ps=None, n=None, h2=None,
                           use_gw_h2=False, eff_type=None, summary_dict=None,
                           debug=False):
    """
    Calculate LDpred for a genome

    Computes LDpred-inf effects for every chromosome found in the
    ``cord_data`` group of ``data_file``, derives the LDpred-fast effects for
    every fraction in ``ps`` from them and writes the results with
    ``write_res_dict_2_file``.

    Parameters:
        data_file: path of the HDF5 file holding the ``cord_data`` group.
        ld_radius: LD radius; the LDpred-inf window is ``2 * ld_radius``.
        ld_dict: dict providing ``chrom_ref_ld_mats``; also passed to
            ``get_herit_dict``.
        out_file_prefix: prefix handed to ``init_res_dict`` and
            ``write_res_dict_2_file``.
        ps: iterable of fractions, each handed to ``ldpred_fast`` as ``p_c``.
        n: sample size handed to ``coord_genotypes.get_mean_sample_size``.
        h2: heritability handed to ``get_herit_dict``.
        use_gw_h2: not used by this function.
        eff_type: effect type; ``'BLUP'`` is not implemented.
        summary_dict: dict receiving entry ``2.0`` (fractions used); also
            passed to ``get_herit_dict``.
        debug: print per-chromosome messages instead of a progress percentage.

    Raises:
        NotImplementedError: when ``eff_type`` is ``'BLUP'``.
    """
    print('Applying LDpred-fast')
    # The context manager guarantees the HDF5 file is closed, also on errors.
    with h5py.File(data_file, 'r') as df:
        cord_data_g = df['cord_data']
        mean_n = coord_genotypes.get_mean_sample_size(n, cord_data_g)

        herit_dict = get_herit_dict(df, eff_type, mean_n, h2, ld_dict=ld_dict,
                                    summary_dict=summary_dict, debug=debug)

        results_cdict = init_res_dict(out_file_prefix, cord_data_g)

        blup_betas_chrom_dict = {}
        if eff_type == 'BLUP':
            raise NotImplementedError

        chrom_ref_ld_mats = ld_dict['chrom_ref_ld_mats']
        print('Calculating LDpred-inf weights')
        for chrom_str in util.chromosomes_list:
            if chrom_str not in cord_data_g:
                continue
            if debug:
                print('Calculating LDpred-inf weights for Chromosome %s' %
                      ((chrom_str.split('_'))[1]))
            g = cord_data_g[chrom_str]

            # Filter monomorphic SNPs
            snp_stds = g['snp_stds_ref'][...]
            snp_stds = snp_stds.flatten()
            ok_snps_filter = snp_stds > 0
            pval_derived_betas = g['betas'][...]
            pval_derived_betas = pval_derived_betas[ok_snps_filter]
            h2_chrom = herit_dict[chrom_str]['h2']
            blup_betas = LDpred_inf.ldpred_inf(
                pval_derived_betas, genotypes=None,
                reference_ld_mats=chrom_ref_ld_mats[chrom_str], h2=h2_chrom,
                n=mean_n, ld_window_size=2 * ld_radius, verbose=debug)

            # NOTE(review): the effects are computed on the filtered SNPs but
            # rescaled with the unfiltered standard deviations; the shapes
            # only agree when no SNP was filtered out - confirm upstream.
            ldpred_inf_betas = blup_betas / snp_stds
            results_cdict[chrom_str]['ldpred_inf_betas'] = ldpred_inf_betas
            blup_betas_chrom_dict[chrom_str] = blup_betas

        chrom_i = 0
        num_chrom = len(cord_data_g.keys())
        for chrom_str in util.chromosomes_list:
            chrom_i += 1
            if chrom_str not in cord_data_g:
                continue
            g = cord_data_g[chrom_str]
            h2_chrom = herit_dict[chrom_str]['h2']
            ns = g['ns'][...]
            blup_betas = blup_betas_chrom_dict[chrom_str]
            # Use this chromosome's standard deviations; reusing the value
            # left over from the previous loop would apply the last
            # chromosome's scaling to every chromosome.
            snp_stds = g['snp_stds_ref'][...]
            snp_stds = snp_stds.flatten()
            if debug:
                print('Calculating LDpred-fast weights for Chromosome %s' %
                      ((chrom_str.split('_'))[1]))
            for p_c in ps:
                p_str = '%0.4f' % p_c
                ldpred_fast_betas = ldpred_fast(blup_betas, h2=h2_chrom,
                                                Ns=ns, p_c=p_c)
                results_cdict[chrom_str]['ldpred_fast_betas_dict'][p_str] = \
                    ldpred_fast_betas / snp_stds

            if not debug:
                sys.stdout.write(
                    '\r%0.2f%%' %
                    (100.0 * (min(1, float(chrom_i) / num_chrom))))
                sys.stdout.flush()

        write_res_dict_2_file(out_file_prefix, results_cdict, ps)

    if not debug:
        sys.stdout.write('\r%0.2f%%\n' % (100.0))
        sys.stdout.flush()

    summary_dict[2.0] = {
        'name': 'LDpred-fast fractions used',
        'value': str(ps)
    }
def ldpred_genomewide(data_file=None, ld_radius=None, ld_dict=None,
                      out_file_prefix=None, summary_dict=None, ps=None, n=None,
                      h2=None, num_iter=None, verbose=False,
                      zero_jump_prob=0.05, burn_in=5):
    """
    Calculate LDpred for a genome

    For every chromosome present in the ``cord_data`` group of ``data_file``
    the LDpred-inf effects are computed once, then the Gibbs sampler is run
    for every fraction in ``ps``.

    Parameters:
        data_file: path of the HDF5 file; it must hold a ``cord_data`` group
            and may hold validation phenotypes under ``y``.
        ld_radius: LD radius forwarded to the samplers (the LDpred-inf window
            is ``2 * ld_radius``).
        ld_dict: dict with the keys ``ld_scores_dict``, ``chrom_ld_dict`` and
            ``chrom_ref_ld_mats`` and, optionally, ``chrom_ld_boundaries``.
        out_file_prefix: prefix of the weight files.  One
            ``<prefix>_LDpred_p<p>.txt`` file is written per fraction plus a
            single ``<prefix>_LDpred-inf.txt``.  When falsy no file is written.
        summary_dict: dict receiving the entries ``2.0`` (fractions used) and
            ``2.1`` (convergence flags); also passed to the heritability
            estimation.
        ps: iterable of fractions handed to the Gibbs sampler as ``p``.
        n: sample size handed to the heritability estimation and the samplers.
        h2: heritability handed to ``ld.get_chromosome_herits``.
        num_iter, burn_in, zero_jump_prob: forwarded to ``ldpred_gibbs``.
        verbose: print per-chromosome details and, when phenotypes are
            available, prediction accuracy, AUC and calibration slope;
            otherwise a progress percentage is shown.

    Returns:
        None.  Results are written to files, to stdout and to ``summary_dict``.
    """
    # The context manager guarantees the HDF5 file is closed, also on errors.
    with h5py.File(data_file, 'r') as df:
        has_phenotypes = 'y' in df
        if has_phenotypes:
            if verbose:
                print('Validation phenotypes found.')
            y = df['y'][...]  # Phenotype
            num_individs = len(y)

        ld_scores_dict = ld_dict['ld_scores_dict']
        chrom_ld_dict = ld_dict['chrom_ld_dict']
        chrom_ref_ld_mats = ld_dict['chrom_ref_ld_mats']

        print('Applying LDpred with LD radius: %d' % ld_radius)
        results_dict = {}
        cord_data_g = df['cord_data']

        # Calculating genome-wide heritability using LD score regression,
        # and partition heritability by chromosomes
        herit_dict = ld.get_chromosome_herits(
            cord_data_g, ld_scores_dict, n, h2=h2, debug=verbose,
            summary_dict=summary_dict)

        LDpred_inf_chrom_dict = {}
        print('Calculating LDpred-inf weights')
        for chrom_str in util.chromosomes_list:
            if chrom_str not in cord_data_g:
                continue
            print('Calculating SNP weights for Chromosome %s' %
                  ((chrom_str.split('_'))[1]))
            g = cord_data_g[chrom_str]

            # Filter monomorphic SNPs
            snp_stds = g['snp_stds_ref'][...]
            snp_stds = snp_stds.flatten()
            ok_snps_filter = snp_stds > 0
            pval_derived_betas = g['betas'][...]
            pval_derived_betas = pval_derived_betas[ok_snps_filter]
            h2_chrom = herit_dict[chrom_str]
            start_betas = LDpred_inf.ldpred_inf(
                pval_derived_betas, genotypes=None,
                reference_ld_mats=chrom_ref_ld_mats[chrom_str], h2=h2_chrom,
                n=n, ld_window_size=2 * ld_radius, verbose=False)
            LDpred_inf_chrom_dict[chrom_str] = start_betas

        convergence_report = {}
        for p in ps:
            convergence_report[p] = False
            print('Starting LDpred gibbs with f=%0.4f' % p)
            p_str = '%0.4f' % p
            results_dict[p_str] = {}

            if has_phenotypes:
                # Start every fraction from zero; otherwise the risk scores
                # of earlier fractions leak into the current one.
                risk_scores_pval_derived = sp.zeros(num_individs)

            if out_file_prefix:
                # Preparing output files
                raw_effect_sizes = []
                ldpred_effect_sizes = []
                ldpred_inf_effect_sizes = []
                out_sids = []
                chromosomes = []
                out_positions = []
                out_nts = []

            chrom_i = 0
            num_chrom = len(util.chromosomes_list)
            for chrom_str in util.chromosomes_list:
                chrom_i += 1
                if chrom_str not in cord_data_g:
                    continue
                g = cord_data_g[chrom_str]
                if verbose and has_phenotypes:
                    if 'raw_snps_val' in g:
                        raw_snps = g['raw_snps_val'][...]
                    else:
                        raw_snps = g['raw_snps_ref'][...]

                # Filter monomorphic SNPs
                snp_stds = g['snp_stds_ref'][...]
                snp_stds = snp_stds.flatten()
                pval_derived_betas = g['betas'][...]
                positions = g['positions'][...]
                sids = (g['sids'][...]).astype(util.sids_u_dtype)
                log_odds = g['log_odds'][...]
                nts = (g['nts'][...]).astype(util.nts_u_dtype)
                ok_snps_filter = snp_stds > 0
                if not sp.all(ok_snps_filter):
                    snp_stds = snp_stds[ok_snps_filter]
                    pval_derived_betas = pval_derived_betas[ok_snps_filter]
                    positions = positions[ok_snps_filter]
                    sids = sids[ok_snps_filter]
                    log_odds = log_odds[ok_snps_filter]
                    nts = nts[ok_snps_filter]
                    if verbose and has_phenotypes:
                        raw_snps = raw_snps[ok_snps_filter]

                if out_file_prefix:
                    chromosomes.extend([chrom_str] * len(pval_derived_betas))
                    out_positions.extend(positions)
                    out_sids.extend(sids)
                    raw_effect_sizes.extend(log_odds)
                    out_nts.extend(nts)

                h2_chrom = herit_dict[chrom_str]
                # None is the sampler's default, i.e. "use the LD radius".
                if 'chrom_ld_boundaries' in ld_dict:
                    ld_boundaries = ld_dict['chrom_ld_boundaries'][chrom_str]
                else:
                    ld_boundaries = None
                res_dict = ldpred_gibbs(
                    pval_derived_betas, h2=h2_chrom, n=n, p=p,
                    ld_radius=ld_radius, verbose=verbose, num_iter=num_iter,
                    burn_in=burn_in, ld_dict=chrom_ld_dict[chrom_str],
                    start_betas=LDpred_inf_chrom_dict[chrom_str],
                    ld_boundaries=ld_boundaries,
                    zero_jump_prob=zero_jump_prob, print_progress=False)

                updated_betas = res_dict['betas']
                updated_inf_betas = res_dict['inf_betas']
                sum_sqr_effects = sp.sum(updated_betas ** 2)
                if sum_sqr_effects > herit_dict['gw_h2_ld_score_est']:
                    print('Sum of squared updated effects estimates seems too large: %0.4f'
                          % sum_sqr_effects)
                    print('This suggests that the Gibbs sampler did not convergence.')
                    convergence_report[p] = True

                if verbose:
                    print('Calculating SNP weights for Chromosome %s' %
                          ((chrom_str.split('_'))[1]))
                else:
                    # chrom_i is already 1-based at this point.
                    sys.stdout.write(
                        '\b\b\b\b\b\b\b%0.2f%%' %
                        (100.0 * (min(1, float(chrom_i) / num_chrom))))
                    sys.stdout.flush()

                updated_betas = updated_betas / (snp_stds.flatten())
                updated_inf_betas = updated_inf_betas / (snp_stds.flatten())
                if out_file_prefix:
                    ldpred_effect_sizes.extend(updated_betas)
                    ldpred_inf_effect_sizes.extend(updated_inf_betas)
                if verbose and has_phenotypes:
                    prs = sp.dot(updated_betas, raw_snps)
                    risk_scores_pval_derived += prs
                    corr = sp.corrcoef(y, prs)[0, 1]
                    r2 = corr ** 2
                    print('The R2 prediction accuracy of PRS using %s was: %0.4f'
                          % (chrom_str, r2))

            if not verbose:
                sys.stdout.write('\b\b\b\b\b\b\b%0.2f%%\n' % (100.0))
                sys.stdout.flush()
            if verbose and has_phenotypes:
                num_indivs = len(y)
                results_dict[p_str]['y'] = y
                results_dict[p_str]['risk_scores_pd'] = risk_scores_pval_derived
                print('Prediction accuracy was assessed using %d individuals.'
                      % (num_indivs))

                corr = sp.corrcoef(y, risk_scores_pval_derived)[0, 1]
                r2 = corr ** 2
                results_dict[p_str]['r2_pd'] = r2
                print('The R2 prediction accuracy (observed scale) for the whole genome was: %0.4f (%0.6f)'
                      % (r2, ((1 - r2) ** 2) / num_indivs))

                if corr < 0:
                    risk_scores_pval_derived = -1 * risk_scores_pval_derived
                auc = util.calc_auc(y, risk_scores_pval_derived)
                print('AUC for the whole genome was: %0.4f' % auc)

                # Now calibration
                denominator = sp.dot(risk_scores_pval_derived.T,
                                     risk_scores_pval_derived)
                y_norm = (y - sp.mean(y)) / sp.std(y)
                numerator = sp.dot(risk_scores_pval_derived.T, y_norm)
                regression_slope = (numerator / denominator)
                print('The slope for predictions with P-value derived effects is: %0.4f'
                      % regression_slope)
                results_dict[p_str]['slope_pd'] = regression_slope

            if out_file_prefix:
                weights_out_file = '%s_LDpred_p%0.4e.txt' % (out_file_prefix, p)
                with open(weights_out_file, 'w') as f:
                    f.write('chrom pos sid nt1 nt2 raw_beta ldpred_beta\n')
                    for chrom, pos, sid, nt, raw_beta, ldpred_beta in zip(
                            chromosomes, out_positions, out_sids, out_nts,
                            raw_effect_sizes, ldpred_effect_sizes):
                        nt1, nt2 = nt[0], nt[1]
                        f.write('%s %d %s %s %s %0.4e %0.4e\n' %
                                (chrom, pos, sid, nt1, nt2, raw_beta,
                                 ldpred_beta))

        if out_file_prefix:
            # The LDpred-inf weights collected during the last fraction are
            # the ones written out.
            weights_out_file = '%s_LDpred-inf.txt' % (out_file_prefix)
            with open(weights_out_file, 'w') as f:
                f.write('chrom pos sid nt1 nt2 raw_beta ldpred_inf_beta \n')
                for chrom, pos, sid, nt, raw_beta, ldpred_inf_beta in zip(
                        chromosomes, out_positions, out_sids, out_nts,
                        raw_effect_sizes, ldpred_inf_effect_sizes):
                    nt1, nt2 = nt[0], nt[1]
                    f.write('%s %d %s %s %s %0.4e %0.4e\n' %
                            (chrom, pos, sid, nt1, nt2, raw_beta,
                             ldpred_inf_beta))

    summary_dict[2.0] = {
        'name': 'Gibbs sampler fractions used',
        'value': str(ps)
    }
    summary_dict[2.1] = {
        'name': 'Convergence issues (for each fraction)',
        'value': str(['Yes' if convergence_report[p] else 'No' for p in ps])
    }
def ldpred_gibbs(beta_hats, genotypes=None, start_betas=None, h2=None, n=1000,
                 ld_radius=100, num_iter=60, burn_in=10, p=None,
                 zero_jump_prob=0.05, tight_sampling=False, ld_dict=None,
                 reference_ld_mats=None, ld_boundaries=None, verbose=False,
                 print_progress=True):
    """
    LDpred (Gibbs Sampler)

    Parameters:
        beta_hats: marginal effect estimates, one per SNP.
        genotypes, reference_ld_mats: only forwarded to
            ``LDpred_inf.ldpred_inf`` when ``start_betas`` is not given.
        start_betas: initial effects.  When None they are computed with
            ``LDpred_inf.ldpred_inf``.  The values are copied before sampling.
        h2, n, p: heritability, sample size and fraction used to set up the
            shrinkage constants.
        ld_radius: number of SNPs on each side of the focal SNP that make up
            the local window when ``ld_boundaries`` is None.
        num_iter, burn_in: number of sweeps and number of initial sweeps that
            are left out of the average.
        zero_jump_prob, tight_sampling: control the factor ``alpha`` that
            scales the probability of sampling a non-zero effect.
        ld_dict: maps the SNP index to its local LD vector.
        ld_boundaries: optional per-SNP ``(start, stop)`` window indices.
        verbose: print messages and timing.
        print_progress: together with ``verbose``, show a progress percentage.

    Returns:
        dict with ``'betas'`` (posterior means averaged over the sweeps after
        burn-in) and ``'inf_betas'`` (the starting effects).
    """
    t0 = time.time()
    m = len(beta_hats)
    n = float(n)

    # If no starting values for effects were given, then use the
    # infinitesimal model starting values.  This must not depend on
    # `verbose`: only the messages do.
    if start_betas is None:
        if verbose:
            print('Initializing LDpred effects with posterior mean LDpred-inf effects.')
            print('Calculating LDpred-inf effects.')
        start_betas = LDpred_inf.ldpred_inf(
            beta_hats, genotypes=genotypes,
            reference_ld_mats=reference_ld_mats, h2=h2, n=n,
            ld_window_size=2 * ld_radius, verbose=False)
    curr_betas = sp.copy(start_betas)
    assert len(curr_betas) == m, \
        'Betas returned by LDpred_inf do not have the same length as expected.'
    curr_post_means = sp.zeros(m)
    avg_betas = sp.zeros(m)

    # Iterating over effect estimates in sequential order
    iter_order = sp.arange(m)

    # Setting up the marginal Bayes shrink
    Mp = m * p
    hdmp = (h2 / Mp)
    hdmpn = hdmp + 1.0 / n
    hdmp_hdmpn = (hdmp / hdmpn)
    c_const = (p / sp.sqrt(hdmpn))
    d_const = (1.0 - p) / (sp.sqrt(1.0 / n))

    show_progress = verbose and print_progress
    for k in range(num_iter):  # Big iteration
        # Force an alpha shrink if estimates are way off compared to
        # heritability estimates. (Improves MCMC convergence.)
        h2_est = max(0.00001, sp.sum(curr_betas ** 2))
        if tight_sampling:
            alpha = min(1.0 - zero_jump_prob, 1.0 / h2_est,
                        (h2 + 1.0 / sp.sqrt(n)) / h2_est)
        else:
            alpha = 1.0 - zero_jump_prob

        rand_ps = sp.random.random(m)
        # NOTE(review): the second argument of norm.rvs is a scale (standard
        # deviation); the value passed here looks like a variance - confirm.
        rand_norms = stats.norm.rvs(0.0, (hdmp_hdmpn) * (1.0 / n), size=m)

        for i, snp_i in enumerate(iter_order):
            if ld_boundaries is None:
                start_i = max(0, snp_i - ld_radius)
                focal_i = min(ld_radius, snp_i)
                stop_i = min(m, snp_i + ld_radius + 1)
            else:
                start_i = ld_boundaries[snp_i][0]
                stop_i = ld_boundaries[snp_i][1]
                focal_i = snp_i - start_i

            # Local LD matrix
            D_i = ld_dict[snp_i]

            # Local (most recently updated) effect estimates.  This is a view
            # into curr_betas, so zeroing the focal entry also zeroes it there
            # until the new value is stored at the end of the iteration.
            local_betas = curr_betas[start_i:stop_i]

            # Calculate the local posterior mean, used when sampling.
            local_betas[focal_i] = 0.0
            res_beta_hat_i = beta_hats[snp_i] - sp.dot(D_i, local_betas)
            b2 = res_beta_hat_i ** 2

            d_const_b2_exp = d_const * sp.exp(-b2 * n / 2.0)
            if sp.isreal(d_const_b2_exp):
                numerator = c_const * sp.exp(-b2 / (2.0 * hdmpn))
                if sp.isreal(numerator):
                    if numerator == 0.0:
                        postp = 0.0
                    else:
                        postp = numerator / (numerator + d_const_b2_exp)
                        assert sp.isreal(postp), \
                            'The posterior mean is not a real number? Possibly due to problems with summary stats, LD estimates, or parameter settings.'
                else:
                    postp = 0.0
            else:
                postp = 1.0
            curr_post_means[snp_i] = hdmp_hdmpn * postp * res_beta_hat_i

            if rand_ps[i] < postp * alpha:
                # Sample from the posterior Gaussian dist.
                proposed_beta = rand_norms[i] + hdmp_hdmpn * res_beta_hat_i
            else:
                # Sample 0
                proposed_beta = 0.0

            curr_betas[snp_i] = proposed_beta  # UPDATE BETA

        if show_progress:
            sys.stdout.write('\b\b\b\b\b\b\b%0.2f%%' %
                             (100.0 * (min(1, float(k + 1) / num_iter))))
            sys.stdout.flush()

        if k >= burn_in:
            # Averaging over the posterior means instead of samples.
            avg_betas += curr_post_means

    if show_progress:
        sys.stdout.write('\b\b\b\b\b\b\b%0.2f%%\n' % (100.0))
        sys.stdout.flush()

    avg_betas = avg_betas / float(num_iter - burn_in)
    t1 = time.time()
    t = (t1 - t0)
    if verbose:
        print('Took %d minutes and %0.2f seconds' % (t / 60, t % 60))
    return {'betas': avg_betas, 'inf_betas': start_betas}
def ldpred_gibbs(beta_hats, genotypes=None, start_betas=None, h2=None, n=None,
                 ns=None, ld_radius=100, num_iter=60, burn_in=10, p=None,
                 zero_jump_prob=0.01, sampl_var_shrink_factor=0.9,
                 tight_sampling=False, ld_dict=None, reference_ld_mats=None,
                 ld_boundaries=None, verbose=False, print_progress=True):
    """
    LDpred (Gibbs Sampler)

    Parameters:
        beta_hats: marginal effect estimates, one per SNP.
        genotypes, reference_ld_mats: only forwarded to
            ``LDpred_inf.ldpred_inf`` when ``start_betas`` is not given.
        start_betas: initial effects.  When None they are computed with
            ``LDpred_inf.ldpred_inf``.  The values are copied before sampling.
        h2, p: heritability and fraction handed to ``prepare_constants``.
        n, ns: sample-size inputs handed to ``get_LDpred_sample_size``
            (``ns`` is also handed to ``prepare_constants``).
        ld_radius: number of SNPs on each side of the focal SNP that make up
            the local window when ``ld_boundaries`` is None.
        num_iter, burn_in: number of sweeps and number of initial sweeps that
            are left out of the average.
        zero_jump_prob, tight_sampling: control the factor ``alpha`` that
            scales the probability of sampling a non-zero effect.
        sampl_var_shrink_factor: handed to ``prepare_constants``.
        ld_dict: maps the SNP index to its local LD vector.
        ld_boundaries: optional per-SNP ``(start, stop)`` window indices.
        verbose: print messages and timing.
        print_progress: together with ``verbose``, show a progress percentage.

    Returns:
        dict with ``'betas'`` (posterior means averaged over the sweeps after
        burn-in) and ``'inf_betas'`` (the starting effects).

    Note:
        The global random seed is reset to 42 on every call.
    """
    # Set random seed to stabilize results
    sp.random.seed(42)

    t0 = time.time()
    m = len(beta_hats)

    ldpred_n, ldpred_inf_n = get_LDpred_sample_size(n, ns, verbose)

    # If no starting values for effects were given, then use the
    # infinitesimal model starting values.  This must not depend on
    # `verbose`: only the messages do.
    if start_betas is None:
        if verbose:
            print('Initializing LDpred effects with posterior mean LDpred-inf effects.')
            print('Calculating LDpred-inf effects.')
        start_betas = LDpred_inf.ldpred_inf(
            beta_hats, genotypes=genotypes,
            reference_ld_mats=reference_ld_mats, h2=h2, n=ldpred_inf_n,
            ld_window_size=2 * ld_radius, verbose=False)
    curr_betas = sp.copy(start_betas)
    assert len(curr_betas) == m, \
        'Betas returned by LDpred_inf do not have the same length as expected.'
    curr_post_means = sp.zeros(m)
    avg_betas = sp.zeros(m)

    # Iterating over effect estimates in sequential order
    iter_order = sp.arange(m)

    # Setting up the marginal Bayes shrink
    const_dict = prepare_constants(ldpred_n, ns, m, p, h2,
                                   sampl_var_shrink_factor)

    show_progress = verbose and print_progress
    for k in range(num_iter):  # Big iteration
        h2_est = max(0.00001, sp.sum(curr_betas**2))
        if tight_sampling:
            # Force an alpha shrink if estimates are way off compared to
            # heritability estimates. (May improve MCMC convergence.)
            alpha = min(1.0 - zero_jump_prob, 1.0 / h2_est,
                        (h2 + 1.0 / sp.sqrt(ldpred_n)) / h2_est)
        else:
            alpha = 1.0 - zero_jump_prob

        rand_ps = sp.random.random(m)
        rand_norms = stats.norm.rvs(0.0, 1, size=m) * const_dict['rv_scalars']

        for i, snp_i in enumerate(iter_order):
            if ld_boundaries is None:
                start_i = max(0, snp_i - ld_radius)
                focal_i = min(ld_radius, snp_i)
                stop_i = min(m, snp_i + ld_radius + 1)
            else:
                start_i = ld_boundaries[snp_i][0]
                stop_i = ld_boundaries[snp_i][1]
                focal_i = snp_i - start_i

            # Figure out what sample size and constants to use
            cd = get_constants(snp_i, const_dict)

            # Local LD matrix
            D_i = ld_dict[snp_i]

            # Local (most recently updated) effect estimates.  This is a view
            # into curr_betas, so zeroing the focal entry also zeroes it there
            # until the new value is stored at the end of the iteration.
            local_betas = curr_betas[start_i:stop_i]

            # Calculate the local posterior mean, used when sampling.
            local_betas[focal_i] = 0.0
            res_beta_hat_i = beta_hats[snp_i] - sp.dot(D_i, local_betas)
            b2 = res_beta_hat_i**2

            d_const_b2_exp = cd['d_const'] * sp.exp(-b2 * cd['n'] / 2.0)
            if sp.isreal(d_const_b2_exp):
                numerator = cd['c_const'] * sp.exp(-b2 / (2.0 * cd['hdmpn']))
                if sp.isreal(numerator):
                    if numerator == 0.0:
                        postp = 0.0
                    else:
                        postp = numerator / (numerator + d_const_b2_exp)
                        assert sp.isreal(postp), \
                            'The posterior mean is not a real number? Possibly due to problems with summary stats, LD estimates, or parameter settings.'
                else:
                    postp = 0.0
            else:
                postp = 1.0
            curr_post_means[snp_i] = cd['hdmp_hdmpn'] * postp * res_beta_hat_i

            if rand_ps[i] < postp * alpha:
                # Sample from the posterior Gaussian dist.
                proposed_beta = rand_norms[snp_i] + \
                    cd['hdmp_hdmpn'] * res_beta_hat_i
            else:
                # Sample 0
                proposed_beta = 0.0

            curr_betas[snp_i] = proposed_beta  # UPDATE BETA

        if show_progress:
            sys.stdout.write('\r%0.2f%%' %
                             (100.0 * (min(1, float(k + 1) / num_iter))))
            sys.stdout.flush()

        if k >= burn_in:
            # Averaging over the posterior means instead of samples.
            avg_betas += curr_post_means

    if show_progress:
        sys.stdout.write('\r%0.2f%%\n' % (100.0))
        sys.stdout.flush()

    avg_betas = avg_betas / float(num_iter - burn_in)
    t1 = time.time()
    t = (t1 - t0)
    if verbose:
        print('Took %d minutes and %0.2f seconds' % (t / 60, t % 60))
    return {'betas': avg_betas, 'inf_betas': start_betas}