def calculate_qvalues(res_df, fdr=0.05, qvalue_lambda=None, logger=None):
    """Annotate permutation results with q-values and a nominal p-value threshold.

    ``res_df`` is modified in place: a ``qval`` column is always added, and a
    ``pval_nominal_threshold`` column is added when at least one phenotype has
    ``qval <= fdr``.

    Args:
        res_df: DataFrame with ``pval_perm``, ``pval_beta``, ``beta_shape1``
            and ``beta_shape2`` columns.
        fdr: q-value cut-off used to call a phenotype significant.
        qvalue_lambda: Optional value forwarded as the second argument of
            ``rfunc.qvalue``; when None, ``rfunc.qvalue`` is called with the
            p-values only.
        logger: Object exposing ``write(str)``. A ``SimpleLogger`` is created
            when omitted.

    Returns:
        None.
    """
    if logger is None:
        logger = SimpleLogger()

    logger.write('Computing q-values')
    logger.write(f'  * Number of phenotypes tested: {res_df.shape[0]}')
    r = stats.pearsonr(res_df['pval_perm'], res_df['pval_beta'])[0]
    logger.write(f'  * Correlation between Beta-approximated and empirical p-values: : {r:.4f}')

    # calculate q-values
    if qvalue_lambda is None:
        qval, pi0 = rfunc.qvalue(res_df['pval_beta'])
    else:
        logger.write(f'  * Calculating q-values with lambda = {qvalue_lambda:.3f}')
        qval, pi0 = rfunc.qvalue(res_df['pval_beta'], qvalue_lambda)
    res_df['qval'] = qval
    logger.write(f'  * Proportion of significant phenotypes (1-pi0): {1-pi0:.2f}')
    logger.write(f"  * QTL phenotypes @ FDR {fdr:.2f}: {(res_df['qval'] <= fdr).sum()}")

    # determine global min(p) significance threshold and calculate nominal
    # p-value threshold for each gene
    significant = res_df.loc[res_df['qval'] <= fdr, 'pval_beta'].sort_values()
    not_significant = res_df.loc[res_df['qval'] > fdr, 'pval_beta'].sort_values()
    if significant.empty:
        # No significant phenotypes: there is no threshold to report.
        return

    # Use .iloc so the lookup is positional regardless of the index type;
    # plain [] on a Series is label-based for integer indexes.
    lb = significant.iloc[-1]
    if not_significant.empty:
        pthreshold = lb
    else:
        # Midpoint between the largest significant and the smallest
        # non-significant Beta-approximated p-value.
        ub = not_significant.iloc[0]
        pthreshold = (lb + ub) / 2
    logger.write(f'  * min p-value threshold @ FDR {fdr}: {pthreshold:.6g}')
    res_df['pval_nominal_threshold'] = stats.beta.ppf(
        pthreshold, res_df['beta_shape1'], res_df['beta_shape2'])
def calculate_qvalues(res_df, fdr=0.05, qvalue_lambda=None, logger=None):
    """Annotate permutation results with q-values and a nominal p-value threshold.

    ``res_df`` is modified in place: a ``qval`` column is always added, and a
    ``pval_nominal_threshold`` column is added when at least one phenotype has
    ``qval <= fdr``. When no phenotype passes the cut-off the threshold step
    is skipped instead of failing on an empty selection.

    Args:
        res_df: DataFrame with ``pval_perm``, ``pval_beta``, ``beta_shape1``
            and ``beta_shape2`` columns.
        fdr: q-value cut-off used to call a phenotype significant.
        qvalue_lambda: Optional value forwarded as the second argument of
            ``rfunc.qvalue``; when None, ``rfunc.qvalue`` is called with the
            p-values only.
        logger: Object exposing ``write(str)``. A ``SimpleLogger`` is created
            when omitted.

    Returns:
        None.
    """
    if logger is None:
        logger = SimpleLogger()

    logger.write('Computing q-values')
    logger.write('  * Number of phenotypes tested: {}'.format(res_df.shape[0]))
    logger.write(
        '  * Correlation between Beta-approximated and empirical p-values: : {:.4f}'
        .format(stats.pearsonr(res_df['pval_perm'], res_df['pval_beta'])[0]))

    # calculate q-values
    if qvalue_lambda is None:
        qval, pi0 = rfunc.qvalue(res_df['pval_beta'])
    else:
        logger.write('  * Calculating q-values with lambda = {:.3f}'.format(
            qvalue_lambda))
        qval, pi0 = rfunc.qvalue(res_df['pval_beta'], qvalue_lambda)
    res_df['qval'] = qval
    logger.write(
        '  * Proportion of significant phenotypes (1-pi0): {:.2f}'.format(1 - pi0))
    logger.write('  * QTL phenotypes @ FDR {:.2f}: {}'.format(
        fdr, np.sum(res_df['qval'] <= fdr)))

    # determine global min(p) significance threshold and calculate nominal
    # p-value threshold for each gene
    significant = res_df.loc[res_df['qval'] <= fdr, 'pval_beta'].sort_values()
    not_significant = res_df.loc[res_df['qval'] > fdr, 'pval_beta'].sort_values()
    if significant.empty:
        # Nothing passes the FDR cut-off, so there is no boundary to locate.
        return

    # Positional access via .iloc: plain [] on a Series is label-based for
    # integer indexes and would not select the first/last sorted value.
    lb = significant.iloc[-1]
    if not_significant.empty:
        # Every phenotype is significant; fall back to the largest p-value.
        pthreshold = lb
    else:
        ub = not_significant.iloc[0]
        pthreshold = (lb + ub) / 2
    logger.write('  * min p-value threshold @ FDR {}: {:.6g}'.format(
        fdr, pthreshold))
    res_df['pval_nominal_threshold'] = stats.beta.ppf(
        pthreshold, res_df['beta_shape1'], res_df['beta_shape2'])
def calculate_qvalues(res_df, fdr=0.05, qvalue_lambda=None, logger=None):
    """Add q-values and a nominal p-value threshold to permutation results.

    Works in place on ``res_df``: writes a ``qval`` column and, if any
    phenotype satisfies ``qval <= fdr``, a ``pval_nominal_threshold`` column.
    The logged correlation only uses rows where both ``pval_perm`` and
    ``pval_beta`` are finite.

    Args:
        res_df: DataFrame with ``pval_perm``, ``pval_beta``, ``beta_shape1``
            and ``beta_shape2`` columns.
        fdr: q-value cut-off.
        qvalue_lambda: Optional second argument for ``rfunc.qvalue``.
        logger: Object exposing ``write(str)``; defaults to a new
            ``SimpleLogger``.

    Returns:
        None.
    """
    if logger is None:
        logger = SimpleLogger()

    logger.write('Computing q-values')
    logger.write(f'  * Number of phenotypes tested: {res_df.shape[0]}')

    # Restrict the correlation to entries that are finite in both vectors.
    empirical = np.asarray(res_df['pval_perm'], dtype=float)
    approximated = np.asarray(res_df['pval_beta'], dtype=float)
    usable = np.isfinite(empirical) & np.isfinite(approximated)
    corr = stats.pearsonr(empirical[usable], approximated[usable])[0]
    logger.write(
        f'  * Correlation between Beta-approximated and empirical p-values: : {corr:.4f}')

    # q-values; lambda is only forwarded when the caller supplied one
    qvalue_args = [res_df['pval_beta']]
    if qvalue_lambda is not None:
        logger.write(f'  * Calculating q-values with lambda = {qvalue_lambda:.3f}')
        qvalue_args.append(qvalue_lambda)
    qval, pi0 = rfunc.qvalue(*qvalue_args)
    res_df['qval'] = qval

    logger.write(f'  * Proportion of significant phenotypes (1-pi0): {1 - pi0:.2f}')
    n_hits = np.nansum(res_df['qval'] <= fdr)
    logger.write(f'  * QTL phenotypes @ FDR {fdr:.2f}: {n_hits}')

    # Global min(p) threshold, then the per-gene nominal p-value threshold.
    passing = res_df.loc[res_df['qval'] <= fdr, 'pval_beta'].sort_values()
    failing = res_df.loc[res_df['qval'] > fdr, 'pval_beta'].sort_values()
    if passing.shape[0] == 0:
        return

    largest_passing = passing.tolist()[-1]
    if failing.shape[0] > 0:
        smallest_failing = failing.tolist()[0]
        pthreshold = (largest_passing + smallest_failing) / 2
    else:
        pthreshold = largest_passing

    logger.write(f'  * min p-value threshold @ FDR {fdr}: {pthreshold:.6g}')
    res_df['pval_nominal_threshold'] = stats.beta.ppf(
        pthreshold, res_df['beta_shape1'], res_df['beta_shape2'])
# Load the eQTL summary statistics.
logging.info('Loading eQTL table.')
df = read_table(input_file)

# Compute q-values separately within each gene, keeping the variants below
# the FDR cut-off and recording the pi0 estimate returned for every gene.
logging.info('Looping over genes.')
res = []
res_pi0 = []
genes = df[pheno_col].unique()
for gene in tqdm(genes):
    df_i = df[df[pheno_col] == gene].reset_index(drop=True)
    df_i = load_pvalue(df_i, mode=args.mode)
    if df_i.shape[0] < 1:
        continue
    try:
        qval, pi0 = rfunc.qvalue(df_i.pval.values)
    except Exception:
        # Best-effort: skip genes whose q-value estimation fails, but keep
        # the traceback in the log and let KeyboardInterrupt/SystemExit
        # propagate (a bare ``except:`` would swallow those too).
        logging.info(f'Failed on {gene}', exc_info=True)
        continue
    tmp = df_i[[pheno_col, variant_col, 'pval']].copy()
    tmp['qval'] = qval
    res_pi0.append(pd.DataFrame({'phenotype_id': [gene], 'pi0': [pi0]}))
    res.append(tmp[tmp.qval < args.fdr_cutoff].reset_index(drop=True))

# pd.concat raises ValueError on an empty list, so fall back to empty tables
# with the expected columns when no gene produced a result.
if res:
    res = pd.concat(res, axis=0)
    res_pi0 = pd.concat(res_pi0, axis=0)
else:
    logging.info('No gene produced q-values; writing empty outputs.')
    res = pd.DataFrame(columns=[pheno_col, variant_col, 'pval', 'qval'])
    res_pi0 = pd.DataFrame(columns=['phenotype_id', 'pi0'])

# Save the outputs.
logging.info('Writing output to disk.')
res.to_parquet(args.output)
res_pi0.to_csv(args.output_pi0, index=False)