def single_mahalanobis(sim_data, real_data, features=FEATURE_COL, klass=KLASS_COL):
    """Classify single variant using Mahalanobis distance"""
    assert real_data.shape[0] == 1, "Real data should have just one variant"
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        x = sim_data[features]
        y = sim_data[klass]
        real_x = real_data[features]

        def mahal_score(data):
            if data.shape[0] < 2:
                # Insufficient data for calculation
                return float("inf")
            robust_cov = MinCovDet(assume_centered=False).fit(data)
            return robust_cov.mahalanobis(real_x)[0]

        score = x.groupby(y).apply(mahal_score)
        assert len(score) == 3, "Missing classes in Mahalanobis distance calculation"
        pred = score.idxmin()
        with np.errstate(under="ignore"):
            prob = chi2.logsf(score, len(features))
            prob = np.exp(prob - logsumexp(prob))
        return (pred, prob, score)
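# A minimal usage sketch for single_mahalanobis on synthetic data. The column names,
# class labels, and sample sizes below are illustrative assumptions, and the snippet's
# module-level imports (numpy as np, warnings, sklearn.covariance.MinCovDet,
# scipy.stats.chi2, scipy.special.logsumexp) are assumed to be in scope.
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
sim_data = pd.DataFrame({
    "f1": rng.normal(size=300),
    "f2": rng.normal(size=300),
    "klass": np.repeat(["A", "B", "C"], 100),   # exactly three classes, as asserted
})
real_data = pd.DataFrame({"f1": [0.1], "f2": [-0.2]})

pred, prob, score = single_mahalanobis(sim_data, real_data,
                                       features=["f1", "f2"], klass="klass")
print(pred, prob)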
def outarray_effect(est, ses, freqs, vy):
    N_effective = vy / (2 * freqs * (1 - freqs) * np.power(ses, 2))
    Z = est / ses
    P = -log10(np.exp(1)) * chi2.logsf(np.power(Z, 2), 1)
    array_out = np.column_stack((N_effective, est, ses, Z, P))
    array_out = np.round(array_out, decimals=6)
    array_out[:, 0] = np.round(array_out[:, 0], 0)
    return array_out
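# The P column above is a two-sided -log10 p-value: for a standard-normal Z statistic,
# chi2.sf(Z**2, 1) equals 2 * norm.sf(|Z|), and multiplying chi2.logsf by -log10(e)
# converts the natural-log survival value into -log10(p). A small numeric check
# (illustrative, not part of the original code):
import numpy as np
from scipy.stats import chi2, norm

Z = 3.0
assert np.isclose(chi2.sf(Z**2, 1), 2 * norm.sf(abs(Z)))
assert np.isclose(-np.log10(np.e) * chi2.logsf(Z**2, 1), -np.log10(chi2.sf(Z**2, 1)))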
def ErrorRate(n, l, q, g, sigma, t):
    div_t = 0
    avg_t = 0.
    for i in range(-2**(t - 1), 2**(t - 1) + 1):
        div_t += (1. * i - 0)**2
    div_t /= 2.**t + 1
    #print t, div_t
    s = sqrt(n * l * sigma**2 * (sigma**2 + div_t) + n * l * sigma**4 + sigma**2)
    dis = (q - 1.) / 2 - sqrt(2.) * (q * 1. / g + 1.)
    #print dis, s
    pr = chi2.logsf((dis / s)**2, 8.) / log(2) + log(n / 16., 2)
    return pr
def LikRatio_test(psi, psi_null, AD, DP, GT_prob, theta, log=False):
    """Likelihood ratio test for a psi vector against a null hypothesis.
    Please use the same AD, DP, and GT_prob as the fit() function.

    Parameters
    ----------
    psi: numpy.array (n_donor, )
        The fractional abundance of each donor in the mixture under the
        alternative hypothesis
    psi_null: numpy.array (n_donor, )
        The psi vector under the null hypothesis
    AD: numpy.array (n_variant, ), int
        The count vector for the alternative allele in all variants
    DP: numpy.array (n_variant, ), int
        The count vector for depths in all variants (i.e., both alleles)
    GT_prob: numpy.array (n_variant, n_donor, n_GT)
        The probability tensor for each genotype in each donor
    theta: numpy.array (n_GT, )
        The alternative allele rate in each genotype category
    log: bool
        If True, return the p-value on a logarithmic scale

    Returns
    -------
    statistic: float
        The calculated chi2 statistic.
    pvalue: float
        The single-tailed p-value.
    """
    from scipy.stats import chi2

    BD = DP - AD
    theta_vct_alt = np.dot(np.dot(GT_prob, theta), psi)
    logLik_alt = np.sum(AD * np.log(theta_vct_alt) +
                        BD * np.log(1 - theta_vct_alt))

    theta_vct_null = np.dot(np.dot(GT_prob, theta), psi_null)
    logLik_null = np.sum(AD * np.log(theta_vct_null) +
                         BD * np.log(1 - theta_vct_null))

    LR = 2 * (logLik_alt - logLik_null)
    df = len(psi_null) - 1
    if log:
        pval = chi2.logsf(LR, df)
    else:
        pval = chi2.sf(LR, df)
    return LR, pval
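# A hedged usage sketch for LikRatio_test on toy data; the shapes follow the docstring,
# but the numbers themselves are invented for illustration.
import numpy as np

n_variant, n_donor, n_GT = 100, 3, 3
rng = np.random.default_rng(0)

DP = rng.integers(10, 50, size=n_variant)              # read depth per variant
AD = rng.binomial(DP, 0.3)                             # alternative-allele counts
GT_prob = rng.dirichlet(np.ones(n_GT), size=(n_variant, n_donor))
theta = np.array([0.01, 0.5, 0.99])                    # ALT rate per genotype category
psi = np.array([0.5, 0.3, 0.2])                        # fitted donor fractions (alternative)
psi_null = np.ones(n_donor) / n_donor                  # null: equal donor fractions

LR, pval = LikRatio_test(psi, psi_null, AD, DP, GT_prob, theta)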
def LR_test(LR, df=1, is_log=False):
    """Likelihood ratio test

    Args:
        LR (np.array): likelihood ratio on the log scale between the alternative
            and the null model, i.e., the difference in log-likelihood
        df (int): degrees of freedom of the chi-square distribution, i.e., the
            number of additional parameters in the alternative model
        is_log (bool): if True, return the p-value on the log scale

    Returns:
        np.array: p-value or log(p-value) for the single-sided test
    """
    if is_log:
        return chi2.logsf(2 * LR, df)
    else:
        return chi2.sf(2 * LR, df)
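# Hypothetical usage of LR_test: LR is the log-likelihood difference between an
# alternative and a null model (not yet doubled), tested here with one extra
# parameter (df=1). The values are invented for illustration.
import numpy as np

logLik_alt = np.array([-120.3, -98.7, -250.1])
logLik_null = np.array([-125.0, -99.0, -251.0])
LR = logLik_alt - logLik_null           # likelihood ratio on the log scale
pvals = LR_test(LR, df=1)               # equivalent to chi2.sf(2 * LR, 1)
log_pvals = LR_test(LR, df=1, is_log=True)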
def neglog10pval(x, df):
    return -np.log10(np.e) * chi2.logsf(x, df)
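# Numeric sanity check (illustrative): -log10(e) * chi2.logsf(x, df) equals
# -log10(chi2.sf(x, df)), but the logsf route stays finite for very large x,
# where the plain survival function underflows to zero.
import numpy as np
from scipy.stats import chi2

print(neglog10pval(29.72, 1))            # ~7.3
print(-np.log10(chi2.sf(29.72, 1)))      # same value via the direct route
print(neglog10pval(3000.0, 1))           # still finite; chi2.sf(3000, 1) underflows to 0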
def p_value(x):
    k = 4
    v = chi2.logsf(-2 * x, k)
    return v
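# Note on the constants above: chi2.logsf(-2 * x, 4) matches Fisher's method for
# combining independent p-values when x is the sum of their natural logs; the degrees
# of freedom are twice the number of p-values, so k = 4 corresponds to two of them.
# That reading of the input x is an assumption; the original snippet does not document it.
import numpy as np

x = np.log(0.03) + np.log(0.2)     # hypothetical pair of p-values
log_p_combined = p_value(x)        # combined p-value on the natural-log scale
print(np.exp(log_p_combined))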
def dm2_to_prob(score, df=len(MAHAL_FEATURES)):
    with np.errstate(under="ignore"):
        prob = chi2.logsf(score, df)
    return np.exp(prob - logsumexp(prob))
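# Hedged usage sketch: scores is assumed to hold one squared Mahalanobis distance per
# candidate class, and dm2_to_prob maps them to probabilities that sum to one by
# normalizing the chi-square log survival values with logsumexp. The df value here is
# illustrative (the snippet defaults to len(MAHAL_FEATURES)).
import numpy as np

scores = np.array([2.1, 7.5, 15.0])
probs = dm2_to_prob(scores, df=3)
print(probs, probs.sum())          # the smallest distance gets the largest probability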
# Brute MC
n_samples = int(1e6)
#ts_vals = [very_simple_ts(norm.rvs(loc=0, scale=1, size=30)) for i in range(n_samples)]
#ts_vals_threebin = [three_bin_ts(norm.rvs(loc=0, scale=1, size=30)) for i in tqdm(range(n_samples))]
b = brute_low_memory(very_simple_ts, very_simple_transform, 30, ts_vals_range, n=n_samples)
b_3b = brute_low_memory(three_bin_ts, very_simple_transform, 30, ts_vals_range, n=n_samples)

# Polychord
res1_1b, res2_1b = pc(very_simple_ts, very_simple_transform, n_dim=30, observed=observed,
                      n_live=100, file_root="pc_1bin", do_clustering=False, feedback=2,
                      resume=False, ev_data=True)
res1_3b, res2_3b = pc(three_bin_ts, very_simple_transform, n_dim=30, observed=observed,
                      n_live=100, file_root="pc_3bin", do_clustering=False, feedback=2,
                      resume=False, ev_data=True)

res, test_statistic, log_x, log_x_delta = analyse_pch(root="pc_1bin")
res_3b, test_statistic_3b, log_x_3b, log_x_3b_delta = analyse_pch(root="pc_3bin")

# Analytic in this case
log10_local_p = chi2.logsf(ts_vals_range, df=1) / np.log(10)
log10_global_p = np.log10(30.) + log10_local_p

plt.plot(ts_vals_range, log10_global_p, c='red', ls='--', label="Theory")
plt.plot(ts_vals_range, np.log10(b), c='grey', ls='--', label="Brute MC")
plt.plot(test_statistic, np.log10(np.exp(log_x)), c='b', label="Polychord")
plt.xlim([0, observed])
plt.xlabel('TS')
plt.ylabel(r'$\log_{10}(p)$')
plt.legend(title='1-bin example')
plt.savefig("simple_ts_onebin.pdf")
plt.show()

plt.plot(ts_vals_range, np.log10(b_3b), c='grey', ls='--', label="Brute MC")
plt.plot(test_statistic_3b, np.log10(np.exp(log_x_3b)), c='b', label="Polychord")
plt.xlim([0, observed])
# - output: string "source computer \t pvalue \t mid_pvalue" obtained using the Edgington,
#   Fisher, Pearson, George, Stouffer and Tippett methods
old_key = ""
for line in sys.stdin:
    ## Obtain the edge and the p-values
    key, pvals = line.strip().split("\t")
    if key != old_key:
        if old_key != "":
            ## Edgington p-value (normal approximation - extremely good even for n=4)
            edgington_pval = norm.logcdf(
                sqrt(12.0 / n) * (sum_pvals_edg - .5 * n))
            mid_edgington_pval = norm.logcdf(
                sqrt(12.0 / n) * (sum_mid_pvals_edg - .5 * n))
            ## Fisher p-value
            fisher_pval = chi2.logsf(-2 * sum_pvals_fisher, 2 * n)
            mid_fisher_pval = chi2.logsf(-2 * sum_mid_pvals_fisher, 2 * n)
            ## Pearson p-value *** CHANGE OF SIGN wrt Biometrika paper ***
            pearson_pval = chi2.logcdf(2 * sum_pvals_pearson, 2 * n)
            mid_pearson_pval = chi2.logcdf(2 * sum_mid_pvals_pearson, 2 * n)
            ## Mudholkar and George p-value (scaled Student's t approximation)
            george_pval = t.logcdf(
                sqrt(3.0 / n * (5.0 * n + 4.0) / (5.0 * n + 2.0)) / pi *
                (sum_pvals_pearson + sum_pvals_fisher), 5 * n + 4)
            mid_george_pval = t.logcdf(
                sqrt(3.0 / n * (5.0 * n + 4.0) / (5.0 * n + 2.0)) / pi *
                (sum_mid_pvals_fisher + sum_mid_pvals_pearson), 5 * n + 4)
            ## Stouffer's p-value
            stouffer_pval = norm.logcdf(sum_pvals_stouffer / sqrt(n))
            mid_stouffer_pval = norm.logcdf(sum_mid_pvals_stouffer / sqrt(n))
            ## Tippett's p-value