def estimate_sigma(observed, df, upper_bound, factor=3, npts=50, nsample=2000):
    """
    Produce an estimate of $\sigma$ from a constrained
    error sum of squares. The relevant distribution is a
    scaled $\chi^2$ restricted to $[0,U]$ where $U$ is `upper_bound`.

    Parameters
    ----------
    observed : float
        The observed sum of squares.
    df : float
        Degrees of freedom of the sum of squares.
    upper_bound : float
        Upper limit of truncation interval.
    factor : float
        Range of candidate values is
        [observed/factor, observed*factor]
    npts : int
        How many candidate values for interpolator.
    nsample : int
        How many samples for each expected value
        of the truncated sum of squares.

    Returns
    -------
    sigma_hat : np.float
        Estimate of $\sigma$.
    """

    values = np.linspace(1. / factor, factor, npts) * observed
    expected = 0 * values
    for i, value in enumerate(values):
        P_upper = chidist.cdf(upper_bound * np.sqrt(df) / value, df)
        U = np.random.sample(nsample)
        sample = chidist.ppf(P_upper * U, df) * value
        expected[i] = np.mean(sample**2)

        if expected[i] >= 1.1 * (observed**2 * df + observed**2 * df**(0.5)):
            break

    interpolant = interp1d(values, expected + df**(0.5) * values**2)
    V = np.linspace(1. / factor, factor, 10 * npts) * observed

    # this solves for the solution to
    # expected(sigma) + sqrt(df) * sigma^2 = observed SS * (1 + sqrt(df))
    # the usual "MAP" estimator would have RHS just observed SS
    # but this factor seems to correct it:
    # it is such that if there were no selection it would be
    # the usual unbiased estimate
    sigma_hat = np.min(
        V[interpolant(V) >= observed**2 * df + observed**2 * df**(0.5)])
    return sigma_hat
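# --- Hedged usage sketch (not part of the original snippet) ----------------
# A minimal illustration of calling estimate_sigma as defined above, assuming
# that definition is in the same module as the imports below (numpy as np,
# scipy.stats.chi as chidist, scipy.interpolate.interp1d). The inputs are
# made-up numbers chosen only to exercise the code path, not real data.
import numpy as np
from scipy.stats import chi as chidist
from scipy.interpolate import interp1d

np.random.seed(0)
observed = 1.3      # observed (root mean square) statistic, illustrative only
df = 10             # degrees of freedom of the sum of squares
upper_bound = 2.0   # truncation limit U

sigma_hat = estimate_sigma(observed, df, upper_bound)
print("sigma_hat:", sigma_hat)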
def wald_test(tau, Sigma, alpha=0.05, max_condition=1e-6, pval=False):
    """
    Test based on the chi_d distribution.

    :param tau: observed test statistic (scaled with sqrt(n))
    :param Sigma: observed covariance matrix
    :param alpha: level of the test
    :param max_condition: threshold below which eigenvalues are considered as 0
    :param pval: if true, returns the conditional p value instead of the test result
    :return: test result (1 reject / 0 no reject), or the p value if pval is True
    """
    # instead of regularizing we preprocess Sigma and tau to get rid of 0 eigenvalues
    tau, Sigma = preprocessing(tau, Sigma, max_condition=max_condition)
    d = len(tau)

    # compute matrix inverse
    Sigma_inv = np.linalg.inv(Sigma)

    # under the null, this quantity is asymptotically chi-distributed with d degrees of freedom
    t_obs = np.sqrt(tau @ Sigma_inv @ tau)

    # compute the 1-alpha quantile of the chi distribution with d degrees of freedom
    threshold = chi.ppf(q=1 - alpha, df=d)

    if not pval:
        if t_obs > threshold:
            return 1
        else:
            return 0
    else:
        # return p value
        return 1 - chi.cdf(x=t_obs, df=d)
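# --- Hedged sketch (not from the original source) ---------------------------
# wald_test above relies on an external preprocessing() helper, so this is a
# self-contained illustration of the core computation only: for a full-rank
# Sigma, the statistic sqrt(tau' Sigma^{-1} tau) is compared against the
# (1 - alpha) quantile of a chi distribution with d degrees of freedom.
import numpy as np
from scipy.stats import chi

rng = np.random.default_rng(0)
d, alpha = 3, 0.05
Sigma = np.array([[1.0, 0.2, 0.0],
                  [0.2, 1.0, 0.1],
                  [0.0, 0.1, 1.0]])
# draw tau from N(0, Sigma), i.e. a draw under the null hypothesis
tau = rng.multivariate_normal(np.zeros(d), Sigma)

t_obs = np.sqrt(tau @ np.linalg.inv(Sigma) @ tau)
threshold = chi.ppf(q=1 - alpha, df=d)
p_value = 1 - chi.cdf(x=t_obs, df=d)
print(int(t_obs > threshold), p_value)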
def chi_norm(emg_data, params):
    emg_data = norm_emg(emg_data)
    emg_data = np.abs(emg_data)
    for chnl in range(len(emg_data)):
        arg = params[chnl][:-2]
        loc = params[chnl][-2]
        scale = params[chnl][-1]
        a_max = chi.ppf(0.9999999999999999, *arg, loc=loc, scale=scale)
        a_min = chi.ppf(0.00000000001, *arg, loc=loc, scale=scale)
        transf = np.clip(emg_data[chnl, :], a_min=a_min, a_max=a_max)
        transf = chi.cdf(transf, *arg, loc=loc, scale=scale)
        emg_data[chnl] = norm.ppf(transf)
    return emg_data
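# --- Hedged illustration (not from the original source) ---------------------
# chi_norm above depends on a project-specific norm_emg() helper, so this
# sketch only demonstrates the per-channel transform it applies: fit a chi
# distribution, map samples through its CDF, then through the standard normal
# PPF ("Gaussianisation"). Synthetic data stands in for a rectified EMG
# channel, and loc is pinned at 0 here only to keep the fit stable.
import numpy as np
from scipy.stats import chi, norm

rng = np.random.default_rng(0)
samples = np.abs(rng.standard_normal(2000))   # stand-in for one rectified channel

params = chi.fit(samples, floc=0)
arg, loc, scale = params[:-2], params[-2], params[-1]

# clip to the fitted support before applying the CDF, as chi_norm does
a_min = chi.ppf(1e-11, *arg, loc=loc, scale=scale)
a_max = chi.ppf(1 - 1e-12, *arg, loc=loc, scale=scale)
clipped = np.clip(samples, a_min, a_max)

gaussianised = norm.ppf(chi.cdf(clipped, *arg, loc=loc, scale=scale))
print(gaussianised.mean(), gaussianised.std())   # roughly 0 and 1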
def grcdf(norm, dim):
    """
    Gaussian radial CDF.

    @type  norm: array_like
    @param norm: norms of the data points

    @type  dim: integer
    @param dim: dimensionality of the Gaussian
    """
    if dim < 2:
        return erf(norm / sqrt(2.))
    else:
        return chi.cdf(norm, dim)
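# --- Hedged check (not from the original source) ----------------------------
# For dim == 1 the two branches of grcdf agree: the radial CDF of a
# one-dimensional Gaussian is erf(r / sqrt(2)), which is exactly the CDF of a
# chi distribution with 1 degree of freedom, so the special case is presumably
# there for speed rather than because the formulas differ.
import numpy as np
from scipy.special import erf
from scipy.stats import chi

r = np.linspace(0.0, 5.0, 11)
print(np.allclose(erf(r / np.sqrt(2.0)), chi.cdf(r, 1)))   # True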
def pStar(M, d, mu, sigma, rho, region, regionNumber, nu, omega):
    gamma = sigma(d, rho)
    if region == elipsoid:
        win = []
        for i in range(M):
            T = orthoT(d)
            v = unitV(d)
            inRegion = [2 * (math.sqrt(nu) / omega) * np.dot(T, vector).item(0)
                        for vector in v if np.dot(T, vector).item(0) >= 0]
            win.append(sum([chi.cdf(value, d) for value in inRegion]))
    else:
        win = [0]
        for i in range(M):
            T = orthoT(d)
            v = unitV(d)
            Tv = [np.dot(T, vector) for vector in v]
            for vector in Tv:
                if (np.array(np.dot(vector, gamma) * (math.sqrt(nu) / omega)) <= 0).all():
                    win.append(1)
    return sum(win) / (M * len(v))
def test_chi(self):
    from scipy.stats import chi
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots(1, 1)

    df = 78
    mean, var, skew, kurt = chi.stats(df, moments='mvsk')

    x = np.linspace(chi.ppf(0.01, df), chi.ppf(0.99, df), 100)
    ax.plot(x, chi.pdf(x, df), 'r-', lw=5, alpha=0.6, label='chi pdf')

    rv = chi(df)
    ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')

    vals = chi.ppf([0.001, 0.5, 0.999], df)
    np.allclose([0.001, 0.5, 0.999], chi.cdf(vals, df))

    r = chi.rvs(df, size=1000)

    ax.hist(r, density=True, histtype='stepfilled', alpha=0.2)
    ax.legend(loc='best', frameon=False)

    self.assertEqual(str(ax), "AxesSubplot(0.125,0.11;0.775x0.77)")
def estimate_sigma(observed, truncated_df, lower_bound, upper_bound,
                   untruncated_df=0, factor=3, npts=50, nsample=2000):
    """
    Produce an estimate of $\sigma$ from a constrained
    error sum of squares. The relevant distribution is a
    scaled $\chi^2$ restricted to $[L,U]$ where $L$ is `lower_bound`
    and $U$ is `upper_bound`.

    Parameters
    ----------
    observed : float
        The observed sum of squares.
    truncated_df : float
        Degrees of freedom of the truncated $\chi^2$ in the sum of squares.
        The observed sum is assumed to be the sum
        of an independent untruncated $\chi^2$ and the truncated one.
    lower_bound : float
        Lower limit of truncation interval.
    upper_bound : float
        Upper limit of truncation interval.
    untruncated_df : float
        Degrees of freedom of the untruncated $\chi^2$
        in the sum of squares.
    factor : float
        Range of candidate values is
        [observed/factor, observed*factor]
    npts : int
        How many candidate values for interpolator.
    nsample : int
        How many samples for each expected value
        of the truncated sum of squares.

    Returns
    -------
    sigma_hat : np.float
        Estimate of $\sigma$.
    """

    if untruncated_df < 50:
        linear_term = truncated_df**(0.5)
    else:
        linear_term = 0

    total_df = untruncated_df + truncated_df

    values = np.linspace(1. / factor, factor, npts) * observed
    expected = 0 * values
    for i, value in enumerate(values):
        P_upper = chidist.cdf(upper_bound * np.sqrt(truncated_df) / value, truncated_df)
        P_lower = chidist.cdf(lower_bound * np.sqrt(truncated_df) / value, truncated_df)
        U = np.random.sample(nsample)
        if untruncated_df > 0:
            sample = (chidist.ppf((P_upper - P_lower) * U + P_lower, truncated_df)**2
                      + chidist.rvs(untruncated_df, size=nsample)**2) * value**2
        else:
            sample = (chidist.ppf((P_upper - P_lower) * U + P_lower, truncated_df) * value)**2
        expected[i] = np.mean(sample)

        if expected[i] >= 1.5 * (observed**2 * total_df + observed**2 * linear_term):
            break

    interpolant = interp1d(values, expected + values**2 * linear_term)
    V = np.linspace(1. / factor, factor, 10 * npts) * observed

    # this solves for the solution to
    # expected(sigma) + sqrt(df) * sigma^2 = observed SS * (1 + sqrt(df))
    # the usual "MAP" estimator would have RHS just observed SS
    # but this factor seems to correct it:
    # it is such that if there were no selection it would be
    # the usual unbiased estimate
    try:
        sigma_hat = np.min(
            V[interpolant(V) >= observed**2 * total_df + observed**2 * linear_term])
    except ValueError:
        # no solution, just return observed
        sigma_hat = observed
    return sigma_hat
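# --- Hedged illustration (not from the original source) ---------------------
# The sampling step inside estimate_sigma draws from a chi distribution
# truncated to [lower_bound, upper_bound] by inverse-CDF sampling: map
# uniforms into [F(L), F(U)] and apply the chi PPF. This sketch checks that
# the draws indeed land in the interval; df, L and U are illustrative.
import numpy as np
from scipy.stats import chi as chidist

rng = np.random.default_rng(0)
df, L, U = 5, 1.0, 3.0

P_lower = chidist.cdf(L, df)
P_upper = chidist.cdf(U, df)
u = rng.random(10000)
sample = chidist.ppf((P_upper - P_lower) * u + P_lower, df)

print(sample.min() >= L, sample.max() <= U)   # True True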
def grcdf(norm, dim):
    """
    Gaussian radial CDF.
    """
    return chi.cdf(norm, dim)
data_path = r'C:\Users\win10\Desktop\Projects\CYB\Experiment_Balint\CYB005\Data'
n_channels = 8
X = np.empty((n_channels, 0))
for i, file in enumerate(
        sorted([f for f in os.listdir(data_path) if f.endswith('.json')])):
    with open(data_path + '\\' + file) as json_file:
        dict_data = json.load(json_file)
        X = np.concatenate((X, dict_data["EMG"]), axis=1)
    if i >= 9:
        break
print("Loaded")

data = X[0, :]
data = (data - np.mean(data)) / np.std(data)
data = np.abs(data[abs(data - np.mean(data)) < 4 * np.std(data)])
params = chi.fit(data)

# Separate parts of parameters
arg = params[:-2]
loc = params[-2]
scale = params[-1]

# Calculate fitted PDF and error with fit in distribution
transf = chi.cdf(data, *arg, loc=loc, scale=scale)
transf = norm.ppf(transf)
plt.figure()
plt.hist(transf, bins=50)
plt.show()
Y_j = B_j_prime.T.dot(C_pca_vec)  # project C to PCA
Z_j = C_j_prime.dot(C_pca_vec)

# Step 3a: standardization of anomalies
Z_j_sd = Z_j.std(axis=0)
X_prime = X / Z_j_sd
Y_j_prime = Y_j / Z_j_sd

for int_loop in range(0, len(dist_id_sel)):
    # (v2.5 insert C_eigen_val_count as the PCs truncation threshold)
    distances[int_loop, ref_dist] = np.linalg.norm(
        X_prime[int_loop, 0:C_eigen_val_count] - Y_j_prime[0:C_eigen_val_count])
    print(ref_dist, int_loop, B_ctrl, mdl_c)

mdl_c_disp = str(B_ctrl + 1)
locals()[mdl + "_distance_vals_B" + mdl_c_disp] = distances
locals()[mdl + "_distance_pct_B" + mdl_c_disp] = chi.cdf(distances, C_eigen_val_count)
percents = locals()[mdl + "_distance_pct_B" + mdl_c_disp]

to_excl_vals = pd.DataFrame(distances)
to_excl_vals.to_excel(excel_writer=mdl + "_B" + mdl_c_disp + "_vals.xlsx")
to_excl_pct = pd.DataFrame(locals()[mdl + "_distance_pct_B" + mdl_c_disp])
to_excl_pct.to_excel(excel_writer=mdl + "_B" + mdl_c_disp + "_pct.xlsx")

min_dist_val = np.amin(distances, axis=0)
min_dist_ind = []
for min_ind in range(0, min_dist_val.shape[0]):
    xt = [act for act in percents[:, min_ind] if act <= 0.68]
    xt_where = np.where(percents[:, min_ind] <= 0.68)
    analog_ct = len(xt)
    dist_space = []
    dist_space_id = []
    if analog_ct == 0:
        print("no analog, all novel climates", file=f_c_d_na)
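# --- Hedged illustration (not from the original source) ---------------------
# The chi.cdf(distances, C_eigen_val_count) call above converts Euclidean
# distances computed in a standardised PCA space into percentiles: if the
# retained component scores are approximately independent standard normals,
# the distance follows a chi distribution whose degrees of freedom equal the
# number of retained components. A small synthetic check of that idea:
import numpy as np
from scipy.stats import chi

rng = np.random.default_rng(0)
n_components = 4
scores = rng.standard_normal((100000, n_components))   # standardised scores
distances = np.linalg.norm(scores, axis=1)

percentiles = chi.cdf(distances, n_components)
# percentiles should be approximately uniform on [0, 1]
print(np.quantile(percentiles, [0.25, 0.5, 0.75]))     # roughly [0.25, 0.5, 0.75]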
def ost_test(tau, Sigma, alpha=0.05, selection='discrete', max_condition=1e-6,
             accuracy=1e-6, constraints='Sigma', pval=False):
    """
    Runs the full test suggested in our paper.

    :param tau: observed statistic
    :param Sigma: covariance matrix
    :param alpha: level of test
    :param selection: continuous/discrete (discrete is not extensively tested)
    :param max_condition: at which condition number the covariance matrix is truncated
    :param accuracy: threshold to determine whether an entry is zero
    :param constraints: if 'Sigma' we work with the constraints (Sigma beta) >= 0.
                        If 'positive' we work with beta >= 0
    :param pval: if true, returns the conditional p value instead of the test result
    :return: 1 (reject), 0 (no reject), or the conditional p value if pval is True
    """
    assert constraints == 'Sigma' or constraints == 'positive', 'Constraints are not implemented'

    # if the selection is discrete we don't want any transformations
    if selection == 'discrete':
        constraints = 'positive'

    # check if there are entries with 0 variance
    zeros = [i for i in range(len(tau)) if Sigma[i][i] < 1e-10]
    tau = np.delete(tau, zeros)
    Sigma = np.delete(Sigma, zeros, 0)
    Sigma = np.delete(Sigma, zeros, 1)

    if constraints == 'Sigma':
        # compute pseudoinverse to also handle singular covariances (see Appendix)
        r_cond = max_condition  # parameter which precision to use
        Sigma_inv = np.linalg.pinv(Sigma, rcond=r_cond, hermitian=True)

        # use Remark 1 to convert the problem
        tau = Sigma_inv @ tau
        Sigma = Sigma_inv

    # Apply Theorem 1 in the canonical form with beta >= 0 constraints
    beta_star = optimization(tau=tau, Sigma=Sigma, selection=selection)

    # determine active set
    non_zero = [1 if beta_i > accuracy else 0 for beta_i in beta_star]
    projector = np.diag(non_zero)
    effective_sigma = projector @ Sigma @ projector

    # Use the rank of effective Sigma to determine how many degrees of freedom the
    # covariance has after conditioning. For a non-singular original covariance, this
    # is the same as the number of active dimensions |\mathcal{U}|; however, for
    # singular cases using the rank is the right way to go.
    tol = max_condition * np.max(np.linalg.eigvalsh(Sigma))
    r = np.linalg.matrix_rank(effective_sigma, tol=tol, hermitian=True)

    # go back to notation used in the paper
    l = r

    if l > 1:
        test_statistic = beta_star @ tau / np.sqrt(beta_star @ Sigma @ beta_star)
        threshold = chi_stats.ppf(q=1 - alpha, df=l)
    else:
        vminus = truncation(beta_star=beta_star, tau=tau, Sigma=Sigma, accuracy=accuracy)
        threshold = truncated_gaussian(var=beta_star @ Sigma @ beta_star,
                                       v_minus=vminus, level=alpha)
        test_statistic = beta_star @ tau

    if not pval:
        if test_statistic > threshold:
            # reject
            return 1
        else:
            # cannot reject
            return 0

    if pval:
        if l > 1:
            test_statistic = beta_star @ tau / np.sqrt(beta_star @ Sigma @ beta_star)
            pvalue = 1 - chi_stats.cdf(x=test_statistic, df=l)
        else:
            test_statistic = beta_star @ tau / np.sqrt(beta_star @ Sigma @ beta_star)
            vminus = (truncation(beta_star=beta_star, tau=tau, Sigma=Sigma, accuracy=accuracy)
                      / np.sqrt(beta_star @ Sigma @ beta_star))
            pvalue = 1 - (norm.cdf(x=test_statistic) - norm.cdf(x=vminus)) / (1 - norm.cdf(x=vminus))
        return pvalue
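# --- Hedged sketch (not from the original source) ---------------------------
# For the one-dimensional (l == 1) branch of ost_test, the p value is a
# truncated-normal tail probability: with standardised statistic t and
# standardised truncation point v-, it is 1 - (Phi(t) - Phi(v-)) / (1 - Phi(v-)).
# This snippet just evaluates that formula for illustrative numbers; in the
# function above, vminus comes from the external truncation() helper.
from scipy.stats import norm

t_stat = 1.8     # standardised test statistic, illustrative
vminus = 0.5     # standardised lower truncation point, illustrative

pvalue = 1 - (norm.cdf(t_stat) - norm.cdf(vminus)) / (1 - norm.cdf(vminus))
print(pvalue)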
def chi_pvalue(observed, lower_bound, upper_bound, sd, df, method='MC', nsim=1000):
    r"""
    Compute a truncated $\chi$ p-value based on the conditional
    survival function.

    Parameters
    ----------
    observed : float
    lower_bound : float
    upper_bound : float
    sd : float
        Standard deviation.
    df : float
        Degrees of freedom.
    method : string
        One of ['MC', 'cdf', 'sf']

    Returns
    -------
    pvalue : float

    Notes
    -----
    Let $T$ be `observed`, $L$ be `lower_bound` and $U$ be `upper_bound`,
    and $\sigma$ be `sd`. The p-value, for $L \leq T \leq U$ is

    .. math::

        \frac{P(\chi^2_k \geq T^2 / \sigma^2) - P(\chi^2_k \geq U^2 / \sigma^2)}
             {P(\chi^2_k \geq L^2 / \sigma^2) - P(\chi^2_k \geq U^2 / \sigma^2)}

    It can be computed using `scipy.stats.chi` either its `cdf` (distribution
    function) or `sf` (survival function) or evaluated by Monte Carlo if method
    is `MC`.
    """
    L, T, U = lower_bound, observed, upper_bound  # shorthand

    # weights for the Monte Carlo evaluation; defined up front because the
    # Monte Carlo fallback below may be used even when method is 'cdf' or 'sf'
    if df == 1:
        H = []
    else:
        H = [0] * (df - 1)

    if method == 'cdf':
        pval = ((chi.cdf(U / sd, df) - chi.cdf(T / sd, df)) /
                (chi.cdf(U / sd, df) - chi.cdf(L / sd, df)))
    elif method == 'sf':
        pval = ((chi.sf(U / sd, df) - chi.sf(T / sd, df)) /
                (chi.sf(U / sd, df) - chi.sf(L / sd, df)))
    elif method == 'MC':
        pval = general_pvalue(T / sd, L / sd, U / sd, H, nsim=nsim)
    else:
        raise ValueError('method should be one of ["cdf", "sf", "MC"]')

    if pval == 1:  # the distribution functions may have failed -- use MC
        pval = general_pvalue(T / sd, L / sd, U / sd, H, nsim=50000)
    if pval > 1:
        pval = 1
    return pval
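# --- Hedged check (not from the original source) ----------------------------
# The 'cdf' and 'sf' branches of chi_pvalue compute the same conditional
# probability two ways; evaluating both expressions directly with
# scipy.stats.chi for some illustrative numbers shows they agree up to
# floating point error.
from scipy.stats import chi

T, L, U, sd, df = 2.5, 1.0, 4.0, 1.0, 3

p_cdf = ((chi.cdf(U / sd, df) - chi.cdf(T / sd, df)) /
         (chi.cdf(U / sd, df) - chi.cdf(L / sd, df)))
p_sf = ((chi.sf(U / sd, df) - chi.sf(T / sd, df)) /
        (chi.sf(U / sd, df) - chi.sf(L / sd, df)))
print(p_cdf, p_sf)   # identical up to rounding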
for i, ecdf in enumerate(params):
    axes.flatten()[i].hist(X1[i, :], bins=100, density=True, rwidth=1,
                           edgecolor=sns.color_palette()[0])
    if i % 2 == 0:
        axes.flatten()[i].set_ylabel("Probability density")
    if i > 5:
        axes.flatten()[i].set_xlabel("Normalised voltage")
plt.tight_layout()
plt.show()

print(chi.cdf(8.47, *params[6][:-2], loc=params[6][-2], scale=params[6][-1]))

# Display
n_channels = 8
X = np.empty((n_channels, 0))
for file in sorted([f for f in os.listdir(data_path) if f.endswith('.json')]):
    with open(data_path + '\\' + file) as json_file:
        dict_data = json.load(json_file)
        emg_data = np.array(dict_data["EMG"])
        X = np.hstack((X, emg_data))

X_std = np.std(X, axis=1)
X_mean = np.mean(X, axis=1)
X = (X - X_mean[:, None]) / X_std[:, None]

params = list()
dist = chi
for chnl in range(len(X)):
Y_j = B_j_prime.T.dot(C_pca_vec)  # project C to PCA
Z_j = C_j_prime.dot(C_pca_vec)

## Step 3a: standardization of anomalies
Z_j_sd = Z_j.std(axis=0)
X_prime = X / Z_j_sd
Y_j_prime = Y_j / Z_j_sd

for int_loop in range(0, len(dist_id_sel)):
    # (v2.5 insert C_eigen_val_count as the PCs truncation threshold)
    distances[int_loop, ref_dist] = np.linalg.norm(
        X_prime[int_loop, 0:C_eigen_val_count] - Y_j_prime[0:C_eigen_val_count])
    print(ref_dist, int_loop, B_ctrl, mdl_c)

mdl_c_disp = str(B_ctrl + 1)
locals()[mdl + "_distance_vals_B" + mdl_c_disp] = distances
locals()[mdl + "_distance_pct_B" + mdl_c_disp] = chi.cdf(
    distances, C_eigen_val_count)
percents = locals()[mdl + "_distance_pct_B" + mdl_c_disp]

to_excl_vals = pd.DataFrame(distances)
to_excl_vals.to_excel(excel_writer=mdl + "_B" + mdl_c_disp + "_vals.xlsx")
to_excl_pct = pd.DataFrame(locals()[mdl + "_distance_pct_B" + mdl_c_disp])
to_excl_pct.to_excel(excel_writer=mdl + "_B" + mdl_c_disp + "_pct.xlsx")

min_dist_val = np.amin(distances, axis=0)
min_dist_ind = []
for min_ind in range(0, min_dist_val.shape[0]):
    xt = [act for act in percents[:, min_ind] if act <= 0.68]
    xt_where = np.where(percents[:, min_ind] <= 0.68)
    map_plotting(ftr=min_ind, alg=xt_where[0])
    analog_ct = len(xt)
def get_bin_prob(k, r_grid):
    cdf = chi.cdf(r_grid, df=k)
    bin_prob = np.diff(cdf)
    return bin_prob
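# --- Hedged usage sketch (not from the original source) ---------------------
# get_bin_prob returns the probability mass of a chi_k-distributed radius
# falling in each bin of r_grid (differences of the CDF at the bin edges), so
# the probabilities sum to chi.cdf(r_grid[-1], k) - chi.cdf(r_grid[0], k).
# This sketch recomputes that inline with illustrative k and r_grid.
import numpy as np
from scipy.stats import chi

k = 3
r_grid = np.linspace(0.0, 6.0, 25)          # bin edges for the radius
bin_prob = np.diff(chi.cdf(r_grid, df=k))   # same computation as get_bin_prob
print(bin_prob.sum(), chi.cdf(r_grid[-1], df=k) - chi.cdf(r_grid[0], df=k))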
# Display the probability density function (``pdf``):

x = np.linspace(chi.ppf(0.01, df),
                chi.ppf(0.99, df), 100)
ax.plot(x, chi.pdf(x, df),
        'r-', lw=5, alpha=0.6, label='chi pdf')

# Alternatively, the distribution object can be called (as a function)
# to fix the shape, location and scale parameters. This returns a "frozen"
# RV object holding the given parameters fixed.

# Freeze the distribution and display the frozen ``pdf``:

rv = chi(df)
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')

# Check accuracy of ``cdf`` and ``ppf``:

vals = chi.ppf([0.001, 0.5, 0.999], df)
np.allclose([0.001, 0.5, 0.999], chi.cdf(vals, df))
# True

# Generate random numbers:

r = chi.rvs(df, size=1000)

# And compare the histogram:

ax.hist(r, density=True, histtype='stepfilled', alpha=0.2)
ax.legend(loc='best', frameon=False)
plt.show()