Example #1
def estimate_sigma(observed, df, upper_bound, factor=3, npts=50, nsample=2000):
    """

    Produce an estimate of $\sigma$ from a constrained
    error sum of squares. The relevant distribution is a
    scaled $\chi^2$ restricted to $[0,U]$ where $U$ is `upper_bound`.

    Parameters
    ----------

    observed : float
        The observed sum of squares.

    df : float
        Degrees of freedom of the sum of squares.

    upper_bound : float
        Upper limit of truncation interval.
    
    factor : float
        Range of candidate values is 
        [observed/factor, observed*factor]

    npts : int
        How many candidate values for interpolator.

    nsample : int
        How many samples for each expected value
        of the truncated sum of squares.

    Returns
    -------

    sigma_hat : float
         Estimate of $\sigma$.
    
    """

    values = np.linspace(1. / factor, factor, npts) * observed
    expected = np.zeros_like(values)
    for i, value in enumerate(values):
        P_upper = chidist.cdf(upper_bound * np.sqrt(df) / value, df)
        U = np.random.sample(nsample)
        sample = chidist.ppf(P_upper * U, df) * value
        expected[i] = np.mean(sample**2)

        if expected[i] >= 1.1 * (observed**2 * df + observed**2 * df**(0.5)):
            break

    interpolant = interp1d(values, expected + df**(0.5) * values**2)
    V = np.linspace(1. / factor, factor, 10 * npts) * observed
    # this solves
    #   expected(sigma) + sqrt(df) * sigma^2 = observed SS * (1 + sqrt(df))
    # the usual "MAP" estimator would have just the observed SS on the RHS,
    # but this factor seems to "correct" it: if there were no selection,
    # the result would be the usual unbiased estimate
    sigma_hat = np.min(
        V[interpolant(V) >= observed**2 * df + observed**2 * df**(0.5)])
    return sigma_hat
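A minimal usage sketch for `estimate_sigma`, assuming (as the calls suggest) that `chidist` is `scipy.stats.chi` and `interp1d` comes from `scipy.interpolate`; the inputs are hypothetical:

import numpy as np
from scipy.stats import chi as chidist
from scipy.interpolate import interp1d

np.random.seed(0)  # the estimate is Monte Carlo based, so fix the seed
sigma_hat = estimate_sigma(observed=1.2, df=10, upper_bound=2.0)
print(sigma_hat)  # a value on the same scale as `observed`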
Example #2
def wald_test(tau, Sigma, alpha=0.05, max_condition=1e-6, pval=False):
    """
    Test based on the chi_d distribution.
    :param tau: observed test statistics (scaled with sqrt(n)
    :param Sigma: observed covariance matrix
    :param alpha: level of the test
    :param max_condition: determines at which threshold eigenvalues are considered as 0
    :param pval: if true, returns the conditional p value instead of the test result
    :return: level of the test
    """
    # instead of regularizing we preprocess Sigma and tau to get rid of 0 eigenvalues
    tau, Sigma = preprocessing(tau, Sigma, max_condition=max_condition)
    d = len(tau)
    # compute matrix inverse
    Sigma_inv = np.linalg.inv(Sigma)

    # under the null, this quantity asymptotically follows a chi distribution
    # with d degrees of freedom
    t_obs = np.sqrt(tau @ Sigma_inv @ tau)

    # compute the 1-alpha quantile of the chi distribution with d degrees of freedom
    threshold = chi.ppf(q=1-alpha, df=d)
    if not pval:
        if t_obs > threshold:
            return 1
        else:
            return 0
    else:
        # return p value
        return 1 - chi.cdf(x=t_obs, df=d)
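Under the null, `t_obs` is the norm of a whitened Gaussian vector and therefore follows a chi distribution with `d` degrees of freedom, which is what the threshold assumes. A self-contained sanity check of the rejection rate (the `preprocessing` helper is not defined in the snippet, so the check computes the statistic directly):

import numpy as np
from scipy.stats import chi

rng = np.random.default_rng(0)
d, n_trials, alpha = 3, 10000, 0.05
Sigma = np.array([[2.0, 0.5, 0.0],
                  [0.5, 1.0, 0.3],
                  [0.0, 0.3, 1.5]])
L = np.linalg.cholesky(Sigma)
Sigma_inv = np.linalg.inv(Sigma)
threshold = chi.ppf(q=1 - alpha, df=d)

rejections = 0
for _ in range(n_trials):
    tau = L @ rng.standard_normal(d)  # null: tau ~ N(0, Sigma)
    t_obs = np.sqrt(tau @ Sigma_inv @ tau)
    rejections += t_obs > threshold
print(rejections / n_trials)  # should be close to alpha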
Example #3
def chi_norm(emg_data, params):
    # Gaussianize rectified EMG channel-by-channel: push each sample through
    # the fitted chi CDF, then through the standard normal quantile function.
    emg_data = norm_emg(emg_data)
    emg_data = np.abs(emg_data)
    for chnl in range(len(emg_data)):
        arg = params[chnl][:-2]
        loc = params[chnl][-2]
        scale = params[chnl][-1]
        # clip to the effective support of the fitted chi distribution
        # so that norm.ppf stays finite at the extremes
        a_max = chi.ppf(0.9999999999999999, *arg, loc=loc, scale=scale)
        a_min = chi.ppf(0.00000000001, *arg, loc=loc, scale=scale)
        transf = np.clip(emg_data[chnl, :], a_min=a_min, a_max=a_max)
        transf = chi.cdf(transf, *arg, loc=loc, scale=scale)
        emg_data[chnl] = norm.ppf(transf)
    return emg_data
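A self-contained toy run of the same CDF-matching idea (`norm_emg` is not defined in the snippet, so the synthetic data below is standardized by construction):

import numpy as np
from scipy.stats import chi, norm

rng = np.random.default_rng(1)
data = np.abs(rng.standard_normal(5000))  # |N(0,1)| is chi-distributed, df=1
params = chi.fit(data)                    # (df, loc, scale)
u = chi.cdf(data, *params[:-2], loc=params[-2], scale=params[-1])
z = norm.ppf(np.clip(u, 1e-12, 1 - 1e-12))  # clip keeps the quantile finite
print(np.mean(z), np.std(z))              # roughly 0 and 1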
Example #4
def grcdf(norm, dim):
    """
	Gaussian radial CDF.
	
	@type  norm: array_like
	@param norm: norms of the data points

	@type  dim: integer
	@param dim: dimensionality of the Gaussian
	"""

    if dim < 2:
        return erf(norm / sqrt(2.))
    else:
        return chi.cdf(norm, dim)
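The `dim >= 2` branch relies on the fact that the norm of a `dim`-dimensional standard Gaussian is chi-distributed with `dim` degrees of freedom; a quick Monte Carlo check of that identity:

import numpy as np
from scipy.stats import chi

rng = np.random.default_rng(0)
dim, r = 5, 2.0
norms = np.linalg.norm(rng.standard_normal((100000, dim)), axis=1)
print(np.mean(norms <= r))  # empirical P(||Z|| <= r)
print(chi.cdf(r, dim))      # what grcdf(r, dim) returns for dim >= 2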
Example #5
def pStar(M, d, mu, sigma, rho, region, regionNumber, nu, omega):
    gamma = sigma(d, rho)
    if region == elipsoid:
        win = []
        for i in range(M):
            T = orthoT(d)
            v = unitV(d)
            inRegion = [2 * (math.sqrt(nu) / omega) * np.dot(T, vector).item(0)
                        for vector in v if np.dot(T, vector).item(0) >= 0]
            win.append(sum([chi.cdf(value, d) for value in inRegion]))
    else:
        win = [0]
        for i in range(M):
            T = orthoT(d)
            v = unitV(d)
            Tv = [np.dot(T, vector) for vector in v]
            for vector in Tv:
                if (np.array(np.dot(vector, gamma) * (math.sqrt(nu) / omega)) <= 0).all():
                    win.append(1)
    return sum(win)/(M*len(v))
Example #6
    def test_chi(self):
        from scipy.stats import chi
        import matplotlib.pyplot as plt
        fig, ax = plt.subplots(1, 1)

        df = 78
        mean, var, skew, kurt = chi.stats(df, moments='mvsk')

        x = np.linspace(chi.ppf(0.01, df), chi.ppf(0.99, df), 100)
        ax.plot(x, chi.pdf(x, df), 'r-', lw=5, alpha=0.6, label='chi pdf')

        rv = chi(df)
        ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')

        vals = chi.ppf([0.001, 0.5, 0.999], df)
        self.assertTrue(np.allclose([0.001, 0.5, 0.999], chi.cdf(vals, df)))

        r = chi.rvs(df, size=1000)

        ax.hist(r, density=True, histtype='stepfilled', alpha=0.2)
        ax.legend(loc='best', frameon=False)
        self.assertEqual(str(ax), "AxesSubplot(0.125,0.11;0.775x0.77)")
Example #7
def estimate_sigma(observed, truncated_df, lower_bound, upper_bound, untruncated_df=0, factor=3, npts=50, nsample=2000):
    """

    Produce an estimate of $\sigma$ from a constrained
    error sum of squares. The relevant distribution is a
    scaled $\chi^2$ restricted to $[L,U]$ where $L$ is `lower_bound`
    and $U$ is `upper_bound`.

    Parameters
    ----------

    observed : float
        The observed sum of squares.

    truncated_df : float
        Degrees of freedom of the truncated $\chi^2$ in the sum of squares.
        The observed sum is assumed to be the sum
        of an independent untruncated $\chi^2$ and the truncated one.

    lower_bound : float
        Lower limit of truncation interval.
    
    upper_bound : float
        Upper limit of truncation interval.
    
    untruncated_df : float
        Degrees of freedom of the untruncated $\chi^2$ in the sum of squares.

    factor : float
        Range of candidate values is 
        [observed/factor, observed*factor]

    npts : int
        How many candidate values for interpolator.

    nsample : int
        How many samples for each expected value
        of the truncated sum of squares.

    Returns
    -------

    sigma_hat : float
         Estimate of $\sigma$.
    
    """

    if untruncated_df < 50:
        linear_term = truncated_df**(0.5)
    else:
        linear_term = 0

    total_df = untruncated_df + truncated_df

    values = np.linspace(1./factor, factor, npts) * observed
    expected = np.zeros_like(values)
    for i, value in enumerate(values):
        P_upper = chidist.cdf(upper_bound * np.sqrt(truncated_df) / value, truncated_df) 
        P_lower = chidist.cdf(lower_bound * np.sqrt(truncated_df) / value, truncated_df) 
        U = np.random.sample(nsample)
        if untruncated_df > 0:
            sample = (chidist.ppf((P_upper - P_lower) * U + P_lower, truncated_df)**2
                      + chidist.rvs(untruncated_df, size=nsample)**2) * value**2
        else:
            sample = (chidist.ppf((P_upper - P_lower) * U + P_lower, truncated_df) * value)**2
        expected[i] = np.mean(sample) 

        if expected[i] >= 1.5 * (observed**2 * total_df + observed**2 * linear_term):
            break

    interpolant = interp1d(values, expected + values**2 * linear_term)
    V = np.linspace(1. / factor, factor, 10 * npts) * observed

    # this solves for the solution to 
    # expected(sigma) + sqrt(df) * sigma^2 = observed SS * (1 + sqrt(df))
    # the usual "MAP" estimator would have RHS just observed SS
    # but this factor seems to correct it.
    # it is such that if there were no selection it would be 
    # the usual unbiased estimate

    try:
        sigma_hat = np.min(V[interpolant(V) >= observed**2 * total_df + observed**2 * linear_term])
    except ValueError:
        # no solution, just return observed
        sigma_hat = observed
        
    return sigma_hat
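As with the one-sided variant above, a hedged usage sketch with hypothetical inputs, assuming `chidist` is `scipy.stats.chi` and `interp1d` is from `scipy.interpolate`:

import numpy as np
from scipy.stats import chi as chidist
from scipy.interpolate import interp1d

np.random.seed(0)
sigma_hat = estimate_sigma(observed=1.2,
                           truncated_df=10,
                           lower_bound=0.5,
                           upper_bound=2.0,
                           untruncated_df=5)
print(sigma_hat)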
Example #8
def grcdf(norm, dim):
	"""
	Gaussian radial CDF.
	"""

	return chi.cdf(norm, dim)
Example #9
import os
import json
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import chi, norm

data_path = r'C:\Users\win10\Desktop\Projects\CYB\Experiment_Balint\CYB005\Data'
n_channels = 8
X = np.empty((n_channels, 0))
for i, file in enumerate(
        sorted([f for f in os.listdir(data_path) if f.endswith('.json')])):
    with open(data_path + '\\' + file) as json_file:
        dict_data = json.load(json_file)
        X = np.concatenate((X, dict_data["EMG"]), axis=1)
    if i >= 9:
        break

print("Loaded")
data = X[0, :]
data = (data - np.mean(data)) / np.std(data)
data = np.abs(data[abs(data - np.mean(data)) < 4 * np.std(data)])

params = chi.fit(data)

# Separate parts of parameters
arg = params[:-2]
loc = params[-2]
scale = params[-1]

# Calculate fitted PDF and error with fit in distribution
transf = chi.cdf(data, loc=loc, scale=scale, *arg)
transf = norm.ppf(transf)
plt.figure()
plt.hist(transf, bins=50)

plt.show()
Example #10
    Y_j = B_j_prime.T.dot(C_pca_vec)
    # project C to PCA
    Z_j = C_j_prime.dot(C_pca_vec)
    # Step 3a: standardization of anomalies
    Z_j_sd = Z_j.std(axis=0)
    X_prime = X / Z_j_sd
    Y_j_prime = Y_j / Z_j_sd
    for int_loop in range(0, len(dist_id_sel)):
        # (v2.5 insert C_eigen_val_count as the PCs truncation threshold)
        distances[int_loop, ref_dist] = np.linalg.norm(
            X_prime[int_loop, 0:C_eigen_val_count] - Y_j_prime[0:C_eigen_val_count])
        print(ref_dist, int_loop, B_ctrl, mdl_c)
mdl_c_disp = str(B_ctrl + 1)
locals()[mdl + "_distance_vals_B" + mdl_c_disp] = distances
locals()[mdl + "_distance_pct_B" + mdl_c_disp] = chi.cdf(distances, C_eigen_val_count)
percents = locals()[mdl + "_distance_pct_B" + mdl_c_disp]
to_excl_vals = pd.DataFrame(distances)
to_excl_vals.to_excel(excel_writer=mdl + "_B" + mdl_c_disp + "_vals.xlsx")
to_excl_pct = pd.DataFrame(locals()[mdl + "_distance_pct_B" + mdl_c_disp])
to_excl_pct.to_excel(excel_writer=mdl + "_B" + mdl_c_disp + "_pct.xlsx")
min_dist_val = np.amin(distances, axis=0)
min_dist_ind = []
for min_ind in range(0, min_dist_val.shape[0]):
    xt = [act for act in percents[:, min_ind] if act <= 0.68]
    xt_where = np.where(percents[:, min_ind] <= 0.68)
    analog_ct = len(xt)
    dist_space = []
    dist_space_id = []
    if analog_ct == 0:
        print("no analog, all novel climates", file=f_c_d_na)
Example #11
def ost_test(tau,
             Sigma,
             alpha=0.05,
             selection='discrete',
             max_condition=1e-6,
             accuracy=1e-6,
             constraints='Sigma',
             pval=False):
    """
    Runs the full test suggested in our paper.
    :param tau: observed statistic
    :param Sigma: covariance matrix
    :param alpha: level of test
    :param selection: continuous/discrete (discrete is not extensively tested)
    :param max_condition: at which condition number the covariance matrix is truncated.
    :param accuracy: threshold to determine whether an entry is zero
    :param constraints: if 'Sigma'  we work with the constraints (Sigma beta) >=0. If 'positive' we work with beta >= 0
    :param pval: if true, returns the conditional p value instead of the test result
    :return: 1 (reject), 0 (no reject)
    """
    assert constraints in ('Sigma', 'positive'), 'Constraints are not implemented'
    # if the selection is discrete we don't want any transformations
    if selection == 'discrete':
        constraints = 'positive'

    # check if there are entries with 0 variance
    zeros = [i for i in range(len(tau)) if Sigma[i][i] < 1e-10]
    tau = np.delete(tau, zeros)
    Sigma = np.delete(Sigma, zeros, 0)
    Sigma = np.delete(Sigma, zeros, 1)

    if constraints == 'Sigma':
        # compute pseudoinverse to also handle singular covariances (see Appendix)
        r_cond = max_condition  # parameter which precision to use
        Sigma_inv = np.linalg.pinv(Sigma, rcond=r_cond, hermitian=True)

        # use Remark 1 to convert the problem
        tau = Sigma_inv @ tau
        Sigma = Sigma_inv

    # Apply Theorem 1 in the canonical form with beta>=0 constraints
    beta_star = optimization(tau=tau, Sigma=Sigma, selection=selection)

    # determine active set
    non_zero = [1 if beta_i > accuracy else 0 for beta_i in beta_star]

    projector = np.diag(non_zero)
    effective_sigma = projector @ Sigma @ projector

    # Use the rank of the effective Sigma to determine the degrees of freedom
    # of the covariance after conditioning. For a non-singular original
    # covariance this equals the number of active dimensions |mathcal{U}|;
    # for singular cases, using the rank is the right way to go.
    tol = max_condition * np.max(np.linalg.eigvalsh(Sigma))
    r = np.linalg.matrix_rank(effective_sigma, tol=tol, hermitian=True)
    # go back to notation used in the paper
    l = r
    if l > 1:
        test_statistic = beta_star @ tau / np.sqrt(
            beta_star @ Sigma @ beta_star)
        threshold = chi_stats.ppf(q=1 - alpha, df=l)
    else:
        vminus = truncation(beta_star=beta_star,
                            tau=tau,
                            Sigma=Sigma,
                            accuracy=accuracy)
        threshold = truncated_gaussian(var=beta_star @ Sigma @ beta_star,
                                       v_minus=vminus,
                                       level=alpha)
        test_statistic = beta_star @ tau
    if not pval:
        if test_statistic > threshold:
            # reject
            return 1
        else:
            # cannot reject
            return 0
    if pval:
        if l > 1:
            test_statistic = beta_star @ tau / np.sqrt(
                beta_star @ Sigma @ beta_star)
            pvalue = 1 - chi_stats.cdf(x=test_statistic, df=l)
        else:
            test_statistic = beta_star @ tau / np.sqrt(
                beta_star @ Sigma @ beta_star)
            vminus = truncation(beta_star=beta_star, tau=tau, Sigma=Sigma, accuracy=accuracy) / \
                     np.sqrt(beta_star @ Sigma @ beta_star)
            pvalue = 1 - (norm.cdf(x=test_statistic) -
                          norm.cdf(x=vminus)) / (1 - norm.cdf(x=vminus))
        return pvalue
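The 'Sigma' branch leans on `np.linalg.pinv` with an `rcond` cutoff so singular covariances are handled gracefully; a minimal sketch of just that preprocessing step on a rank-deficient matrix:

import numpy as np

max_condition = 1e-6
tau = np.array([1.0, 2.0, 3.0])
Sigma = np.array([[1.0, 0.0, 1.0],
                  [0.0, 1.0, 0.0],
                  [1.0, 0.0, 1.0]])  # singular: rank 2

Sigma_inv = np.linalg.pinv(Sigma, rcond=max_condition, hermitian=True)
tau_transformed = Sigma_inv @ tau  # the conversion applied before Theorem 1
print(np.linalg.matrix_rank(Sigma_inv, hermitian=True))  # 2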
Example #12
def chi_pvalue(observed, lower_bound, upper_bound, sd, df, method='MC', nsim=1000):
    r"""

    Compute a truncated $\chi$ p-value based on the 
    conditional survival function. 

    Parameters
    ----------

    observed : float

    lower_bound : float

    upper_bound : float

    sd : float
        Standard deviation.

    df : float
        Degrees of freedom.

    method : string
        One of ['MC', 'cdf', 'sf'].

    nsim : int
        Number of Monte Carlo samples used when method is 'MC'.

    Returns
    -------

    pvalue : float

    Notes
    -----

    Let $T$ be `observed`, $L$ be `lower_bound` and $U$ be `upper_bound`,
    and $\sigma$ be `sd`.
    The p-value, for $L \leq T \leq U$, is

    .. math::

         \frac{P(\sigma^2 \chi^2_k \geq T^2) - P(\sigma^2 \chi^2_k \geq U^2)}
         {P(\sigma^2 \chi^2_k \geq L^2) - P(\sigma^2 \chi^2_k \geq U^2)}

    It can be computed using `scipy.stats.chi`, either its `cdf` (distribution
    function) or `sf` (survival function), or evaluated
    by Monte Carlo if method is `MC`.

    """

    L, T, U = lower_bound, observed, upper_bound # shorthand

    # H is needed both when method is 'MC' and for the Monte Carlo
    # fallback below, so define it before branching
    if df == 1:
        H = []
    else:
        H = [0] * (df - 1)

    if method == 'cdf':
        pval = ((chi.cdf(U / sd, df) - chi.cdf(T / sd, df)) /
                (chi.cdf(U / sd, df) - chi.cdf(L / sd, df)))
    elif method == 'sf':
        pval = ((chi.sf(U / sd, df) - chi.sf(T / sd, df)) /
                (chi.sf(U / sd, df) - chi.sf(L / sd, df)))
    elif method == 'MC':
        pval = general_pvalue(T / sd, L / sd, U / sd, H, nsim=nsim)
    else:
        raise ValueError('method should be one of ["cdf", "sf", "MC"]')
    if pval == 1: # the distribution functions may have failed -- use MC
        pval = general_pvalue(T / sd, L / sd, U / sd, H, nsim=50000)
    if pval > 1:
        pval = 1
    return pval
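Since `sf = 1 - cdf`, the 'cdf' and 'sf' branches compute the same conditional probability (numerator and denominator flip sign together); a direct numeric check with hypothetical inputs:

from scipy.stats import chi

T, L, U, sd, df = 2.0, 1.0, 3.0, 1.0, 5
p_cdf = ((chi.cdf(U / sd, df) - chi.cdf(T / sd, df)) /
         (chi.cdf(U / sd, df) - chi.cdf(L / sd, df)))
p_sf = ((chi.sf(U / sd, df) - chi.sf(T / sd, df)) /
        (chi.sf(U / sd, df) - chi.sf(L / sd, df)))
print(p_cdf, p_sf)  # identical up to floating point error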
Example #13
for i, ecdf in enumerate(params):
    axes.flatten()[i].hist(X1[i, :],
                           bins=100,
                           density=True,
                           rwidth=1,
                           edgecolor=sns.color_palette()[0])
    if i % 2 == 0:
        axes.flatten()[i].set_ylabel("Probability density")
    if i > 5:
        axes.flatten()[i].set_xlabel("Normalised voltage")
plt.tight_layout()
plt.show()

print(chi.cdf(8.47, loc=params[6][-2], scale=params[6][-1], *params[6][:-2]))
# Display

n_channels = 8
X = np.empty((n_channels, 0))
for file in sorted([f for f in os.listdir(data_path) if f.endswith('.json')]):
    with open(data_path + '\\' + file) as json_file:
        dict_data = json.load(json_file)
    emg_data = np.array(dict_data["EMG"])
    X = np.hstack((X, emg_data))
X_std = np.std(X, axis=1)
X_mean = np.mean(X, axis=1)
X = (X - X_mean[:, None]) / X_std[:, None]
params = list()
dist = chi
for chnl in range(len(X)):
    # fit the chosen distribution to each rectified channel
    params.append(dist.fit(np.abs(X[chnl])))
Example #14
def get_bin_prob(k, r_grid):
    cdf = chi.cdf(r_grid, df=k)
    bin_prob = np.diff(cdf)
    return bin_prob
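For example, `get_bin_prob` turns a grid of radii into the probability mass a chi-distributed norm places in each radial bin:

import numpy as np
from scipy.stats import chi

r_grid = np.linspace(0.0, 5.0, 6)  # bin edges 0, 1, ..., 5
p = get_bin_prob(k=3, r_grid=r_grid)
print(p)                            # one probability per bin
print(p.sum(), chi.cdf(5.0, df=3))  # total mass equals cdf at the last edge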
Example #15
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import chi

df = 78
fig, ax = plt.subplots(1, 1)

# Display the probability density function (``pdf``):

x = np.linspace(chi.ppf(0.01, df), chi.ppf(0.99, df), 100)
ax.plot(x, chi.pdf(x, df), 'r-', lw=5, alpha=0.6, label='chi pdf')

# Alternatively, the distribution object can be called (as a function)
# to fix the shape, location and scale parameters. This returns a "frozen"
# RV object holding the given parameters fixed.

# Freeze the distribution and display the frozen ``pdf``:

rv = chi(df)
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')

# Check accuracy of ``cdf`` and ``ppf``:

vals = chi.ppf([0.001, 0.5, 0.999], df)
np.allclose([0.001, 0.5, 0.999], chi.cdf(vals, df))
# True

# Generate random numbers:

r = chi.rvs(df, size=1000)

# And compare the histogram:

ax.hist(r, density=True, histtype='stepfilled', alpha=0.2)
ax.legend(loc='best', frameon=False)
plt.show()