def non_central_chi_squared_polynomial_approximation_timing():
    """
    Benchmark the piecewise polynomial approximation of the inverse
    non-central chi-squared distribution against ``ncx2.ppf``.

    For every (nu, lambda) pair, the per-sample wall-clock time of the exact
    inverse is divided by the per-sample time of the approximation. The
    speed-up factors (rounded to one decimal) are printed as a table with
    lambda along the rows and nu along the columns.
    """
    degrees_of_freedom = [1, 5, 10, 50]
    non_centralities = [1, 5, 10, 50]
    n_samples = 10000

    speed_ups = {}
    for nu in degrees_of_freedom:
        per_lambda = {}
        speed_ups[nu] = per_lambda
        # The approximation is built once per nu and reused for every lambda.
        approximation = construct_inverse_non_central_chi_squared_interpolated_polynomial_approximation(
            nu)
        for lam in non_centralities:
            samples = uniform.rvs(size=n_samples)

            tick = timer()
            ncx2.ppf(samples, df=nu, nc=lam)
            time_exact = (timer() - tick) / n_samples

            tick = timer()
            approximation(samples, non_centrality=lam)
            time_approx = (timer() - tick) / n_samples

            per_lambda[lam] = round((time_exact / time_approx), 1)

    table = pd.DataFrame(speed_ups)
    table.index = table.index.rename('lambda')
    table.columns = table.columns.rename('nu')
    print(table)
def fit(self, X, y):
    """
    Store the training set and derive the bucket boundaries.

    ``X`` is moved to the GPU first when ``self.gpu`` is set. ``y`` must
    contain only the labels 0 and 1. The buckets are ``self.N_bucket``
    evenly spaced values between two non-central chi-squared quantiles
    (1e-4 and 1 - 1e-4), multiplied by ``self.sigma2``.
    """
    if self.gpu:
        X = X.cuda()
    labels_are_binary = np.logical_or(y == 0, y == 1)
    assert labels_are_binary.all()  # Only binary-classification now.

    self.train_X, self.train_y = X, y
    self.Xdim = X.shape[1]

    # Determine the bucket boundaries.
    shrunk_dim = self.Xdim * self.bucket_shrink
    lower_bound = ncx2.ppf(1e-4, shrunk_dim, 0)
    upper_bound = ncx2.ppf(1 - 1e-4, self.Xdim, shrunk_dim / self.sigma2)
    grid = np.linspace(lower_bound, upper_bound, num=self.N_bucket)
    self.buckets = grid * self.sigma2
def asymptotic_p_value(asimov_q, use_median_rather_than_asimov=False):
    """
    Return the chi-squared (2 degrees of freedom) tail probability of a
    test statistic.

    By default the statistic is ``asimov_q`` itself. With
    ``use_median_rather_than_asimov`` the statistic is instead the median of
    a non-central chi-squared distribution with 2 degrees of freedom and
    non-centrality ``max(0, asimov_q)``.
    """
    statistic = asimov_q
    if use_median_rather_than_asimov:
        # Negative values are clipped to zero before use as non-centrality.
        statistic = ncx2.ppf(0.5, df=2, nc=max(0., asimov_q))
    return chi2.sf(statistic, df=2)
def qchisq(p, df, ncp=0):
    """
    Calculates the quantile function of the chi-square distribution.

    :param p: probability (scalar or array) at which the quantile is evaluated.
    :param df: degrees of freedom.
    :param ncp: non-centrality parameter (scalar or array). Entries equal to
        zero are evaluated with the central chi-square distribution, all
        others with the non-central one.
    :return: the quantile(s); array arguments are broadcast against each other.
    """
    import numpy as np
    from scipy.stats import chi2, ncx2

    ncp_array = np.asarray(ncp)
    if ncp_array.ndim == 0:
        # Scalar non-centrality: same dispatch as before.
        if ncp == 0:
            return chi2.ppf(q=p, df=df, loc=0, scale=1)
        return ncx2.ppf(q=p, df=df, nc=ncp, loc=0, scale=1)

    # Array non-centrality: `if ncp == 0` would be ambiguous, so select
    # element-wise. Central entries get a dummy non-centrality of 1 so the
    # non-central branch is never evaluated at nc == 0; its value there is
    # discarded by np.where anyway.
    central = ncp_array == 0
    safe_nc = np.where(central, 1.0, ncp_array)
    return np.where(central,
                    chi2.ppf(q=p, df=df, loc=0, scale=1),
                    ncx2.ppf(q=p, df=df, nc=safe_nc, loc=0, scale=1))
def plot_non_central_chi_squared_polynomial_approximation(
        savefig=False, plot_from_json=True):
    """
    Plot the exact inverse non-central chi-squared distribution against its
    interpolated polynomial approximation for several non-centralities.

    :param savefig: Bool. Also write the figure to a PDF file.
    :param plot_from_json: Bool. Read the curves from the JSON cache instead
        of recomputing them; when false the curves are recomputed and the
        cache file is (re)written at the end.
    """
    json_path = 'non_central_chi_squared_linear_approximation.json'

    if plot_from_json:
        with open(json_path, "r") as input_file:
            cached = json.load(input_file)
        # JSON object keys are strings: turn the abscissae back into floats.
        results = {}
        for non_centrality, curves in cached.items():
            results[non_centrality] = {
                label: {float(u): value for u, value in points.items()}
                for label, points in curves.items()
            }
    else:
        dof = 1.0
        ncx2_approx = construct_inverse_non_central_chi_squared_interpolated_polynomial_approximation(
            dof, n_intervals=4 + 1)
        # Uniform grid (without u = 1) refined logarithmically in both tails.
        tails = np.logspace(-10, -1, 100)
        u = np.concatenate(
            [np.linspace(0.0, 1.0, 1000)[:-1], tails, 1.0 - tails])
        u.sort()
        results = {}
        for non_centrality in [1.0, 10.0, 20.0]:
            exact = ncx2.ppf(u, df=dof, nc=non_centrality)
            approximate = ncx2_approx(u, non_centrality=non_centrality)
            results[non_centrality] = {
                'exact': dict(zip(u, exact)),
                'approximate': dict(zip(u, approximate)),
            }

    plt.clf()
    for curves in results.values():
        plt.plot(*zip(*curves['exact'].items()), 'k--')
        plt.plot(*zip(*curves['approximate'].items()), 'k,')
    # Empty proxy lines that only provide the legend entries.
    plt.plot([], [], 'k--', label=r'$C^{-1}_{\nu}(x;\lambda)$')
    plt.plot([], [], 'k-', label=r'$\tilde{C}^{-1}_{\nu}(x;\lambda)$')
    plt.ylim(0, 50)
    plt.yticks(list(range(0, 51, 10)))
    plt.xticks([0, 1])
    plt.xlabel(r'$x$')
    plt.legend(frameon=False)

    if savefig:
        plt.savefig('non_central_chi_squared_linear_approximation.pdf',
                    format='pdf',
                    bbox_inches='tight',
                    transparent=True)
    if not plot_from_json:
        with open(json_path, "w") as output_file:
            output_file.write(json.dumps(results, indent=4))
def predict_lambda_and_percentiles(self, Xnew, lower=5, upper=95):
    """
    Computes mean value of intensity and lower and upper percentiles.
    `lower` and `upper` must be between 0 and 100.

    Returns the tuple ``(lambda_mean, lambda_lower, lambda_upper)``.
    Raises ``ValueError`` when any non-centrality exceeds 10e3.
    """
    # f ~ Normal(mean_f, var_f)
    mean_f, var_f = self.predict_f(Xnew)
    squared_mean = mean_f**2

    # λ = E[f²] = E[f]² + Var[f]
    lambda_mean = squared_mean + var_f

    # g = f/√var_f ~ Normal(mean_f/√var_f, 1), hence
    # g² = f²/var_f ~ χ²(k=1, λ=mean_f²/var_f), a non-central chi-squared
    non_centrality = squared_mean / var_f
    if tf.reduce_any(non_centrality > 10e3):
        raise ValueError("scipy.stats.ncx2.ppf() flatlines for nc > 10e3")

    # Percentiles of g², scaled back through f² = g² * var_f
    lambda_lower, lambda_upper = (
        ncx2.ppf(percent / 100, df=1, nc=non_centrality) * var_f
        for percent in (lower, upper))
    return lambda_mean, lambda_lower, lambda_upper
def plot_non_central_chi_squared_polynomial_approximation(save_figure=False):
    """
    Plots a polynomial approximation to the non-central chi-squared.

    The exact inverse CDF (dashed) and the approximation (pixel markers) are
    drawn for one degree of freedom and non-centralities 1, 10 and 20.

    :param save_figure: Bool. When true, the figure is also written to
        'piecewise_polynomial_approximation_of_non_central_chi_squared.pdf'.
    """
    u = linspace(0.0, 1.0, 10000)[:-1]  # Drops the right end point u = 1.
    dof = 1.0
    non_centralities = [1.0, 10.0, 20.0]
    # The approximation depends on dof only, so it is built once and reused
    # for every non-centrality instead of being rebuilt inside the loop.
    ncx2_approx = construct_inverse_non_central_chi_squared_interpolated_polynomial_approximation(
        dof, n_intervals=4)
    clear_plot()
    for non_centrality in non_centralities:
        plot(u, ncx2.ppf(u, df=dof, nc=non_centrality), 'k--')
        plot(u, ncx2_approx(u, non_centrality=non_centrality), 'k,')
    # Honour the flag: previously the file was written unconditionally.
    if save_figure:
        savefig(
            'piecewise_polynomial_approximation_of_non_central_chi_squared.pdf',
            format='pdf',
            bbox_inches='tight',
            transparent=True)
def rmse_of_non_central_chi_squared_polynomial_approximations():
    """
    Tabulate the RMSE of the polynomial approximations to the inverse
    non-central chi-squared distribution.

    For each polynomial order, degree of freedom (nu) and non-centrality
    (lambda), the squared difference between ``ncx2.ppf`` and the
    approximation is integrated over (0, 1) and the square root is taken.
    One table per polynomial order is printed, followed by the same table
    formatted as LaTeX rows.
    """
    lambdas = [1, 5, 10, 50, 100, 200]
    nus = [1, 5, 10, 50, 100]
    poly_orders = [1, 3, 5]
    n_intervals = 16

    # Dyadic break points handed to the integrator via `points`.
    left_breaks = [0.5**(i + 2) for i in range(n_intervals)]
    right_breaks = [1.0 - 0.5**(i + 2) for i in range(n_intervals)]
    discontinuities = sorted(left_breaks + [0.5] + right_breaks)
    subdivision_limit = 50 + 10 * len(discontinuities)

    results = {}
    for poly_order in poly_orders:
        per_nu = {}
        results[poly_order] = per_nu
        for nu in nus:
            ncx2_approx = construct_inverse_non_central_chi_squared_interpolated_polynomial_approximation(
                dof=nu,
                n_intervals=n_intervals + 1,
                polynomial_order=poly_order)
            per_lambda = {}
            per_nu[nu] = per_lambda
            for l in lambdas:

                def squared_error(u):
                    exact = ncx2.ppf(u, df=nu, nc=l)
                    return (exact - ncx2_approx(u, non_centrality=l))**2

                mean_squared_error = integrate(squared_error,
                                               0,
                                               1,
                                               points=discontinuities,
                                               limit=subdivision_limit)[0]
                per_lambda[l] = mean_squared_error**0.5

    for poly_order, result in results.items():
        df = pd.DataFrame(result)
        df.index = df.index.rename('lambda')
        df.columns = df.columns.rename('nu')
        print(poly_order, df.min().min(), df.max().max())
        rounded = round(df, 3)
        print(rounded)
        print('\n')
        # One LaTeX table row per lambda.
        print(
            round(df, 3).apply(
                lambda x: ' & '.join([str(i) for i in list(x)]) + r' \\',
                axis=1))
        print('\n' * 3)
def produce_cox_ingersoll_ross_paths(dt, approximations=None, **kwargs):
    """
    Simulate one Cox-Ingersoll-Ross path on [0, 1] with several schemes
    driven by the same uniform random numbers.

    :param dt: Float. Time step; must be positive, finite and divide 1.
    :param approximations: List of callables ``approx(u, non_centrality=...)``
        returning an indexable whose first element is the approximate
        inverse non-central chi-squared value.
    :param kwargs: Must contain 'kappa', 'theta' and 'sigma'.
    :return: List ``[euler_maruyama, exact, *approximations]`` holding the
        terminal value of each scheme.
    """
    assert isinstance(
        dt, float) and np.isfinite(dt) and dt > 0 and (1.0 / dt).is_integer()
    assert approximations is not None

    # The parameters.
    kappa = kwargs['kappa']
    theta = kwargs['theta']
    sigma = kwargs['sigma']
    T = 1.0
    x_0 = 1.0
    dt = dt * T
    sqrt_t = dt**0.5

    # Scaling constants and degrees of freedom of the exact transition law.
    c1 = 4.0 * kappa / (sigma**2 * (1.0 - np.exp(-kappa * dt)))
    c2 = c1 * np.exp(-kappa * dt)
    df = 4.0 * kappa * theta / (sigma**2)

    def euler_maruyama_update(x, w, t):
        drift = kappa * (theta - x) * t
        return x + drift + sigma * np.sqrt(np.fabs(x)) * w

    def exact_update(u, x):
        return ncx2.ppf(u, df=df, nc=x * c2) / c1

    def approximate_update(u, x, approx):
        return approx(u, non_centrality=x * c2)[0] / c1

    x_euler_maruyama = x_0
    x_exact = x_0
    x_approximations = [x_0 for _ in approximations]

    for _ in range(int(1.0 / dt)):
        # A single uniform draw feeds every scheme in this step.
        u = np.random.uniform()
        dw = sqrt_t * norm.ppf(u)
        x_euler_maruyama = euler_maruyama_update(x_euler_maruyama, dw, dt)
        x_exact = exact_update(u, x_exact)
        x_approximations = [
            approximate_update(u, x_previous, approx)
            for approx, x_previous in zip(approximations, x_approximations)
        ]
    return [x_euler_maruyama, x_exact] + list(x_approximations)
def construct_inverse_non_central_chi_squared_interpolated_polynomial_approximation(dof, polynomial_order=1, n_intervals=16, n_interpolating_functions=16):
    """
    Computes a polynomial approximation to the inverse cumulative distribution function for the non-central chi-squared
    distribution for a fixed number of degrees of freedom. The approximation is parametrised by a non-central parameter.

    A family of ``n_interpolating_functions`` standardised inverse CDFs is approximated once, up front; the returned
    function linearly interpolates between the two neighbouring members for the requested non-centrality.

    :param dof: Float. Degrees of freedom.
    :param polynomial_order: Int. Forwarded to ``dyadic_function_approximation_constructor``.
    :param n_intervals: Int. Forwarded to ``dyadic_function_approximation_constructor``.
    :param n_interpolating_functions: Int. Number of functions interpolated between.
    :return: Function taking ``(u, non_centrality)``.
    """
    # Interpolation is uniform in sqrt(y), where y = dof / (non_centrality + dof) (see the innermost function below).
    interpolation_function = lambda f: f ** 0.5
    interpolation_function_deriv_first = lambda f: 0.5 * f ** -0.5
    interpolation_function_deriv_second = lambda f: -0.25 * f ** -1.5
    # We approximate the function P
    interpolation_function_contour_spacing = 1.0 / (n_interpolating_functions - 1)
    # Ascending, equally spaced values from interpolation_function(0) up to interpolation_function(1.0).
    interpolation_values = ([interpolation_function(1.0) - n * interpolation_function_contour_spacing
                             for n in range(n_interpolating_functions - 1)]
                            + [interpolation_function(0)])[::-1]  # interpolation key values
    # Pre-images of the interior key values, found numerically; the two end points are fixed at 0 and 1.
    interpolation_points = [0.0] + [root_scalar(lambda a: interpolation_function(a) - y,
                                                x0=0.5,
                                                bracket=[0.0, 1.0],
                                                fprime=interpolation_function_deriv_first,
                                                fprime2=interpolation_function_deriv_second).root
                                    for y in interpolation_values[1:-1]] + [1.0]  # non-centrality for interpolating functions
    functions_exact = [None] * n_interpolating_functions  # The exact functions
    functions_exact[0] = norm.ppf  # Limiting case as y -> 0
    # The following odd syntax with y=... ensures y is evaluated at declaration and not taken by reference:
    functions_exact[1:-1] = [lambda u, y=y_interpolation_points: np.sqrt(dof / (4.0 * y)) * (y / dof * ncx2.ppf(u, df=dof, nc=(1.0 - y) * dof / y) - 1.0)
                             for y_interpolation_points in interpolation_points[1:-1]]
    # y = 1 corresponds to zero non-centrality, so the central chi-squared is used.
    functions_exact[-1] = lambda u: np.sqrt(dof / 4.0) * (1.0 / dof * chi2.ppf(u, df=dof) - 1.0)
    functions_approx = [dyadic_function_approximation_constructor(f, n_intervals, polynomial_order)
                        for f in progressbar(functions_exact)]  # By piecewise dyadic construction

    def construct_linear_interpolation(functions, weightings):
        """
        Builds a linear interpolation between two functions.
        :param functions: List of the two functions.
        :param weightings: List of the two corresponding weights.
        :return: Function evaluating the weighted sum of both functions.
        """
        f1, f2 = functions
        w1, w2 = weightings
        return lambda u: f1(u) * w1 + f2(u) * w2

    def get_interpolation_functions_and_weightings(non_centrality):
        """
        Determines the interpolation functions to use and their weights.
        :param non_centrality: Float. NOTE: despite its name, the only caller passes dof / (non_centrality + dof).
        :return: List holding the pair of functions and the pair of weights.
        """
        interpolation_value = interpolation_function(non_centrality)
        # Index of the first key value strictly greater than interpolation_value
        # (assuming `bisect` is the standard bisect_right -- TODO confirm the import).
        insertion_index = bisect(interpolation_values, interpolation_value, lo=0)
        lower_index, upper_index = insertion_index - 1, insertion_index
        assert lower_index >= 0
        assert upper_index <= len(interpolation_values)
        if upper_index == len(interpolation_values):
            # At (or beyond) the largest key value: use the last function alone.
            return [[functions_approx[lower_index]] * 2, [1.0, 0.0]]
        functions = [functions_approx[i] for i in [lower_index, upper_index]]
        interpolation_lower, interpolation_upper = [interpolation_values[i] for i in [lower_index, upper_index]]
        # Linear weights: w_lower is 1 at the lower key value and 0 at the upper one.
        w_lower = (interpolation_upper - interpolation_value) / (interpolation_upper - interpolation_lower)
        w_upper = 1.0 - w_lower
        weights = [w_lower, w_upper]
        return [functions, weights]

    def inverse_non_central_chi_squared_interpolated_polynomial_approximation(u, non_centrality):
        """
        Polynomial approximation to the inverse cumulative distribution function for the non-central chi-squared distribution
        :param u: Array.
        :param non_centrality: Float.
        :return: Array.
        """
        functions, weightings = get_interpolation_functions_and_weightings(dof / (non_centrality + dof))
        interpolated_function = construct_linear_interpolation(functions, weightings)
        # Undo the standardisation applied when the exact functions were defined.
        return non_centrality + dof + 2.0 * np.sqrt(non_centrality + dof) * interpolated_function(u)

    return inverse_non_central_chi_squared_interpolated_polynomial_approximation
import numpy as np
from scipy.stats import ncx2
from scipy.stats import norm
from scipy.stats import chi2
import matplotlib.pyplot as plt
import matplotlib as mpl

mpl.rcParams['figure.dpi'] = 300

#%%
# Density of a non-central chi-squared whose non-centrality is theta times
# sqrt(2 * df), drawn between its 0.1% and 99.9% quantiles.
df = 10_000
theta = 2
nc = np.sqrt(2 * df) * theta
x = np.linspace(*(ncx2.ppf(q, df, nc) for q in (0.001, 0.999)), 1000)
plt.plot(x, ncx2.pdf(x, df, nc), label='theta = {:.2f}'.format(theta))
plt.legend()
plt.show()


#%%
def sample_nc2x(theta=0, df=100, size=1000):
    """
    Draw samples as the sum of a central chi-squared variate with ``df - 1``
    degrees of freedom and ``nc + 2 * sqrt(nc) * Z + Z**2`` with Z standard
    normal, where ``nc = theta * sqrt(2 * df)``.
    """
    nc = theta * np.sqrt(2 * df)
    central_part = chi2.rvs(df - 1, size=size)
    gaussian = norm.rvs(size=size)
    # nc + 2 sqrt(nc) Z + Z^2 is the expansion of (Z + sqrt(nc))^2.
    shifted_square = nc + 2 * np.sqrt(nc) * gaussian + gaussian**2
    return central_part + shifted_square
"""
Comparison of the non-central chi-squared to the Gaussian.
"""
import pandas as pd
from scipy.stats import uniform, ncx2, norm
from timeit import default_timer as timer

if __name__ == '__main__':
    # Time the inverse CDF of the non-central chi-squared against that of
    # the Gaussian on the same uniform samples, for a grid of degrees of
    # freedom (nu) and non-centralities (lambda).
    nus = [1, 5, 10, 50]
    lambdas = [1, 5, 10, 50, 100, 200]
    n = 10000
    res = {}
    for nu in nus:
        res[nu] = {}
        for l in lambdas:
            u = uniform.rvs(size=n)
            start = timer()
            ncx2.ppf(u, df=nu, nc=l)
            elapsed_ncx2 = (timer() - start) / n
            start = timer()
            norm.ppf(u)
            elapsed_norm = (timer() - start) / n
            # How many times slower the non-central chi-squared is,
            # truncated to an integer.
            res[nu][l] = int(elapsed_ncx2 / elapsed_norm)
    df = pd.DataFrame(res)
    df.index = df.index.rename('lambda')
    df.columns = df.columns.rename('nu')
    # print is a function in Python 3; the statement form is a syntax error.
    print(df)
linewidth=0.8) ax_cdf = ax.twinx() ax_cdf.set_ylabel('cdf') ax.set_xlim((0, 40)) ax.xaxis.set_major_locator(MultipleLocator(5)) ax.xaxis.set_minor_locator(MultipleLocator(1)) xlim = ax.get_xlim() ax_cdf.hist(random, density=True, bins=cum_bins, cumulative=True, histtype='step', color='black', linewidth=0.6) ax_cdf.plot(x, ncx2.cdf(x, df, nc), c='red', linewidth=0.8) cl_95 = ncx2.ppf(0.954, df, nc) cl_99 = ncx2.ppf(0.997, df, nc) ax_cdf.plot((cl_95, cl_95), (0, 1), c='black', linestyle='-.', linewidth=0.5) ax_cdf.plot((cl_99, cl_99), (0, 1), c='black', linestyle='--', linewidth=0.5) ax_cdf.set_ylim((0, 1)) ax_cdf.text(cl_95, 0.5, '$2\sigma$', rotation=90) ax_cdf.text(cl_99, 0.5, '$3\sigma$', rotation=90) # convert the actual data/random numbers into a pdf and cfd! # n, bins is the data and bin boundaries, respectively # works, but does not look so good! need improvement on optics data_rv = rv_histogram((n, bins)) fig_2 = plt.figure(2, dpi=150) ax_2 = fig_2.add_subplot(111) ax_2.xaxis.set_major_locator(MultipleLocator(5)) ax_2.xaxis.set_minor_locator(MultipleLocator(1))
import numpy as np
from scipy.stats import ncx2
import matplotlib.pyplot as plt

fig, ax = plt.subplots(1, 1)

# Calculate a few first moments:

df, nc = 21, 1.06
mean, var, skew, kurt = ncx2.stats(df, nc, moments='mvsk')

# Display the probability density function (``pdf``):

x = np.linspace(ncx2.ppf(0.01, df, nc), ncx2.ppf(0.99, df, nc), 100)
ax.plot(x, ncx2.pdf(x, df, nc), 'r-', lw=5, alpha=0.6, label='ncx2 pdf')

# Alternatively, the distribution object can be called (as a function)
# to fix the shape, location and scale parameters. This returns a "frozen"
# RV object holding the given parameters fixed.

# Freeze the distribution and display the frozen ``pdf``:

rv = ncx2(df, nc)
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')

# Check accuracy of ``cdf`` and ``ppf``:

vals = ncx2.ppf([0.001, 0.5, 0.999], df, nc)
np.allclose([0.001, 0.5, 0.999], ncx2.cdf(vals, df, nc))
# True

# Generate random numbers:
def docalc(args, data, len_data, sims, len_sims, error):
    """
    Compute the requested fitness functions and store them in ``error``.

    :param args: options object. Attributes read here: ``error`` (acronyms of
        the functions to compute), ``do_all``, ``report``, ``crit`` (critical
        values table for the U-test), ``lower``, ``upper``, ``stdv`` and
        ``factor`` (equivalence interval of the double U-test). ``error``,
        ``lower`` and ``upper`` may be overwritten.
    :param data: experimental replicates; ``data.loc[i]`` must yield the
        table of replicate ``i``.
    :param len_data: number of experimental replicates.
    :param sims: simulated replicates, indexed like ``data``.
    :param len_sims: number of simulated replicates.
    :param error: dict updated in place; each computed function is stored as
        a formatted string under its acronym.

    # Fitness Calculation Template:
    if set(args.error).issuperset(set(['the-acronym'])):
        1. func = 0

        2. func = an algebraic expression combining the data average (data_avrg), data standard deviation (data_stdv),
        simulation average (sims_avrg), simulation standard deviation (sims_stdv), single experimental files (data.loc[i]),
        and/or simulation files (sims.loc[j])
        Note1: Perform two for-loops if using data.loc[i] and sims.loc[j].
        Note2: Please consider these variables are DataFrames, meaning that multiplication and division are methods (e.g. df1.divide(df2))

        3. Drop NaN values (from experimental time points without simulated values, or simulated values without experimental data)
        with dropna(axis = 0, how = 'all').dropna(axis = 1, how = 'all'). Also transform Inf values with replace([numpy.inf, -numpy.inf], numpy.nan)

        4. Sum the two dimensions, and return a 6 float points scientific notation number (0 float points for statistical tests):
        error['the-acronym'] = '{:.6e}'.format(func.dropna(axis = 0, how = 'all').dropna(axis = 1, how = 'all').sum().sum())
    """

    def _total(frame, mask_inf=False):
        # Sum over both dimensions after dropping all-NaN rows and columns;
        # optionally turn +/-Inf into NaN first.
        if mask_inf:
            frame = frame.replace([numpy.inf, -numpy.inf], numpy.nan)
        return frame.dropna(axis=0, how='all').dropna(axis=1, how='all').sum().sum()

    def _zeros_like(template):
        # Numeric all-zero frame with the labels of `template`.
        return pandas.DataFrame(0, index=template.index, columns=template.columns)

    def _indicator(diff):
        # trunc(0.5 * (sign(diff) + 1)): 1 where diff > 0, 0 where diff <= 0.
        diff = diff.dropna(axis=0, how='all').dropna(axis=1, how='all')
        diff = diff.apply(numpy.sign)
        diff = diff + 1
        diff = diff.multiply(0.5)
        return diff.apply(numpy.trunc)

    if args.do_all:
        args.error = [
            'SDA', 'ADA', 'SSQ', 'CHISQ', 'MNSE', 'PWSD', 'APWSD', 'NPWSD',
            'ANPWSD', 'MWUT', 'WMWET', 'TOST', 'DUT'
        ]
        # SDA    : Squared Difference of Averages
        # ADA    : Absolute Difference of Averages
        # SSQ    : Sum of SQuares
        # CHISQ  : Chi-Square (Differences divided by data standard deviation)
        # MNSE   : Mean Normalized Square Error (Differences divided by data average)
        # PWSD   : Pair-Wise Square Deviation
        # APWSD  : Absolute Pair-Wise Deviation
        # NPWSD  : Normalized Pair-Wise Square Deviation
        # ANPWSD : Absolute Normalized Pair-Wise Deviation
        # MWUT   : Mann-Whitney U-test (Mann and Whitney, 1947, DOI 10.1214/aoms/1177730491)
        # WMWET  : Wellek's Mann-Whitney Equivalence Test (Wellek 1996, DOI 10.1002/bimj.4710380608)
        # TOST   : Two one-sided t-tests (Dunnet and Gent, 1977, DOI 10.2307/2529457, as well other authors)
        # DUT    : Double Mann-Whitney U-tests (Reviewed in Cornell, 1990, DOI 10.1080/03610929008830433)
        # More information in https://pleione.readthedocs.io/en/latest/ObjectiveFunctions.html

        # Shared statistics are computed once when everything is requested.
        data_avrg = doavrg(data, len_data)
        data_stdv = dostdv(data, len_data)
        sims_avrg = doavrg(sims, len_sims)
        sims_stdv = dostdv(sims, len_sims)

    requested = set(args.error)

    # former mean square error, now square difference of means
    if 'SDA' in requested or 'MSE' in requested:
        func = 0
        if not args.do_all:
            data_avrg = doavrg(data, len_data)
            sims_avrg = doavrg(sims, len_sims)
        func = (data_avrg - sims_avrg)**2
        error['SDA'] = '{:.6e}'.format(_total(func))

    # former mean absolute error, now absolute value of the difference of means
    if 'ADA' in requested or 'MAE' in requested:
        func = 0
        if not args.do_all:
            data_avrg = doavrg(data, len_data)
            sims_avrg = doavrg(sims, len_sims)
        func = abs(data_avrg - sims_avrg)
        error['ADA'] = '{:.6e}'.format(_total(func))

    # sum of squares (from BioNetFit paper)
    if 'SSQ' in requested:
        func = 0
        for i in range(len_data):
            for j in range(len_sims):
                func += (data.loc[i] - sims.loc[j])**2
        error['SSQ'] = '{:.6e}'.format(_total(func))

    # chi-square (from BioNetFit paper)
    if 'CHISQ' in requested:
        func = 0
        if not args.do_all:
            data_stdv = dostdv(data, len_data)
        for i in range(len_data):
            for j in range(len_sims):
                func += ((data.loc[i] - sims.loc[j]).divide(data_stdv))**2
        error['CHISQ'] = '{:.6e}'.format(_total(func))

    # mean normalized square error (from BioNetFit paper)
    if 'MNSE' in requested:
        func = 0
        if not args.do_all:
            data_avrg = doavrg(data, len_data)
        for i in range(len_data):
            for j in range(len_sims):
                func += ((data.loc[i] - sims.loc[j]).divide(data_avrg))**2
        error['MNSE'] = '{:.6e}'.format(_total(func, mask_inf=True))

    # pair-wise square deviation
    if 'PWSD' in requested:
        func = 0
        for i in range(len_data):
            for j in range(len_sims):
                func += ((data.loc[i] - sims.loc[j])**2).divide(len_data * len_sims)
        error['PWSD'] = '{:.6e}'.format(_total(func))

    # pair-wise absolute deviation
    if 'APWSD' in requested:
        func = 0
        for i in range(len_data):
            for j in range(len_sims):
                func += (abs(data.loc[i] - sims.loc[j])).divide(len_data * len_sims)
        error['APWSD'] = '{:.6e}'.format(_total(func))

    # normalized pair-wise square deviation (also implemented in BioNetFit as equation 3,
    # but not normalized by the number of data * sims)
    if 'NPWSD' in requested:
        func = 0
        for i in range(len_data):
            for j in range(len_sims):
                func += (((data.loc[i] - sims.loc[j]).divide(
                    data.loc[i]))**2).divide(len_data * len_sims)
        error['NPWSD'] = '{:.6e}'.format(_total(func, mask_inf=True))

    # normalized pair-wise absolute deviation
    if 'ANPWSD' in requested:
        func = 0
        for i in range(len_data):
            for j in range(len_sims):
                func += (abs((data.loc[i] - sims.loc[j]).divide(
                    data.loc[i]))).divide(len_data * len_sims)
        error['ANPWSD'] = '{:.6e}'.format(_total(func, mask_inf=True))

    # Wellek's Mann-Whitney Equivalence Test.
    # Based on mawi.R script from the EQUIVNONINF package
    # modifications done to perform the test "vectorized"
    # (it compares two matrices; the first has all exp data, the second all the simulations)
    if 'WMWET' in requested:
        from scipy.stats import ncx2
        # useful variables (namespace identical to mawi.R script)
        m = len_data  # x = data
        n = len_sims  # y = sims
        eps1_ = .3129  # Wellek's paper
        eps2_ = .2661  # Wellek's paper
        eqctr = 0.5 + (eps2_ - eps1_) / 2
        eqleng = eps1_ + eps2_

        # estimators needed for calculations
        wxy = _zeros_like(sims.loc[0])
        pihxxy = _zeros_like(sims.loc[0])
        pihxyy = _zeros_like(sims.loc[0])

        # ŷ estimator (wxy in mawi.R)
        # equation 1.2 from Wellek 1996 paper
        # for (i in 1:m) for (j in 1:n) wxy <- wxy + trunc(0.5 * (sign(x[i] - y[j]) + 1))
        for i in range(m):
            for j in range(n):
                # add to ŷ (wxy in mawi.R)
                wxy += _indicator(data.loc[i] - sims.loc[j])

        # yFFG estimator (pihxxy in mawi.R)
        # equation 2.5a from Wellek 1996 paper
        # for (i1 in 1:(m - 1)) for (i2 in (i1 + 1):m) for (j in 1:n)
        #     pihxxy <- pihxxy + trunc(0.5 * (sign(min(x[i1], x[i2]) - y[j]) + 1))
        for xi1 in range(m - 1):
            for xi2 in range(xi1 + 1, m):
                for xj in range(n):
                    # element-wise minimum of the two data replicates
                    smaller = data.loc[xi1].where(data.loc[xi1] < data.loc[xi2], data.loc[xi2])
                    # add to yFFG (pihxxy in mawi.R)
                    pihxxy += _indicator(smaller - sims.loc[xj])

        # yFGG estimator (pihxyy in mawi.R)
        # equation 2.5b from Wellek 1996 paper
        # for (i in 1:m) for (j1 in 1:(n - 1)) for (j2 in (j1 + 1):n)
        #     pihxyy <- pihxyy + trunc(0.5 * (sign(x[i] - max(y[j1], y[j2])) + 1))
        for xi in range(m):
            for xj1 in range(n - 1):
                for xj2 in range(xj1 + 1, n):
                    # element-wise maximum of the two simulated replicates
                    larger = sims.loc[xj1].where(sims.loc[xj1] > sims.loc[xj2], sims.loc[xj2])
                    # add to yFGG (pihxyy in mawi.R)
                    pihxyy += _indicator(data.loc[xi] - larger)

        # in equation 1.2
        wxy = wxy.divide(m * n)
        # in equation 2.5a, inverse of (m choose 2 = 0.5 * (m-1) * m), then divided by n
        pihxxy = pihxxy.multiply(2).divide(m * (m - 1) * n)
        # in equation 2.5b, inverse of (n choose 2 = 0.5 * (n-1) * n), then divided by m
        pihxyy = pihxyy.multiply(2).divide(n * (n - 1) * m)

        # variance estimator sigmah (same name as in mawi.R)
        # equation 2.6 from Wellek 1996 paper
        # sigmah <- sqrt((wxy - (m + n - 1) * wxy^2 + (m - 1) * pihxxy + (n - 1) * pihxyy)/(m * n))
        sigmah = wxy - (wxy**2).multiply(m + n - 1) + pihxxy.multiply(
            m - 1) + pihxyy.multiply(n - 1)
        sigmah = sigmah.divide(m * n)
        sigmah = sigmah**0.5

        # critical value
        # right hand of inequality 2.8 from Wellek 1996 paper
        phi = ((eqleng / 2) / sigmah)**2
        # crit <- sqrt(qchisq(alpha, 1, (eqleng/2/sigmah)^2))
        # Ca(phi) is the square root of the alpha-th quantile of the chi2-distribution
        # with a single degree of freedom and non-centrality parameter phi square
        crit = pandas.DataFrame(data=ncx2.ppf(0.05, 1, phi),
                                index=sims.loc[0].index,
                                columns=sims.loc[0].columns)**.5

        # compare with Z
        # left hand side of the inequality 2.8 from Wellek 1996 paper
        Z = abs((wxy - eqctr).divide(sigmah))
        z = Z.copy(deep=True)

        # we want to maximize the amount of true alternative hypotheses, so
        # we purposely changed the values to use the Wellek's test as an objective function to minimize

        # test the inequality 2.8 from Wellek 1996 paper
        # the test cannot reject null hypothesis: P[X-Y] < .5 - e1 or P[X-Y] > .5 + e2
        Z[z >= crit] = +1.0
        # the null hypothesis is rejected, therefore .5 - e1 < P[X-Y] < .5 + e2
        Z[z < crit] = +0.0

        if args.report:
            print('wxy estimator:\n', wxy, '\n')
            print('pihxxy estimator:\n', pihxxy, '\n')
            print('pihxyy estimator:\n', pihxyy, '\n')
            print('sigmah estimator:\n', sigmah, '\n')
            print('phi matrix:\n', phi, '\n')
            print('critical values:\n', crit, '\n')
            print('Z estimator: \n', Z, '\n')
            print(
                'Wellek\'s test matrix: a zero means data and simulations are equivalents within the threshold\n',
                Z)

        error['WMWET'] = '{:.0f}'.format(Z.sum().sum())

    # the same as WMWET, but as identical as the Wellek's paper (look for the heaviside function)
    if 'WMWET_paper' in requested:
        from scipy.stats import ncx2
        # This branch used to reference x, y, m and n without defining them;
        # it now works on data/sims directly (x = data, y = sims in mawi.R).
        m = len_data
        n = len_sims
        eps1_ = .3129  # Wellek's paper
        eps2_ = .2661  # Wellek's paper
        eqctr = 0.5 + (eps2_ - eps1_) / 2
        eqleng = eps1_ + eps2_

        # estimators needed for calculations
        wxy = _zeros_like(sims.loc[0])
        pihxxy = _zeros_like(sims.loc[0])
        pihxyy = _zeros_like(sims.loc[0])

        # ŷ estimator (wxy in mawi.R)
        # for (i in 1:m) for (j in 1:n) wxy <- wxy + trunc(0.5 * (sign(x[i] - y[j]) + 1))
        for i in range(m):
            for j in range(n):
                diff = (data.loc[i] - sims.loc[j]).dropna(axis=0, how='all').dropna(
                    axis=1, how='all')
                wxy += numpy.heaviside(diff, 0)

        # yFFG estimator (pihxxy in mawi.R)
        # for (i1 in 1:(m - 1)) for (i2 in (i1 + 1):m) for (j in 1:n)
        #     pihxxy <- pihxxy + trunc(0.5 * (sign(min(x[i1], x[i2]) - y[j]) + 1))
        for xi1 in range(m - 1):
            for xi2 in range(xi1 + 1, m):
                for xj in range(n):
                    diff1 = (data.loc[xi1] - sims.loc[xj]).dropna(
                        axis=0, how='all').dropna(axis=1, how='all')
                    diff2 = (data.loc[xi2] - sims.loc[xj]).dropna(
                        axis=0, how='all').dropna(axis=1, how='all')
                    # both replicates exceed the simulation <=> their minimum does
                    pihxxy += numpy.heaviside(diff1, 0) * numpy.heaviside(diff2, 0)

        # yFGG estimator (pihxyy in mawi.R)
        # for (i in 1:m) for (j1 in 1:(n - 1)) for (j2 in (j1 + 1):n)
        #     pihxyy <- pihxyy + trunc(0.5 * (sign(x[i] - max(y[j1], y[j2])) + 1))
        for xi in range(m):
            for xj1 in range(n - 1):
                for xj2 in range(xj1 + 1, n):
                    diff1 = (data.loc[xi] - sims.loc[xj1]).dropna(
                        axis=0, how='all').dropna(axis=1, how='all')
                    diff2 = (data.loc[xi] - sims.loc[xj2]).dropna(
                        axis=0, how='all').dropna(axis=1, how='all')
                    # the replicate exceeds both simulations <=> it exceeds their maximum
                    pihxyy += numpy.heaviside(diff1, 0) * numpy.heaviside(diff2, 0)

        wxy = wxy.divide(m * n)
        pihxxy = pihxxy.multiply(2).divide(m * (m - 1) * n)
        pihxyy = pihxyy.multiply(2).divide(n * (n - 1) * m)

        # variance estimator sigmah (same name as in mawi.R)
        # sigmah <- sqrt((wxy - (m + n - 1) * wxy^2 + (m - 1) * pihxxy + (n - 1) * pihxyy)/(m * n))
        sigmah = wxy - (wxy**2).multiply(m + n - 1) + pihxxy.multiply(
            m - 1) + pihxyy.multiply(n - 1)
        sigmah = sigmah.divide(m * n)
        sigmah = sigmah**0.5

        # critical value
        # crit <- sqrt(qchisq(alpha, 1, (eqleng/2/sigmah)^2))
        phi = (eqleng / 2 / sigmah)**2
        crit = pandas.DataFrame(data=ncx2.ppf(0.05, 1, phi),
                                index=sims.loc[0].index,
                                columns=sims.loc[0].columns)**.5

        # compare with Z
        Z = abs((wxy - eqctr).divide(sigmah))
        z = Z.copy(deep=True)
        # the null hypothesis is rejected, therefore .5 - e1 < P[X-Y] < .5 + e2
        Z[z < crit] = +0.0
        # the test cannot reject the null hypothesis: P[X-Y] < .5 - e1 or P[X-Y] > .5 + e2
        Z[z >= crit] = +1.0

        if args.report:
            print('wxy estimator:\n', wxy, '\n')
            print('pihxxy estimator:\n', pihxxy, '\n')
            print('pihxyy estimator:\n', pihxyy, '\n')
            print('sigmah estimator:\n', sigmah, '\n')
            print('phi matrix:\n', phi, '\n')
            print('critical values:\n', crit, '\n')
            print('Z estimator: \n', Z, '\n')
            print(
                'Wellek\'s test matrix: a zero means data and simulations are equivalents within the threshold\n',
                Z)

        error['WMWET_paper'] = '{:.0f}'.format(Z.sum().sum())

    if 'TOST' in requested:
        print(
            "WARNING: data and/or simulations not necessarily are normal distributions."
        )
        print(
            "As a test-bed, we consider data and simulations have unequal standard deviations"
        )
        print(
            "See https://www.statsmodels.org/devel/generated/statsmodels.stats.weightstats.ttost_ind.html for more information"
        )
        from statsmodels.stats.weightstats import ttost_ind

        if not args.do_all:
            data_stdv = dostdv(data, len_data)

        # reshape data and sims to allow calculate the test in a for-loop
        tost_sims = numpy.dstack([sims.loc[x] for x in range(len_sims)])
        # since we operate numpy arrays without labels, we must ensure sims and data
        # indexes and columns have the same order
        index = data.loc[0].index
        columns = data.loc[0].columns
        tost_data = numpy.dstack([
            data.loc[x].reindex(columns=columns, index=index)
            for x in range(len_data)
        ])

        p = numpy.zeros((len(data_stdv.index), len(data_stdv.columns)))
        row = 0
        # the equivalence limits are +/- one data standard deviation
        for x, y, lim in zip(tost_sims, tost_data, data_stdv.values):
            for col, _ in enumerate(data_stdv.columns):
                p[row, col] = ttost_ind(x[col], y[col], -lim[col], +lim[col])[0]
            row += 1

        # transform matrix of p-values into a non-rejection DataFrame
        # (if p-value less than 5% -> rejects, but set to zero)
        p = pandas.DataFrame(index=index, columns=columns, data=p)
        P = p.copy(deep=True)
        P[p >= .05] = +1.0
        P[p < .05] = +0.0

        if args.report:
            print(
                'Two one-sided t-tests matrix: a zero means data and simulations are equivalents within one standard deviation threshold\n',
                P)

        error['TOST'] = '{:.0f}'.format(P.sum().sum())

    # Mann-Whitney U-test
    def mwut(data, sims, alternative):
        # Returns the number of significant entries (as a string) and the
        # 0/1 significance matrix itself.
        ucrit = pandas.read_csv(args.crit,
                                sep=None,
                                engine='python',
                                header=0,
                                index_col=0)
        udata = _zeros_like(sims.loc[0])
        usims = _zeros_like(sims.loc[0])

        for i in range(len_data):
            for j in range(len_sims):
                Diff = (data.loc[i] - sims.loc[j]).dropna(
                    axis=0, how='all').dropna(axis=1, how='all')
                diff = Diff.copy(deep=True)
                # transform data
                # if data < sims, count -1.0
                Diff[diff < 0] = -1.0
                # if data > sims, count +1.0
                Diff[diff > 0] = +1.0
                # if data = sims, count +0.5
                Diff[diff == 0] = +0.5

                # count how many times is data < sims (udata and usims are complementary)
                diff = Diff.copy(deep=True)
                udata += Diff[diff == -1.0].fillna(0).divide(-1) + Diff[
                    diff == +0.5].fillna(0)
                usims += Diff[diff == +1.0].fillna(0).divide(+1) + Diff[
                    diff == +0.5].fillna(0)

        if alternative == 'two-sided':
            # bigU is max(udata, usims), where udata and usims are DataFrames
            bigU = udata.where(udata >= usims).fillna(
                usims.where(usims >= udata))
        if alternative == 'less':
            bigU = udata
        if alternative == 'greater':
            bigU = usims

        U = len_data * len_sims - bigU
        u = U.copy(deep=True)
        # U is significant if it is less than or equal to a critical value
        U[u <= ucrit.loc[len_sims, str(len_data)]] = +1.0
        U[u > ucrit.loc[len_sims, str(len_data)]] = +0.0

        if args.report:
            print('U-estimator for data\n', udata, '\n')
            print('U-estimator for sims\n', usims, '\n')
            if alternative == 'two-sided':
                print(
                    'U-test matrix: A one means data and sims are differents\n',
                    U, '\n')
            if alternative == 'less':
                print(
                    'U-test matrix: A one means data is smaller than sims (shifted to the right)\n',
                    U, '\n')
            if alternative == 'greater':
                print(
                    'U-test matrix: A one means data is greater than sims (shifted to the left)\n',
                    U, '\n')

        return '{:.0f}'.format(U.sum().sum()), U

    if 'MWUT' in requested:
        if (len_data >= 3 and len_sims >= 3):
            error['MWUT'] = mwut(data, sims, 'two-sided')[0]
        else:
            error['MWUT'] = str(numpy.nan)

    if 'DUT' in requested:
        if (len_data >= 3 and len_sims >= 3):
            # set what the user wants
            if args.lower is not None and args.upper is None:
                args.upper = args.lower  # symmetric equivalence interval
            if args.lower is None and args.upper is not None:
                args.lower = args.upper  # symmetric equivalence interval

            if args.lower is None and args.upper is None:
                if not args.do_all:
                    if args.stdv == 'sims':
                        lower = upper = dostdv(sims, len_sims)
                    else:
                        lower = upper = dostdv(data, len_data)
                else:
                    if args.stdv == 'sims':
                        lower = upper = sims_stdv
                    else:
                        lower = upper = data_stdv
            # NOTE(review): when args.lower/args.upper are supplied, `lower` and
            # `upper` are never bound here and the lines below raise NameError --
            # confirm how user-supplied limits are meant to be loaded.

            # divide by factor
            lower = lower / float(args.factor)
            upper = upper / float(args.factor)

            # test lower limit
            lowered_sims = pandas.concat(
                [sims.loc[i] - lower for i in range(len_sims)],
                keys=range(len_sims))
            # test data > sims - lower with one-tail U-test
            LB = mwut(data, lowered_sims, 'greater')[1]

            # test upper limit
            raised_sims = pandas.concat(
                [sims.loc[i] + upper for i in range(len_sims)],
                keys=range(len_sims))
            # test data < sims + upper with one-tail U-test
            UB = mwut(data, raised_sims, 'less')[1]

            # rejection DataFrame (U-test report with ones true alternative hypotheses)
            # both one-sided tests should reject the null hypotheses
            U = LB * UB

            # However, we minimize the number of non-rejected null hypotheses
            # transform U into a non-rejection DataFrame.
            U = numpy.logical_xor(U.values, 1).astype(int)
            U = pandas.DataFrame(index=LB.index, columns=LB.columns, data=U)

            if args.report:
                print(
                    'Double U-test matrix: 1.0 means data and sims are not equivalents if sims are shifted:\n',
                    U, '\n')

            error['DUT'] = '{:.0f}'.format(U.sum().sum())

        else:
            error['DUT'] = str(numpy.nan)