def iso_heterochronicity(language_annotator):
    """Return two KS statistics for one annotator's filtered lengths.

    Reads the module-level ``data`` table, takes the ``length_filtered``
    values of the rows whose ``language_annotator`` equals the argument,
    drops the last selected entry and every NaN, and runs two one-sample
    Kolmogorov-Smirnov tests on what remains.

    :param language_annotator: value compared against
        ``data["language_annotator"]``.
    :return: ``(test_iso, test_het)`` -- the KS statistic against a normal
        distribution whose ``loc`` is the sample mean (scale is left at its
        default), and the KS statistic against ``uniform(loc=0, scale=0.5)``.
    """
    # Alternative noted by the author:
    # stats.ks_2samp(uniform_ratio_list, your_list)
    selected = data["length_filtered"][data["language_annotator"] == language_annotator]
    raw_lengths = selected[:-1].values
    lengths = raw_lengths[~numpy.isnan(raw_lengths)]

    normal_args = [numpy.mean(lengths)]
    uniform_args = [0, 0.5]
    iso_result = stats.kstest(lengths, "norm", N=1000000, args=normal_args)
    het_result = stats.kstest(lengths, "uniform", N=1000000, args=uniform_args)
    return iso_result[0], het_result[0]
def _get_xy_dataset_statistics(x_values, y_values, fcorrect_x_cutoff = 1.0, fcorrect_y_cutoff = 1.0, x_fuzzy_range = 0.1, y_scalar = 1.0):
    '''
    A function which takes two lists of values of equal length with corresponding entries and returns a dict
    containing a variety of metrics.

    :param x_values: A list of values for the X-axis (experimental values).
    :param y_values: A list of values for the Y-axis (predicted values).
    :param fcorrect_x_cutoff: See get_xy_dataset_statistics.
    :param fcorrect_y_cutoff: See get_xy_dataset_statistics.
    :param x_fuzzy_range: See get_xy_dataset_statistics.
    :param y_scalar: See get_xy_dataset_statistics.
    :return: A dict mapping each metric name to the raw result of the routine that computes it.
    '''
    # Imported locally, as in the original; only the names actually used are pulled in.
    from scipy.stats import pearsonr, spearmanr, normaltest, ks_2samp, kstest

    assert(len(x_values) == len(y_values))
    return dict(
        pearsonr = pearsonr(x_values, y_values),
        spearmanr = spearmanr(x_values, y_values),
        gamma_CC = gamma_CC(x_values, y_values),
        MAE = mae(x_values, y_values),
        normaltestx = normaltest(x_values),
        normaltesty = normaltest(y_values),
        # One-sample KS tests against the standard normal (no fitted parameters are passed).
        kstestx = kstest(x_values, 'norm'),
        kstesty = kstest(y_values, 'norm'),
        ks_2samp = ks_2samp(x_values, y_values),
        fraction_correct = fraction_correct(x_values, y_values, x_cutoff = fcorrect_x_cutoff, y_cutoff = fcorrect_y_cutoff),
        fraction_correct_fuzzy_linear = fraction_correct_fuzzy_linear(x_values, y_values, x_cutoff = fcorrect_x_cutoff, x_fuzzy_range = x_fuzzy_range, y_scalar = y_scalar),
    )
def __init__(self, data, **kwargs):
    r"""Constructor.

    Fits one chi2 distribution to the strictly positive entries of *data*
    and another to the negated strictly negative entries, then records the
    fraction of positive entries and a KS p-value for each fit.

    *data* - Data sample to use for fitting

    Keyword Argument:

    *chi1/2* - Keyword arguments like floc, fshape, etc. that are passed
               to ``chi2.fit`` for the corresponding regime.

    """
    data = np.asarray(data)
    fit_kwargs_pos = kwargs.pop("chi1", dict())
    fit_kwargs_neg = kwargs.pop("chi2", dict())

    is_positive = data > 0.
    positive_part = data[is_positive]
    # The negative regime is mirrored so that chi2 sees positive values.
    mirrored_negative_part = -data[data < 0.]

    self.par1 = chi2.fit(positive_part, **fit_kwargs_pos)
    self.par2 = chi2.fit(mirrored_negative_part, **fit_kwargs_neg)

    self.f1 = chi2(*self.par1)
    self.f2 = chi2(*self.par2)

    # Fraction of positive entries and its binomial standard error.
    n_total = len(data)
    self.eta = float(np.count_nonzero(is_positive)) / n_total
    self.eta_err = np.sqrt(self.eta * (1. - self.eta) / n_total)

    # Fit quality: KS p-value of each regime against its own fitted chi2.
    self.ks1 = kstest(positive_part, "chi2", args=self.par1)[1]
    self.ks2 = kstest(mirrored_negative_part, "chi2", args=self.par2)[1]
def start_routine(filename, P, Me_range, Re_range, n_range, noise_level, R, zp):
    """Fit a grid of noisy two-component test profiles and log the results as CSV.

    For every combination of ``nB``, ``ReB`` and ``MeB`` a synthetic profile
    is built with ``S.sersic2`` plus one shared noise realisation, fitted
    with ``S.fit``, and one row is written to *filename*.

    :param filename: path of the CSV file to write.
    :param P: parameter set providing ``MeD``, ``ReD`` and ``nD``.
    :param Me_range: iterable of initial ``MeB`` values.
    :param Re_range: iterable of initial ``ReB`` values.
    :param n_range: iterable of initial ``nB`` values.
    :param noise_level: passed to ``make_noise`` and echoed in the header.
    :param R: radii; ``R[0]`` and ``R[-1]`` are reported as the range.
    :param zp: passed through to ``S.sersic2`` and ``S.fit``.
    :return: None.
    """
    noise = make_noise(R, noise_level)
    bulge_names = ['MeB', 'ReB', 'nB']
    disc_names = ['MeD', 'ReD', 'nD']
    columns = ['MeB_initial', 'ReB_initial', 'nB_initial',
               'MeD_final', 'ReD_final', 'nD_final',
               'MeB_final', 'ReB_final', 'nB_final',
               'redchi2_all', 'redchi2_excl', 'KS', 'KS_excl']

    # NOTE(review): 'wb' is the Python 2 convention for csv files; on
    # Python 3 this must become open(filename, 'w', newline='').
    with open(filename, 'wb') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        header = ['%s = %.2f' % (P[i].name, P[i].value) for i in disc_names]
        writer.writerow(header + ['range= [%.1f, %.1f]' % (R[0], R[-1])] + ['noise level = %.1f' % noise_level])
        writer.writerow(columns)
        for nB in n_range:
            for ReB in Re_range:
                for MeB in Me_range:
                    pars = S.copy_params(P, False)
                    pars.add_many(('MeB', float(MeB), True, 1.),
                                  ('ReB', float(ReB), True, 0.01),
                                  ('nB', float(nB), True, 0.1))
                    pars['nD'].vary = False
                    test_gal = S.sersic2(pars, R, zp, False) + noise
                    new_pars = S.copy_params(pars, False)
                    fit_data, res_excl = S.fit(new_pars, S.sersic2, R, zp, test_gal,
                                               weights=None, fit_range=None, redchi_marker=30.)
                    initials = [pars[i].value for i in bulge_names]
                    if fit_data is None:
                        writer.writerow(['N/A'] * len(columns))
                        continue
                    # Disc values first, then bulge values, to match the column header.
                    finals = [new_pars[i].value for i in disc_names + bulge_names]
                    redchi_excl = np.sum(res_excl) / fit_data.nfree
                    KS = stats.kstest(fit_data.residual, 'norm')[1]
                    KS_excl = stats.kstest(res_excl, 'norm')[1]
                    # NOTE(review): assumes fit_data exposes ``redchi`` like an
                    # lmfit result (it already provides ``nfree`` and
                    # ``residual``) -- confirm against S.fit.
                    writer.writerow(initials + finals + [fit_data.redchi, redchi_excl, KS, KS_excl])
def fit_maxwell(input):
    """Fit Maxwell and normal distributions to a one-value-per-line file.

    Prints the file's base name, then for each distribution the fitted
    parameters followed by the KS statistic and p-value of the data against
    that *fitted* distribution, and finally shows a histogram of the data
    with the fitted normal density drawn on top.

    :param input: path of a text file holding one float per line.
    :return: None.
    """
    print(os.path.basename(input))
    with open(input) as handle:
        data = [float(line.strip()) for line in handle]

    # Maxwell fit with the location pinned at zero.
    maxwell_params = stats.maxwell.fit(data, floc=0)
    print(maxwell_params)
    # The fitted parameters are handed to the test; without them the data
    # would be compared with the standard (loc=0, scale=1) distribution.
    d, p = stats.kstest(data, "maxwell", args=maxwell_params, mode="asymp")
    print("%s %s" % (d, p))

    norm_params = stats.norm.fit(data)
    print(norm_params)
    d, p = stats.kstest(data, "norm", args=norm_params, mode="asymp")
    print("%s %s" % (d, p))

    # NOTE(review): ``normed`` was removed in matplotlib 3.1; switch to
    # ``density=True`` once older matplotlib no longer has to be supported.
    plt.hist(data, bins=50, normed=True, alpha=0.6, color='g')

    # Plot the PDF of the fitted normal over the histogram's x-range.
    xmin, xmax = plt.xlim()
    x = np.linspace(xmin, xmax, 100)
    density = stats.norm.pdf(x, norm_params[0], norm_params[1])
    plt.plot(x, density, 'k', linewidth=2)
    title = "Fit results: mu = %.2f, std = %.2f" % norm_params
    plt.title(title)

    plt.show()
def testExponentialSampleMultiDimensional(self):
    """Batched exponential samples are non-negative and match their CDFs."""
    with self.test_session():
        batch_size = 2
        lam_v = [3.0, 22.0]
        lam = constant_op.constant([lam_v] * batch_size)

        exponential = exponential_lib.Exponential(rate=lam)

        n = 100000
        samples = exponential.sample(n, seed=138)
        self.assertEqual(samples.get_shape(), (n, batch_size, 2))

        sample_values = samples.eval()
        self.assertFalse(np.any(sample_values < 0.0))

        # Every (batch row, rate) slice must stay within a KS distance of
        # 0.01 from the exponential with the matching rate.
        for i in range(2):
            for batch_row in (0, 1):
                expected_cdf = stats.expon(scale=1.0 / lam_v[i]).cdf
                ks_distance = stats.kstest(sample_values[:, batch_row, i], expected_cdf)[0]
                self.assertLess(ks_distance, 0.01)
def test_unit_vector(self):
    """Sampled unit vectors have unit norm, uniform azimuth and uniform z."""
    with self._model():
        UnitVector("x", shape=(2, 3))
        trace = self._sample()

    # Make sure that the unit vector constraint is satisfied
    assert np.allclose(np.sum(trace["x"]**2, axis=-1), 1.0)

    # Pull out the components and compute the angle
    x = trace["x"][:, :, 0]
    y = trace["x"][:, :, 1]
    z = trace["x"][:, :, 2]
    theta = np.arctan2(y, x)

    def angle_cdf(value):
        # CDF of a uniform distribution on [-pi, pi].
        return np.clip((value + np.pi) / (2 * np.pi), 0, 1)

    def height_cdf(value):
        # CDF of a uniform distribution on [-1, 1].
        return np.clip((value + 1) / 2, 0, 1)

    # The angle should be uniformly distributed
    for column in range(theta.shape[1]):
        s, p = kstest(theta[:, column], angle_cdf)
        assert s < 0.05

    # As should the vertical component
    for column in range(z.shape[1]):
        s, p = kstest(z[:, column], height_cdf)
        assert s < 0.05
def test_iterative_imputer_truncated_normal_posterior():
    # Test that the values imputed using `sample_posterior=True` with
    # boundaries (`min_value` and `max_value` are not None) are drawn from a
    # distribution that looks gaussian via the Kolmogorov Smirnov test.
    # Note that starting from the wrong random seed will make this test fail
    # because random sampling doesn't occur at all when the imputation
    # is outside of the (min_value, max_value) range.
    pytest.importorskip("scipy", minversion="0.17.0")
    rng = np.random.RandomState(42)

    X = rng.normal(size=(5, 5))
    X[0][0] = np.nan

    imputer = IterativeImputer(min_value=0,
                               max_value=0.5,
                               sample_posterior=True,
                               random_state=rng)

    imputer.fit_transform(X)
    # generate multiple imputations for the single missing value
    imputations = np.array([imputer.transform(X)[0][0] for _ in range(100)])

    assert all(imputations >= 0)
    assert all(imputations <= 0.5)

    mu, sigma = imputations.mean(), imputations.std()
    # Guard against a degenerate sample *before* standardising, so the KS
    # test never sees the result of a division by zero.
    if sigma == 0:
        sigma += 1e-12
    ks_statistic, p_value = kstest((imputations - mu) / sigma, 'norm')
    # we want to fail to reject null hypothesis
    # null hypothesis: distributions are the same
    assert ks_statistic < 0.2 or p_value > 0.1, \
        "The posterior does not appear to be normal"
def get_chiKS(profile, fitsDF, mu_list, h_list, break_R, bounds, infoDF):
    """Return chi-square sums and KS results for two sets of residuals.

    A parameter set is assembled from the supplied disc values, break
    radius and the bulge/disc values stored on ``fitsDF``. Residuals are
    then computed once via ``total_model`` (scaled by ``profile.I_err``)
    and once via ``F.sersic``.

    ``bounds`` is accepted but not used here.

    :return: ``(sum(res**2), kstest(res), sum(res_norm**2), kstest(res_norm))``
        where both KS tests compare against the standard normal.
    """
    disc_entries = [
        ('mu01', mu_list[0]),
        ('mu02', mu_list[1]),
        ('h1', h_list[0]),
        ('h2', h_list[1]),
    ]
    component_entries = [
        ('nB', fitsDF.nB),
        ('ReB', fitsDF.ReB),
        ('MB', fitsDF.MB),
        ('deltaRe', 1., True),
        ('BD_ratio', fitsDF.BD_ratio),
        ('ReD', fitsDF.ReD),
    ]

    P = lm.Parameters()
    P.add_many(*disc_entries)
    P.add('Rbr', break_R)
    P.add_many(*component_entries)

    model_values = total_model(P, profile.R.values, infoDF.zp, False)
    res = (profile.I - model_values) / profile.I_err
    res_norm = F.sersic(P, infoDF.zp, profile.R.values, profile.I.values, profile.I_err, False)

    chi_all = np.sum(res**2.)
    ks_all = stats.kstest(res, 'norm')
    chi_norm = np.sum(res_norm**2.)
    ks_norm = stats.kstest(res_norm, 'norm')
    return chi_all, ks_all, chi_norm, ks_norm
def check_distribution(dist, args, alpha):
    """Assert that fresh draws from a named distribution pass a KS test.

    1000 variates are drawn from ``scipy.stats.<dist>`` and compared with
    the same distribution's CDF. Because the draw is random, a failure at
    level *alpha* is given one retry with a new sample before asserting.

    :param dist: name of a ``scipy.stats`` distribution.
    :param args: shape parameters forwarded to both ``rvs`` and ``cdf``.
    :param alpha: significance level the p-value must exceed.
    :raises AssertionError: if the p-value is still not above *alpha*
        after the retry.
    """
    def draw_and_test():
        # The name is given for both the sample and the reference: an empty
        # cdf string is not resolved by current SciPy, whereas the repeated
        # name is accepted by old and new versions alike.
        return stats.kstest(dist, dist, args=args, N=1000)

    D, pval = draw_and_test()
    if pval < alpha:
        D, pval = draw_and_test()
    assert (pval > alpha), "D = " + str(D) + "; pval = " + str(pval) + \
        "; alpha = " + str(alpha) + "\nargs = " + str(args)
def check_distribution_rvs(dist, args, alpha, rvs):
    """Assert that existing variates pass a KS test against a named distribution.

    Test from scipy.stats.tests; this version reuses existing random
    variables. If the supplied sample fails at level *alpha*, one fresh
    sample of 1000 variates is drawn from the distribution and tested
    instead.

    :param dist: name of a ``scipy.stats`` distribution.
    :param args: shape parameters forwarded to the distribution.
    :param alpha: significance level the p-value must exceed.
    :param rvs: array of previously generated variates.
    :raises AssertionError: if the final p-value is not above *alpha*.
    """
    D, pval = stats.kstest(rvs, dist, args=args, N=1000)
    if (pval < alpha):
        # The name is given for both sample and reference: an empty cdf
        # string is not resolved by current SciPy, whereas the repeated name
        # is accepted by old and new versions alike.
        D, pval = stats.kstest(dist, dist, args=args, N=1000)
    npt.assert_(pval > alpha, "D = " + str(D) + "; pval = " + str(pval) +
                "; alpha = " + str(alpha) + "\nargs = " + str(args))
def single_exp(self, options):
    """
    This method returns a line in the form
    Original_variable | real_parameters | number_of_observations |
    log-likelihood_of_real_params | Estim_params_T2_SSPSC |
    log-likelihood_T2_SSPSC | p-value_T2_SSPSC | Estim_params_T2_StSI |
    log-likelihood_T2_StSI | p-value_T2_StSI | AIC_selected_model |
    AIC_relative_prob

    The input is an array of options containing:
    ['choice of distrib', 'param1, param2', 'number of values',
     'integer alpha flag']
    The choice is '1' for T2_SSPSC (params: alpha, T) or '2' for T2_StSI
    (params: n, M). When the last entry converts to the integer 0 the
    SSPSC fit uses ``exact_maxllk``; otherwise ``exact_maxllk_integer``.

    The procedure is to simulate data with one distribution and then to do
    a KS test for determining if the data comes from a SSPSC or from a
    StSI model. Moreover, it computes AIC values in order to say which
    model explains better the simulated data.

    Returns ``[result_text, observations]``: the formatted line and the
    string form of all simulated values.

    Raises ValueError when the distribution choice is neither '1' nor '2'.
    """
    params = options[1].split(',')
    if options[0]=='1':
        alpha=float(params[0])
        T=float(params[1])
        X = T2_SSPSC(alpha, T)
        type_of_variable = 'T2_SSPSC'
        real_params = '({}, {})'.format(alpha, T)
    elif options[0]=='2':
        n=int(params[0])
        M=float(params[1])
        X = T2_StSI(n, M)
        type_of_variable = 'T2_StSI'
        real_params = str((n, M))
    else:
        # Fail here with a clear message instead of a NameError further down.
        raise ValueError(
            "unknown distribution choice %r (expected '1' or '2')" % (options[0],))
    number_of_observations = int(options[2])
    alpha_integer = options[3]

    # We are going to simulate "number_of_observations" independent values.
    # Then we use a half of that values for parameters estimation and the
    # other half for the KS-test.
    # Floor division keeps the count an integer on Python 3 as well.
    half = number_of_observations // 2

    # Simulating values
    obs_estim = X.simulate_values(half)
    obs_test = X.simulate_values(half)
    obs = obs_estim + obs_test
    real_likelihood = X.log_likelihood(obs_estim)

    # Estimating parameters of both models and doing a KS test
    B = T2_SSPSC()  # We initialize a variable with default parameters
    if int(alpha_integer) == 0:
        [alpha, T, ll_fittedB] = B.exact_maxllk(obs_estim)
    else:
        [alpha, T, ll_fittedB] = B.exact_maxllk_integer(obs_estim)
    fittedB = T2_SSPSC(alpha, T)
    KS_fittedB = kstest(obs_test, fittedB.cdf)
    pvalue_fittedB = KS_fittedB[1]

    S = T2_StSI()  # We initialize a variable with default parameters
    [n, M, ll_fittedS] = S.max_likelihood_estimation(obs_estim)
    fittedS = T2_StSI(n, M)
    KS_fittedS = kstest(obs_test, fittedS.cdf)
    pvalue_fittedS = KS_fittedS[1]

    # Computing AIC values
    (best_model, relative_prob) = self.AIC_compare(ll_fittedB, ll_fittedS)

    fields = [
        type_of_variable,
        real_params,
        options[2],
        str(real_likelihood),
        str((fittedB.alpha, fittedB.T)),
        str(ll_fittedB),
        str(pvalue_fittedB),
        str((fittedS.n, fittedS.M)),
        str(ll_fittedS),
        str(pvalue_fittedS),
        str(best_model),
        str(relative_prob),
    ]
    result_text = ' | '.join(fields)
    observations = str(obs)
    return [result_text, observations]
def check_distribution_rvs(distfn, args, alpha, rvs):
    """Assert that existing variates pass a KS test against ``distfn``.

    Signature changed to avoid calling a distribution by name. Test from
    scipy.stats.tests; this version reuses existing random variables and
    only draws a fresh sample (1000 variates from ``distfn.rvs``) when the
    supplied one fails at level *alpha*.

    :param distfn: distribution object exposing ``rvs`` and ``cdf``.
    :param args: shape parameters forwarded to the distribution.
    :param alpha: significance level the p-value must exceed.
    :param rvs: array of previously generated variates.
    """
    reference_cdf = distfn.cdf
    outcome = stats.kstest(rvs, reference_cdf, args=args, N=1000)
    D, pval = outcome
    if pval < alpha:
        outcome = stats.kstest(distfn.rvs, reference_cdf, args=args, N=1000)
        D, pval = outcome
    report = ("D = " + str(D) + "; pval = " + str(pval) +
              "; alpha = " + str(alpha) + "\nargs = " + str(args))
    npt.assert_(pval > alpha, report)
def printresults(sample, arg, bres, kind='bootstrap'):
    '''calculate and print Bootstrap or Monte Carlo result

    Parameters
    ----------
    sample : array
        original sample data
    arg : float   (for general case will be array)
    bres : array
        parameter estimates from Bootstrap or Monte Carlo run
    kind : {'bootstrap', 'montecarlo'}
        output is printed for Bootstrap (default) or Monte Carlo

    Returns
    -------
    None, currently only printing

    Notes
    -----
    still a bit a mess because it is used for both Bootstrap and Monte Carlo

    made correction:
        reference point for bootstrap is estimated parameter

    not clear:
        I'm not doing any ddof adjustment in estimation of variance, do we
        need ddof>0 ?

    todo: return results and string instead of printing

    Reads the module-level names ``nobs``, ``nrepl`` and ``distr``.
    Every print below is a single parenthesised expression so the output
    is the same under Python 2 and Python 3.
    '''
    print('true parameter value')
    print(arg)
    print('MLE estimate of parameters using sample (nobs=%d)'% (nobs))
    argest = distr.fit_fr(sample, frozen=[np.nan, 0.0, 1.0])
    print(argest)
    if kind == 'bootstrap':
        # bootstrap compares to estimate from sample
        arg = argest

    print('%s distribution of parameter estimate (nrepl=%d)'% (kind, nrepl))
    print('mean = %f, bias=%f' % (bres.mean(0), bres.mean(0)-arg))
    print('median %s' % (np.median(bres, axis=0),))
    print('var and std %s %s' % (bres.var(0), np.sqrt(bres.var(0))))
    bmse = ((bres - arg)**2).mean(0)
    print('mse, rmse %s %s' % (bmse, np.sqrt(bmse)))
    bressorted = np.sort(bres)
    print('%s confidence interval (90%% coverage)' % kind)
    # np.floor returns a float, which NumPy refuses as an index.
    lower_index = int(np.floor(nrepl*0.05))
    upper_index = int(np.floor(nrepl*0.95))
    print('%s %s' % (bressorted[lower_index], bressorted[upper_index]))
    print('%s confidence interval (90%% coverage) normal approximation' % kind)
    print('%s %s' % (stats.norm.ppf(0.05, loc=bres.mean(), scale=bres.std()),
                     stats.norm.isf(0.05, loc=bres.mean(), scale=bres.std())))
    print('Kolmogorov-Smirnov test for normality of %s distribution' % kind)
    print(' - estimated parameters, p-values not really correct')
    print(stats.kstest(bres, 'norm', (bres.mean(), bres.std())))
def test_pos4d_transforms_slit_rotated(photons1000):
    '''Test coordinate transforms on rotated entrance aperture.'''
    # Aperture rotated by 90 degrees about the y axis and scaled to half size.
    quarter_turn = np.deg2rad(90)
    rotation = axangle2aff(np.array([0, 1, 0]), quarter_turn)
    myslit = marxs.optics.aperture.RectangleAperture(orientation=rotation[:3, :3], zoom=0.5)

    p = myslit.process_photons(photons1000)

    assert np.allclose(p['pos'][:, 2], 0)
    # Shifted by 0.5, the two remaining coordinates must pass a KS test
    # against the standard uniform distribution.
    for axis in (0, 1):
        pvalue = kstest(p['pos'][:, axis] + 0.5, "uniform")[1]
        assert pvalue > 0.01
def kstest():
    """Print demonstration KS results for three random normal samples.

    Draws ``a`` ~ N(0, 1) with 200 values, ``b`` ~ N(0.5, 1.5) and
    ``c`` ~ N(0.01, 1) with 300 values each, then prints the two-sample
    tests of ``a`` against ``b`` and ``c`` and the one-sample test of ``a``
    against the standard normal.

    All three prints use the call form so the function is valid on both
    Python 2 and Python 3.
    """
    n1 = 200
    n2 = 300
    a = stats.norm.rvs(size=n1, loc=0, scale=1)
    b = stats.norm.rvs(size=n2, loc=0.5, scale=1.5)
    c = stats.norm.rvs(size=n2, loc=0.01, scale=1)

    print(stats.ks_2samp(a, b))
    print(stats.ks_2samp(a, c))
    print(stats.kstest(a, 'norm'))
def test_step(self):
    """The first two rows of the sampled ``c_eigenvalues`` have uniform angles."""
    o,c = covariance('c',pm.Gamma('v',3,3,size=2))
    M = pm.MCMC([o])
    M.sample(10000,1000)

    def row_angle_fraction(row):
        # Angle of the row's two components, mapped from [-pi, pi] to [0, 1].
        values = M.trace('c_eigenvalues')[:]
        return (np.arctan2(values[:,row,0], values[:,row,1]) + np.pi)/2./np.pi

    theta1 = row_angle_fraction(0)
    theta2 = row_angle_fraction(1)

    d1,p1 = stats.kstest(theta1,'uniform')
    d2,p2 = stats.kstest(theta2,'uniform')

    assert(p1>.05)
    assert(p2>.05)
def check_normality():
    '''Check if the distribution is normal.

    Draws 1000 seeded values from N(0, 3), shows a histogram and a
    probability plot, runs four normality tests on the full sample and on
    its first 100 values, prints both sets of p-values, and returns the
    Kolmogorov-Smirnov p-value of the full sample.
    '''
    # Set the parameters
    numData = 1000
    myMean = 0
    mySD = 3

    # To get reproducible values, a seed value is provided
    np.random.seed(1234)

    # Generate and show random data
    data = stats.norm.rvs(myMean, mySD, size=numData)
    fewData = data[:100]
    plt.hist(data)
    plt.show()

    # --- >>> START stats <<< ---
    # Graphical test: if the data lie on a line, they are pretty much
    # normally distributed
    _ = stats.probplot(data, plot=plt)
    plt.show()

    pVals = pd.Series()
    pFewVals = pd.Series()

    def standardized(values):
        # Centre on the mean and scale by the sample (ddof=1) std deviation.
        return (values-np.mean(values))/np.std(values,ddof=1)

    def plain_ks(values):
        # Original Kolmogorov-Smirnov test against the standard normal.
        return stats.kstest(standardized(values), 'norm')

    # Omnibus: scipy's normaltest, based on D-Agostino and Pearsons test that
    # combines skew and kurtosis. Then Shapiro-Wilk, the Lilliefors test and,
    # alternatively, the original Kolmogorov-Smirnov test.
    procedures = (
        ('Omnibus', stats.normaltest),
        ('Shapiro-Wilk', stats.shapiro),
        ('Lilliefors', lillifors),
        ('Kolmogorov-Smirnov', plain_ks),
    )
    for label, procedure in procedures:
        _, pVals[label] = procedure(data)
        _, pFewVals[label] = procedure(fewData)

    print('p-values for all {0} data points: ----------------'.format(len(data)))
    print(pVals)
    print('p-values for the first 100 data points: ----------------')
    print(pFewVals)

    if pVals['Omnibus'] > 0.05:
        print('Data are normally distributed')
    # --- >>> STOP stats <<< ---

    return pVals['Kolmogorov-Smirnov']
def test_pos4d_transforms_slit(photons1000, myslit):
    '''Test coordinate transforms on initialization of optical elements.

    The initial 4D transforms should be done to any optical element.
    Here, I pick the entrance aperture for testing, because it places
    the positional vector of the plucker coordinates in a plane, independent
    of the initial values.
    '''
    p = myslit.process_photons(photons1000)

    assert np.allclose(p['pos'][:, 0], 0)
    # Mapped from [-2, 2] onto [0, 1], the two remaining coordinates must
    # pass a KS test against the standard uniform distribution.
    for axis in (1, 2):
        rescaled = (p['pos'][:, axis] + 2) / 4
        pvalue = kstest(rescaled, "uniform")[1]
        assert pvalue > 0.01
def test_create_dataset(self):
    """Generated points have the right shape and uniform coordinates."""
    gen = TSPGenerator(self._num_points)
    data = gen.generate()

    expected_shape = (self._num_points, 2)
    nose.tools.assert_equal(data.shape, expected_shape)

    # Column 0 is the x axis and column 1 the y axis; each is checked
    # against uniform(loc=0, scale=10).
    for axis in (0, 1):
        D, p_value = stats.kstest(data[:, axis], 'uniform', args=(0, 10))
        nose.tools.assert_greater(p_value, 0.05)
def draw_normal(self, xs, bins):
    """Annotate the current plot with sample statistics and a normal curve.

    Prints the mean, standard deviation and normal KS statistic of *xs*,
    writes them (plus the KS statistic against a t distribution with one
    degree of freedom) as text on the current axes, and draws the normal
    density with the sample mean and standard deviation over *bins*.

    :param xs: sample values.
    :param bins: x positions at which the normal density is evaluated.
    """
    mean = np.mean(xs)
    std = np.std(xs)
    # Only the KS statistics are reported; the p-values are not used.
    # NOTE(review): both tests compare against un-fitted reference
    # distributions (standard normal, t with df=1) -- confirm intended.
    norm_d_val = kstest(xs, 'norm')[0]
    t_d_val = kstest(xs, 't', args=(1,))[0]
    print('Mean: {}, Std: {}, KS_D: {}'.format(mean, std, norm_d_val))
    # Backslashes are doubled so the LaTeX commands are not parsed as
    # (invalid) string escapes; the resulting text is unchanged.
    legendtext = '$\\mu$ = {:.4f}\n$\\sigma$ = {:.4f}\nD (Normal) = {:.4f}\nD (T) = {:.4f}'.format(mean, std, norm_d_val, t_d_val)
    plt.text(-15, 0.12, legendtext, fontsize='x-small')
    y = mlab.normpdf(bins, mean, std)
    plt.plot(bins, y, '--', color='grey')
def test_ks(wave, flux, p_optg, p_optl):
    """Return KS p-values of the sorted flux against two fitted profiles.

    Both profiles are evaluated on 1000 evenly spaced points between the
    minimum and maximum of *wave*; the sorted model values are handed to
    the module-level ``cdf`` callable as its extra argument.

    :param wave: wavelength values; only their range is used.
    :param flux: observed values to test.
    :param p_optg: parameters unpacked into ``rectagauss``.
    :param p_optl: parameters unpacked into ``rectalorentz``.
    :return: ``(probg, probl)`` -- p-value for the Gauss model, then for
        the Lorentz model.
    """
    x = np.linspace(np.min(wave), np.max(wave), 1000)

    def model_pvalue(model_values):
        sorted_model = np.sort(model_values)
        sorted_flux = np.sort(flux)
        statistic, probability = kstest(sorted_flux, cdf, args=(sorted_model,))
        return probability

    # Gauss
    probg = model_pvalue(rectagauss(x, *p_optg))
    # Lorentz
    probl = model_pvalue(rectalorentz(x, *p_optl))
    return probg, probl
def test_kstest(self):
    """Every variable's thinned samples must pass a KS test at ``self.alpha``.

    One-dimensional samples are tested directly against their CDF. For
    two-dimensional samples each column is tested against the matching
    entry of the CDF sequence and the p-values are combined with
    ``stats.combine_pvalues``. Other dimensionalities are not supported.
    """
    thin = self.ks_thin
    for varname, cdf in self.cdfs.items():
        samples = self.samples[varname]
        if samples.ndim not in (1, 2):
            raise NotImplementedError()

        if samples.ndim == 1:
            t, p = stats.kstest(samples[::thin], cdf=cdf)
        else:
            pvals = [
                stats.kstest(column[::thin], cdf=column_cdf)[1]
                for column, column_cdf in zip(samples.T, cdf)
            ]
            t, p = stats.combine_pvalues(pvals)
        assert self.alpha < p
def testSamplesAgreeWithCdfForSamplesOverLargeRange(self):
    # Consider the cdf for distribution X, F(x).
    # If U ~ Uniform[0, 1], then Y := F^{-1}(U) is distributed like X since
    # P[Y <= y] = P[F^{-1}(U) <= y] = P[U <= F(y)] = F(y).
    # If F is a bijection, we also have Z = F(X) is Uniform.
    #
    # Make an exponential with large mean (= 100). This ensures we will get
    # quantized values over a large range. This large range allows us to
    # pretend that the cdf F is a bijection, and hence F(X) is uniform.
    # Note that F cannot be a bijection since it is constant between the
    # integers. Hence, F(X) (see below) will not be exactly uniform.
    with self.test_session():
        base = distributions.Exponential(lam=0.01)
        qdist = distributions.QuantizedDistribution(distribution=base)

        # X ~ QuantizedExponential
        x = qdist.sample_n(n=10000, seed=42)
        # Z = F(X), should be Uniform.
        z = qdist.cdf(x)

        # Compare the CDF of Z to that of a Uniform: the KS statistic is the
        # maximum distance between P[Z <= a] and P[U <= a]. The p-value is
        # ignored, since this distribution is not exactly uniform and with
        # so many sample points it would produce a false failure.
        ks_distance = stats.kstest(z.eval(), "uniform")[0]

        # Since the distribution takes values (approximately) in [0, 100],
        # the cdf should have jumps (approximately) every 1/100 of the way
        # up. Assert that the jumps are not more than 2/100.
        self.assertLess(ks_distance, 0.02)
def calc_ks_stats(scores, exp_scores=None):
    """Return the KS statistic and p-value for *scores*.

    :param scores: observed values.
    :param exp_scores: optional reference sample. When it is ``None`` or
        empty, *scores* are compared with the standard uniform
        distribution; otherwise a two-sample KS test is run.
    :return: dict with keys ``'D'`` (statistic) and ``'p_val'``.
    """
    from scipy import stats

    # An explicit None/length check is used because the truth value of a
    # NumPy array with more than one element is ambiguous and raises.
    has_reference = exp_scores is not None and len(exp_scores) > 0
    if has_reference:
        (D, p_val) = stats.ks_2samp(scores, exp_scores)
    else:
        (D, p_val) = stats.kstest(scores, stats.uniform.cdf)
    return {'D':D, 'p_val':p_val}
def test_statistics(self):
    # This is a statistical test and has a non-zero chance of failure during
    # normal operation. Re-run the test to see if the error persists.
    rates = [123.0*Hz, 0.123*kHz]
    durations = [2345*ms, 2.345*second]
    for rate in rates:
        for t_stop in durations:
            spiketrain = stgen.homogeneous_poisson_process(rate, t_stop=t_stop)
            intervals = isi(spiketrain)

            expected_spike_count = int((rate * t_stop).simplified)
            count_error = pdiff(expected_spike_count, spiketrain.size)
            self.assertLess(count_error, 0.2)  # should fail about 1 time in 1000

            expected_mean_isi = (1/rate)
            isi_error = pdiff(expected_mean_isi, intervals.mean())
            self.assertLess(isi_error, 0.2)

            # First and last spikes must lie within 7 mean intervals of the
            # start and end of the train.
            expected_first_spike = 0*ms
            tolerance = 7*expected_mean_isi
            self.assertLess(spiketrain[0] - expected_first_spike, tolerance)

            expected_last_spike = t_stop
            self.assertLess(expected_last_spike - spiketrain[-1], tolerance)

            # Kolmogorov-Smirnov test
            expon_args = (0, expected_mean_isi.rescale(t_stop.units))  # (loc, scale)
            D, p = kstest(intervals.rescale(t_stop.units),
                          "expon",
                          args=expon_args,
                          alternative='two-sided')
            self.assertGreater(p, 0.001)
            self.assertLess(D, 0.12)
def test_permuted_ols_check_h0_noeffect_signswap(random_state=0):
    """The sign-swap permuted null distribution matches the theoretical t law."""
    rng = check_random_state(random_state)
    # design parameters
    n_samples = 100
    # create dummy design with no effect
    target_var = rng.randn(n_samples, 1)
    tested_var = np.ones((n_samples, 1))

    # permuted OLS
    # We check that h0 is close to the theoretical distribution, which is
    # known for this simple design (= t(n_samples - dof)).
    perm_ranges = [10, 100, 1000]  # test various number of permutations
    grid_shape = (len(perm_ranges), -1)

    # Per run we keep the KS p-value and, as a proof of consistency of the
    # permutation algorithm, the Mean Squared Error between the theoretical
    # and empirical cumulative distribution functions.
    all_kstest_pvals = []
    all_mse = []
    for i, n_perm in enumerate(np.repeat(perm_ranges, 10)):
        pval, orig_scores, h0 = permuted_ols(
            tested_var, target_var, model_intercept=False,
            n_perm=n_perm, two_sided_test=False, random_state=i)
        assert_equal(h0.size, n_perm)

        # Kolmogorov-Smirnov test
        all_kstest_pvals.append(stats.kstest(h0, stats.t(n_samples).cdf)[1])

        theoretical = stats.t(n_samples).cdf(np.sort(h0))
        empirical = np.linspace(0, 1, h0.size + 1)[1:]
        all_mse.append(np.mean((theoretical - empirical) ** 2))

    all_kstest_pvals = np.array(all_kstest_pvals).reshape(grid_shape)
    all_mse = np.array(all_mse).reshape(grid_shape)

    # check that a difference between distributions is not rejected by KS test
    assert_array_less(0.01 / (len(perm_ranges) * 10.), all_kstest_pvals)
    # consistency of the algorithm: the more permutations, the less the MSE
    assert_array_less(np.diff(all_mse.mean(1)), 0)
def test_statistics(self):
    # This is a statistical test and has a non-zero chance of failure during
    # normal operation. Re-run the test to see if the error persists.
    a = 3.0
    b_values = (67.0*Hz, 0.067*kHz)
    durations = (2345*ms, 2.345*second)
    for b in b_values:
        for t_stop in durations:
            spiketrain = stgen.homogeneous_gamma_process(a, b, t_stop=t_stop)
            intervals = isi(spiketrain)

            expected_spike_count = int((b/a * t_stop).simplified)
            count_error = pdiff(expected_spike_count, spiketrain.size)
            self.assertLess(count_error, 0.25)  # should fail about 1 time in 1000

            expected_mean_isi = (a/b).rescale(ms)
            isi_error = pdiff(expected_mean_isi, intervals.mean())
            self.assertLess(isi_error, 0.3)

            # First and last spikes must lie within 4 mean intervals of the
            # start and end of the train.
            expected_first_spike = 0*ms
            tolerance = 4*expected_mean_isi
            self.assertLess(spiketrain[0] - expected_first_spike, tolerance)

            expected_last_spike = t_stop
            self.assertLess(expected_last_spike - spiketrain[-1], tolerance)

            # Kolmogorov-Smirnov test
            gamma_args = (a, 0, (1/b).rescale(t_stop.units))  # (a, loc, scale)
            D, p = kstest(intervals.rescale(t_stop.units),
                          "gamma",
                          args=gamma_args,
                          alternative='two-sided')
            self.assertGreater(p, 0.001)
            self.assertLess(D, 0.25)
def makeKS(request, genotype_ids, modelName):
    """Run a normality KS test on sequence lengths for every model name.

    For each name returned by ``getModelNames()`` the related
    ``<name>_set`` of the first genotype in *genotype_ids* is read, the
    ``seqLen`` values are standardised (mean 0, std 1) and compared with
    the standard normal distribution.

    :param request: accepted but not used here.
    :param genotype_ids: sequence of primary keys; only the first is used.
    :param modelName: echoed back in the result.
    :return: ``{'result': {model: {"kstest": result, "hypothesis": bool}},
        'modelName': modelName}`` where ``hypothesis`` is True when the
        p-value is at least 0.05.
    """
    allModelNames = getModelNames()
    data = {}
    criticalValue = 0.05
    for model in allModelNames:
        genotype = get_object_or_404(Genotype, pk=genotype_ids[0])
        # Attribute lookup instead of exec(): exec cannot bind a function
        # local on Python 3, and it would run arbitrary text as code.
        seqSet = getattr(genotype, "%s_set" % model).all()
        allSeqLens = np.array([innerItem.seqLen for innerItem in seqSet])

        mu = np.mean(allSeqLens)
        sigma = np.std(allSeqLens)
        normed_allSeqLens = (allSeqLens - mu)/sigma

        result = stats.kstest(normed_allSeqLens, 'norm')
        # bool() keeps the value a plain Python bool, as before.
        hyp = bool(result[1] >= criticalValue)
        data[model] = {"kstest":result, "hypothesis":hyp}
    return {'result': data, 'modelName':modelName}
def getbcdf_pval_swc(self, swc):
    """
    get p-value of beta CDF with candidate sigma weight

    Every item of ``self._data`` is passed through ``self.kernelizeisw``,
    the results are sorted and rescaled with ``featureScaling``, beta
    parameters are derived from their mean and sample variance, and the
    rescaled values are KS-tested against the precomputed beta CDF values.

    parameters
    ----------
    swc: candidate sigma weights (iterable)

    returns
    -------
    ``0`` when the sample variance is NaN or zero; otherwise the result of
    ``p3c(pval, d, Y, list(swc), ba, bb, 0, bcdf)``.
    """
    kernelized = sorted(self.kernelizeisw(x, swc) for x in self._data)
    Y = featureScaling(kernelized)

    y_m = np.mean(Y)
    y_v = np.var(Y, ddof = 1)
    degenerate = math.isnan(y_v) or y_v == 0
    if degenerate:
        return 0

    # Beta parameters derived from the sample mean and variance.
    ba = y_m ** 2 * ((1 - y_m) / y_v - 1 / y_m)
    bb = ba * (1 - y_m) / y_m
    bcdf = beta.cdf(Y, ba, bb)

    def precomputed_cdf(cdf):
        # The argument is ignored; the values computed above are returned.
        return bcdf

    d, pval = scistats.kstest(Y, precomputed_cdf)
    params = p3c(pval, d, Y, list(swc), ba, bb, 0, bcdf)
    return params
commandline="\ %s --detectors %s --par-file %s --input-files %s --outfile %s --prior-file %s --Nlive %s --Nmcmcinitial %s --sampleprior %s" \ % (execu, dets, parf, datafile, outfile, priorf, Nlive, Nmcmcinitial, priorsamples) sp.check_call(commandline, shell=True) # read in prior samples f = h5py.File(outfile, 'r') a = f['lalinference'] h0samps = a['lalinference_nest']['nested_samples']['H0'][:] # get normed histogram of samples [n, nedges] = np.histogram(h0samps, bins=20, range=(0., h0ul), density=True) nc = np.cumsum(n)*(nedges[1]-nedges[0]) stat, p = ss.kstest(nc, 'uniform') print "K-S test p-value for upper range of %e = %f" % (h0ul, p) if p < 0.005: print "There might be a problem for this prior distribution" import matplotlib.pyplot as pl fig, ax = pl.subplots(1, 1) ax.hist(h0samps, bins=20, normed=True, cumulative=True, histtype='stepfilled', alpha=0.2) ax.plot([0., h0ul], [0., 1], 'k--') ax.set_xlim((0., h0ul)) ax.set_ylim((0., 1.)) ax.set_xlabel('h_0') ax.set_ylabel('Cumulative probability') pl.show() break
x = np.linspace(mu - 4 * sigma, mu + 4 * sigma) plt.plot(x, stats.norm.cdf(x, mu, sigma), "b-") alpha = 0.01 n = 10000 mu = 25 sigma = 2 plus_deviations = [] minus_deviations = [] # Genero n números aleatorios siguiendo una distribucion N(mu, sigma) numbers = generate_by_acceptance_rejection(n, mu, sigma) # Efectua el test Komogorov-Smirnov y obtiene el p-value statistic, pvalue = stats.kstest(numbers, stats.norm(loc=mu, scale=sigma).cdf) print("Nivel de Significacion: {:.2f} ".format(alpha)) print("p-valor: {:.2f} ".format(pvalue)) # Comparo el p-value con el nivel de significancia deseado, y si es mayor, entonces # no hay evidencia para rechazar la hipotesis nula, y se acepta if alpha <= pvalue: print("El test acepta la hipotesis nula.") else: print("El test rechaza la hipótesis nula") # Alternativa: Ordena las muestras de manera ascendente y obtiene el estadistico # y lo compara con el valor limite obtenido de la tabla de Kolgomorov-Smirnov, # Si el estadistico es <= valor limite, aceptamos la hipotesis '''
def norm_ks(rvs):
    """Return the one-sample KS test of *rvs* against the standard normal."""
    outcome = stats.kstest(rvs, cdf='norm')
    return outcome
def check_distribution(kin, temp, ndof, kb=8.314e-3, verbosity=2,
                       screen=False, filename=None,
                       ene_unit=None, temp_unit=None):
    r"""
    Checks if a kinetic energy trajectory is Maxwell-Boltzmann distributed.

    .. warning: This is a low-level function. Additionally to being less
       user-friendly, there is a higher probability of erroneous and / or
       badly documented behavior due to unexpected inputs. Consider using
       the high-level version based on the SimulationData object. See
       physical_validation.kinetic_energy.check_mb_ensemble for more
       information and full documentation.

    Parameters
    ----------
    kin : array-like
        Kinetic energy snapshots of the system.
    temp : float
        Target temperature of the system. Used to construct the
        Maxwell-Boltzmann distribution.
    ndof : float
        Number of degrees of freedom in the system. Used to construct the
        Maxwell-Boltzmann distribution.
    kb : float
        Boltzmann constant :math:`k_B`. Default: 8.314e-3 (kJ/mol).
    verbosity : int
        0: Silent.
        1: Print minimal information.
        2: Print result details.
        3: Print additional information.
        Default: 2.
    screen : bool
        Plot distributions on screen. Default: False.
    filename : string
        Plot distributions to `filename`.pdf. Default: None.
    ene_unit : string
        Energy unit - used for output only.
    temp_unit : string
        Temperature unit - used for output only.

    Returns
    -------
    result : float
        The p value of the test. NaN when `ndof` is not positive.

    See Also
    --------
    physical_validation.kinetic_energy.distribution : High-level version
    """

    # Discard burn-in period and time-correlated frames
    kin = trajectory.prepare(kin, verbosity=verbosity, name='Kinetic energy')

    kt = kb * temp

    if ndof <= 0:
        warnings.warn('Zero degrees of freedom!')
        # The builtin float is used: the ``np.float`` alias no longer
        # exists in current NumPy.
        p = float('nan')
    else:
        # The reference is a gamma distribution with shape ndof/2, loc 0
        # and scale kT.
        d, p = stats.kstest(kin, 'gamma', (ndof/2, 0, kt))

    # ====================== #
    # Plot to screen or file #
    # ====================== #
    do_plot = screen or filename is not None
    if do_plot:
        ana_dist = stats.gamma(ndof/2, scale=kt)
        ana_kin = np.linspace(ana_dist.ppf(0.0001),
                              ana_dist.ppf(0.9999), 200)
        ana_hist = ana_dist.pdf(ana_kin)

        tunit = ''
        if temp_unit is not None:
            tunit = temp_unit

        data = [{'y': kin,
                 'hist': int(len(kin)/150),
                 'args': dict(label='Trajectory', density=True, alpha=0.5)}]
        # The analytical curve is only meaningful for a positive ndof.
        if ndof > 0:
            data.append(
                {'x': ana_kin,
                 'y': ana_hist,
                 'args': dict(label='Analytical T=' + str(temp) + tunit, lw=5)})

        unit = ''
        if ene_unit is not None:
            unit = ' [' + ene_unit + ']'

        plot.plot(data,
                  legend='lower left',
                  title='Kinetic energy distribution',
                  xlabel='Kinetic energy' + unit,
                  ylabel='Probability [%]',
                  sci_x=True,
                  percent=True,
                  filename=filename,
                  screen=screen)

    if verbosity > 0:
        if verbosity > 1:
            message = ('Kinetic energy distribution check (strict)\n'
                       'Kolmogorov-Smirnov test result: p = {:g}\n'
                       'Null hypothesis: Kinetic energy is Maxwell-Boltzmann distributed'.format(p))
        else:
            message = 'p = {:g}'.format(p)
        print(message)

    return p
max_lik_em[j] = l[-1] alpha_em[j] = m.alpha beta_em[j] = m.beta mu_em[j] = m.mu mu_prime_em[j] = m.mu_prime phi_em[j] = m.phi phi_prime_em[j] = m.phi_prime ## Calculate p-values m = meg.meg_model(G[j], tau_zero=True, full_links=True, verbose=False, discrete=False, force_square=True, evaluate_directed=False) m.specification(main_effects=True, interactions=False, poisson_me=False, poisson_int=False, hawkes_me=True, hawkes_int=True, D=1, verbose=False) ## Parameter values m.alpha = alpha_em[j]; m.beta = beta_em[j]; m.mu = mu_em[j]; m.mu_prime = mu_prime_em[j]; m.phi = phi_em[j]; m.phi_prime = phi_prime_em[j] ## P-value calculations m.pvalues() pp = [p for x in m.pvals_train.values() for p in list(x)] ks_score_em += [stats.kstest(pp, 'uniform')[0]] ks_pval_em += [stats.kstest(pp, 'uniform')[1]] ## Repeat for Adam (gradient ascent) max_lik_ga[j] = -1e100 for s in range(nrep): ## Set seed for *same* initialisation np.random.seed(seeds[s]) ## Initialise the parameter values m.alpha = np.random.uniform(low=0, high=1, size=m.n) m.beta = np.random.uniform(low=0, high=1, size=m.n) m.mu = np.random.uniform(low=0.1, high=1, size=m.n) m.mu_prime = np.random.uniform(low=0.1, high=1, size=m.n) m.phi = np.random.uniform(low=0.1, high=1, size=m.n) m.phi_prime = np.random.uniform(low=0.1, high=1, size=m.n) ## Optimise using EM l = m.optimise_meg(prior_penalisation=False, learning_rate=5e-2, method='adam', max_iter=250, verbose=False, tolerance=1e-6, iter_print=False)
# Quantise each direction vector into one of NDIRS direction bins.
angle = np.round(np.arctan2(ndjj[:, 1], ndjj[:, 0]) / np.pi * NDIRS).astype(int) % NDIRS

# 4. find which direction histogram "is" uniform
RULE = 'dmax'
min_dir = 0
min_g = np.inf
for dir in range(NDIRS):
    # Lengths of the vectors that fall into this direction bin.
    udjj = np.sqrt(np.sum(djj[angle == dir]**2, 1))
    if RULE == 'gini':
        g = gini(udjj)
    elif RULE == 'ks':
        H = np.histogram(udjj, NBINS, (0, D))[0]
        H = H / H.sum()
        g = kstest(H, 'uniform')[0]
    elif RULE == 'dmax':
        g = dmax(udjj)
    print(dir, g)
    # Keep the direction with the smallest score seen so far.
    if not g < min_g:
        continue
    min_g = g
    min_dir = dir

sides = ['-', '/', '|', '\\'] if NDIRS == 4 else [str(i) for i in range(NDIRS)]
print('uniform side: %s' % sides[min_dir])

plt.scatter(X[:, 0], X[:, 1], 1, 'black')
import pandas
from scipy import stats

# Load the experiment table and print its summary statistics, followed by a
# blank line.
df = pandas.read_csv('experiment.csv', sep=',')
print(df.describe(), end='\n\n')

# Passing the distribution name as the first argument makes kstest draw N
# random variates from it, so these two calls test standard-normal samples
# against the standard normal at two sample sizes.
for sample_size in (3, 500):
    print(stats.kstest('norm', 'norm', N=sample_size))

# KS test of the loaded data itself against the standard normal.
print(stats.kstest(df, 'norm'), end='\n\n')
# b. covariance and correlation of the factor data
daily_cov = factor.cov()
print('daily covariance matrix of factor data: ')
print(daily_cov)
daily_corr = factor.corr()
print('daily correlation matrix of factor data: ')
# Round to two decimals; the closing parenthesis was previously misplaced so
# that `2` was printed as a second argument instead of being passed to round().
print(round(daily_corr, 2))

# c. rolling coefficient with a window argument of 90
# NOTE(review): this rebinds the name `rolling_coef` from the function to its
# result, so the function cannot be called again after this line.
rolling_coef = rolling_coef(factor, 90)
print(rolling_coef)

# d. distribution test: KS p-value of every factor column against 'norm'
test_result = {}
for c in factor.columns:
    test_stat = kstest(factor[c], 'norm')
    test_result[c] = test_stat.pvalue
print('The result of KS test')
# Report the collected per-column p-values (not just the last loop result).
print(round(pd.DataFrame.from_dict(test_result, orient='index',
                                   columns=['pvalue']), 2))

# e. Beta
ticker_list = [
    'SPY', 'XLB', 'XLE', 'XLF', 'XLI', 'XLK', 'XLP', 'XLU', 'XLV', 'XLY'
]
# Space-terminated tickers joined into one string (trailing space kept).
tickers = ''.join(ticker + ' ' for ticker in ticker_list)
ETFs = ETF(tickers)
price_data = ETFs.get_price_data("2010-01-01", "2019-08-01")
ETF_return = ETFs.cal_return_data()
beta = ETF_return.apply(lambda x: LinReg(factor, x, return_value='beta'))
# Significance cut-offs, strictest first: the first cut-off the p-value falls
# under supplies the marker; no match leaves the marker empty.
_STAR_CUTOFFS = ((0.005, '***'), (0.01, '**'), (0.05, '*'))

# Shapiro test for every value column.
for col_name in value_cols:
    w, pvalue = stats.shapiro(df[col_name])
    label = next((stars for cutoff, stars in _STAR_CUTOFFS if pvalue < cutoff), '')
    print(f" {label}属性 {col_name} 统计量={w} p值={pvalue}")

# Kolmogorov-Smirnov test
# Perform the Kolmogorov-Smirnov test for goodness of fit.
#
# This performs a test of the distribution F(x) of an observed random variable
# against a given distribution G(x). Under the null hypothesis the two
# distributions are identical, F(x)=G(x). The alternative hypothesis can be
# either 'two-sided' (default), 'less' or 'greater'. The KS test is only valid
# for continuous distributions.
print("== Kolmogorov-Smirnov检验 ==")
for col_name in value_cols:
    # Each column is compared with 'norm' using its default parameters.
    D, pvalue = stats.kstest(df[col_name], 'norm')
    label = next((stars for cutoff, stars in _STAR_CUTOFFS if pvalue < cutoff), '')
    print(f" {label}属性 {col_name} 统计量={D} p值={pvalue}")
def stddev(sample, mean): diffs = [] for num in sample: diffs.append((num - mean)**2) variance = sum(diffs) / len(diffs) return variance**0.5 relative_std = stddev(relative_delays, relative_mean) print("calculated delayed mean is %s" % (relative_mean)) print("calculated delayed standard deviation is %s" % (relative_std)) _, p = st.kstest(relative_delays, 'norm', (relative_mean, relative_std)) print( "calculated p-value from normal distribution with calculated delayed mean and standard deviation is %s (no rejection)" % (p)) xs = np.arange(-0.2, 0.8, 0.01) ys = st.norm.pdf(xs, loc=relative_mean, scale=relative_std) plt.hist(relative_delays, 20, density=True, label="percentage of trains delayed per station") plt.plot( xs, ys, label=
vals = [normal_vals, dual_vals] pdf = [normal_pdf, dual_pdf] xlims = [(-4, 4), (-4, 10)] #------------------------------------------------------------ # Compute the statistics and plot the results fig = plt.figure(figsize=(5, 7)) fig.subplots_adjust(left=0.13, right=0.95, bottom=0.06, top=0.95, hspace=0.1) for i in range(2): ax = fig.add_subplot(2, 1, 1 + i) # 2 x 1 subplot # compute some statistics A2, sig, crit = stats.anderson(vals[i]) D, pD = stats.kstest(vals[i], "norm") W, pW = stats.shapiro(vals[i]) mu, sigma = mean_sigma(vals[i], ddof=1) median, sigmaG = median_sigmaG(vals[i]) N = len(vals[i]) Z1 = 1.3 * abs(mu - median) / sigma * np.sqrt(N) Z2 = 1.1 * abs(sigma / sigmaG - 1) * np.sqrt(N) print(70 * '_') print(" Kolmogorov-Smirnov test: D = %.2g p = %.2g" % (D, pD)) print(" Anderson-Darling test: A^2 = %.2g" % A2) print(" significance | critical value ") print(" --------------|----------------") for j in range(len(sig)):
sim.run(10) comp_var = np.asarray(proj.getWeights(format='array')) shape = np.copy(comp_var.shape) connected = np.where(~np.isnan(comp_var)) comp_var = comp_var[connected] num_active = comp_var.size sim.end() assert num_active == n_neurons assert np.all(connected[0] == connected[1]) from scipy import stats scale = dist_params['high'] - dist_params['low'] s, p = stats.kstest((comp_var - dist_params['low']) / scale, 'uniform') assert p > 0.05 v_min = comp_var.min() v_max = comp_var.max() v_avg = comp_var.mean() print(f"Stats for sampled {var} = {v_min}, {v_avg}, {v_max}") half_range = dist_params['low'] + \ (dist_params['high'] - dist_params['low']) / 2. print(f"Stats for ideal {var} = {dist_params['low']}, " f"{half_range}, {dist_params['high']}") epsilon = 0.1 assert np.abs(v_min - dist_params['low']) < epsilon assert np.abs(v_max - dist_params['high']) < epsilon
def test_beta_ellipticity():
    """Check beta_ellipticity against an equivalent scipy beta distribution.

    Random shape parameters ``a, b`` are drawn, ``beta_ellipticity`` is built
    with ``(a / (a + b), a + b)`` and every distribution function is compared
    with ``stats.beta(a, b)``. Two special cases are also sampled and compared
    with the uniform and arcsine distributions through a KS test.
    """
    from skypy.galaxy.ellipticity import beta_ellipticity

    name = 'beta_ellipticity'

    # Randomised ellipticity distribution, its beta counterpart, and the two
    # special cases (uniform and arcsine equivalents).
    a, b = np.random.lognormal(size=2)
    args = (a / (a + b), a + b)
    beta_dist = stats.beta(a, b)
    ellipticity_dist = beta_ellipticity(*args)
    ellipticity_uniform = beta_ellipticity(0.5, 2.0)
    ellipticity_arcsine = beta_ellipticity(0.5, 1.0)

    # Input values spanning the support of the distributions.
    x = np.linspace(0, 1, 100)

    # Basic properties of the distribution implementation.
    check_normalization(beta_ellipticity, args, name)
    check_edge_support(beta_ellipticity, args)
    check_random_state_property(beta_ellipticity, args)
    check_pickling(beta_ellipticity, args)

    # Distribution moments.
    m, v, s, k = ellipticity_dist.stats(moments='mvsk')
    check_mean_expect(beta_ellipticity, args, m, name)
    check_var_expect(beta_ellipticity, args, m, v, name)
    check_skew_expect(beta_ellipticity, args, m, v, s, name)
    check_kurt_expect(beta_ellipticity, args, m, v, k, name)
    check_moment(beta_ellipticity, args, m, v, name)

    # Array-valued distribution functions must agree with the beta reference.
    for method in ('pdf', 'logpdf', 'cdf', 'logcdf', 'ppf', 'sf', 'logsf', 'isf'):
        assert allclose(getattr(ellipticity_dist, method)(x),
                        getattr(beta_dist, method)(x))

    # Scalar summaries must agree as well.
    for method in ('entropy', 'median', 'std'):
        assert isclose(getattr(ellipticity_dist, method)(),
                       getattr(beta_dist, method)())
    assert allclose(ellipticity_dist.interval(x), beta_dist.interval(x))

    # Scalar output.
    assert np.isscalar(ellipticity_dist.rvs())

    # Array output.
    assert ellipticity_dist.rvs(size=10).shape == (10,)

    # Broadcast output.
    e_ratio = 0.5 * np.ones((13, 1, 5))
    e_sum = 0.5 * np.ones((7, 5))
    rvs = beta_ellipticity.rvs(e_ratio, e_sum)
    assert rvs.shape == np.broadcast(e_ratio, e_sum).shape

    # Kolmogorov-Smirnov tests: the general case against the beta cdf, then
    # the special cases against the uniform and arcsine distributions.
    ks_cases = (
        (ellipticity_dist, beta_dist.cdf),
        (ellipticity_uniform, 'uniform'),
        (ellipticity_arcsine, 'arcsine'),
    )
    for sampled_dist, reference in ks_cases:
        D, p = stats.kstest(sampled_dist.rvs, reference, N=1000)
        assert p > 0.01, 'D = {}, p = {}'.format(D, p)
# First component of every prediction returned by the model for the test set.
predictions = [i[0] for i in model.predict(tf.compat.v1.data.make_one_shot_iterator(dataset_test))]
# The error metrics compare against y_test truncated to the number of
# predictions.
y_scored = y_test[:len(predictions)]

test_mae = mean_absolute_error(y_scored, predictions)
print("Test data MAE: ", test_mae, "\n")

# RMSE is the square root of the plain MSE. Previously the value passed to
# np.sqrt had already been rooted via squared=False, so the printed "RMSE"
# was the fourth root of the MSE.
test_mse = mean_squared_error(y_scored, predictions)
test_rmse = np.sqrt(test_mse)
print("Test data RMSE: ", test_rmse, "\n")

test_r2 = r2_score(y_scored, predictions)
print("Test data R2: ", test_r2, "\n")

# KS test of actual and predicted values against 'norm'.
kstest_results_actual_values = stats.kstest(y_test, 'norm')
print("kstest_results_actual_values", kstest_results_actual_values)
kstest_results_predicted_values = stats.kstest(predictions, 'norm')
print("kstest_results_predicted_values", kstest_results_predicted_values)

P_VALUE = 0.05
# Use rank-based tests when EITHER sample fails the normality check (the
# original condition tested the predicted-values p-value twice).
if (kstest_results_actual_values.pvalue <= P_VALUE
        or kstest_results_predicted_values.pvalue <= P_VALUE):
    print(stats.kruskal(y_test, predictions))
    print(stats.mannwhitneyu(y_test, predictions))
else:
    # Levene's test decides whether the t-test may assume equal variances.
    levene_results = stats.levene(y_test, predictions)
    print("levene_results", levene_results)
    print(stats.ttest_ind(y_test, predictions, equal_var=levene_results.pvalue > P_VALUE))

plt.plot(history.history['loss'])
def distribution_compared_group(data, sample_size=10000, reports_path='.'):
    '''
    Compare the group-level degree distribution with eight reference
    distributions: Uniform, Normal, Gamma, Exponential, Poisson, Triangular,
    LogNormal and Weibull.

    The mean of every entry of ``data['Degree_dist']`` is computed and the
    means are centred on their overall mean. A random sample of
    ``sample_size`` values is drawn from each reference distribution and
    compared with the centred means through ``stats.kstest`` (passing a sample
    as the second argument). For every reference distribution a histogram of
    the reference sample overlaid with the centred means is saved as a PNG in
    ``reports_path``, and all results are written to
    ``ks_test_results_group.txt`` in the same directory.

    params:
        data = mapping/frame whose 'Degree_dist' entry holds the degree
               distributions
        sample_size = number of random values to generate per reference
                      distribution -- default 10000
        reports_path = directory receiving the plots and the text report

    returns:
        list of ``[label, kstest_result]`` pairs, one per reference
        distribution
    '''
    from scipy import stats
    from scipy.stats import uniform, norm, gamma, expon, poisson, triang, lognorm
    import numpy as np
    import os
    import statistics

    #####################################
    # Group level comparison
    group_deg_mean = [statistics.mean(jj) for jj in data['Degree_dist']]
    group_deg_array = np.array(group_deg_mean)
    group_centered_deg_mean = group_deg_array - np.mean(group_deg_array)

    # Reference samples, drawn in a fixed order.
    # scipy's uniform covers [loc, loc + scale], so the scale must be the data
    # range (max - min) for the sample to span [min, max].
    data_min = np.min(group_centered_deg_mean)
    data_max = np.max(group_centered_deg_mean)
    data_uniform = uniform.rvs(size=sample_size, loc=data_min,
                               scale=data_max - data_min)
    data_normal = norm.rvs(size=sample_size,
                           loc=np.median(group_centered_deg_mean), scale=1)
    data_gamma = gamma.rvs(a=5, size=sample_size)
    data_expon = expon.rvs(scale=1, loc=np.median(group_centered_deg_mean),
                           size=sample_size)
    mu = 3
    data_poisson = poisson.rvs(mu=mu, size=sample_size)
    c = 0.158
    data_triang = triang.rvs(c=c, size=sample_size)
    s = 0.954
    data_lognorm = lognorm.rvs(s=s, size=sample_size)
    a = 5.0
    data_weibull = np.random.weibull(a=a, size=sample_size)

    Distributions = [
        data_uniform, data_normal, data_gamma, data_expon, data_poisson,
        data_triang, data_lognorm, data_weibull
    ]
    # Names used in plot titles and file names.
    dist_names = [
        'Uniform', 'Normal', 'Gamma', 'Exponential', 'Poisson', 'Triangular',
        'LogNormal', 'Weibull'
    ]
    # Labels stored with each KS result (kept exactly as previously written
    # to the report).
    result_labels = [
        'Uniform:', 'Normal', 'Gamma', 'Exponential', 'Poisson', 'Triang',
        'Lognormal', 'Weibull'
    ]

    ks_test_results_group = [
        [label, stats.kstest(group_centered_deg_mean, sample)]
        for label, sample in zip(result_labels, Distributions)
    ]

    # One overlay plot per reference distribution.
    for ii, jj in zip(Distributions, dist_names):
        sns.set_context('talk')
        ax = sns.histplot(ii, bins=50, kde=True, color='skyblue')
        ax.set(title=jj + ' Sample Degree Distribution', ylabel='Frequency')
        sns.histplot(group_centered_deg_mean, bins=50, kde=True,
                     color='forestgreen')
        plt.savefig(os.path.join(reports_path,
                                 jj + ' Sample Degree Distribution.png'))
        plt.clf()

    report_file = os.path.join(reports_path, 'ks_test_results_group.txt')
    with open(report_file, 'w') as filehandle:
        for listitem in ks_test_results_group:
            filehandle.write('%s\n' % listitem)

    return ks_test_results_group
# Fix the global NumPy seed before any sample is drawn.
np.random.seed(seed=2015)

beta = stats.beta(a=4, b=2)
print("method 2:")
print(beta.rvs(size=10))

# Draw n values from N(0.5, 2) and summarise them.
norm_dist = stats.norm(loc=0.5, scale=2)
n = 200
dat = norm_dist.rvs(size=n)
print("mean of data is: " + str(np.mean(dat)))
print("median of data is: " + str(np.median(dat)))
print("standard deviation of data is: " + str(np.std(dat)))

# KS test against a normal whose parameters are estimated from the sample.
mu = np.mean(dat)
sigma = np.std(dat)
stat_val, p_val = stats.kstest(dat, 'norm', (mu, sigma))
print('KS-statistic D = %6.3f p-value = %6.4f' % (stat_val, p_val))

# One-sample t-test of the mean against 0.
stat_val, p_val = stats.ttest_1samp(dat, 0)
print('One-sample t-statistic D = %6.3f, p-value = %6.4f' % (stat_val, p_val))

# Two-sample t-test without assuming equal variances.
norm_dist2 = stats.norm(loc=-0.2, scale=1.2)
dat2 = norm_dist2.rvs(size=50)
stat_val, p_val = stats.ttest_ind(dat, dat2, equal_var=False)
print('Two-sample t-statistic D = %6.3f, p-value = %6.4f' % (stat_val, p_val))

g_dist = stats.gamma(a=2)
print("quantiles of 2, 4 and 5:")
print(g_dist.cdf([2, 4, 5]))
# The values belonging to given probability levels come from the quantile
# function (ppf); the pdf evaluated at 0.25/0.5/0.95 is a density, not a
# quantile. The label now states the levels that are actually used.
quantile_levels = [0.25, 0.5, 0.95]
quantile_values = g_dist.ppf(quantile_levels)
print("Values of 25%, 50% and 95%:")
print(quantile_values)

print(stats.norm.moment(6, loc=0, scale=1))
# Keep only the values between 0 and 150 (inclusive).
time_deltas = time_deltas[(0 <= time_deltas) & (time_deltas <= 150)]

# Histogram with a fitted gamma density.
fig = sns.distplot(time_deltas, kde=False, fit=stats.gamma)
plt.title('Time Elapsed Between Incident and Claim')
plt.xlabel('Days')
plt.ylabel('Density')
(a, loc, scale) = stats.gamma.fit(time_deltas)
plt.legend([
    'Gamma dist. fit (a={0:.2f}, loc={1:.2f}, scale={2:.2f})'.format(
        a, loc, scale), 'Time Elapsed'
])
plt.savefig('../plots/wait_time.png', dpi=400, bbox_inches='tight')
plt.close()

# Goodness of fit of the fitted gamma distribution.
(D, p) = stats.kstest(time_deltas, 'gamma', args=(a, loc, scale))

payments = data[['Claim Number', 'Claim Amount', 'Close Amount',
                 'Status']].copy()
payments['Fraction'] = data['Close Amount'] / data['Claim Amount']

# Collapse the status text into a few categories. The patterns are regular
# expressions, so regex=True is passed explicitly (pandas >= 2.0 treats the
# pattern as a literal string by default) and raw strings keep `\d`/`\D`
# from being read as string escapes.
status_rules = (
    (r'^Closed[\d\D]*', 'Closed'),
    (r'^Insufficient[\d\D]*$', 'Insufficient'),
    (r'[\d\D]*assigned[\d\D]*', 'Assigned'),
    (r'^Pending[\d\D]*$', 'Pending'),
)
for pattern, category in status_rules:
    payments['Status'] = payments['Status'].str.replace(pattern, category,
                                                        regex=True)

fig = sns.countplot(y='Status', data=payments)
plt.title('Status of Claims')
plt.xlabel('Count')
# Drop the last column (it is all zeros).
wave_coef_matrix = np.delete(wave_coef_matrix, -1, axis=1)

## Split rows into training and testing parts; selection uses training only.
wave_coef_matrix_training = wave_coef_matrix[training_index, :]
wave_coef_matrix_testig = wave_coef_matrix[testing_index, :]

## Score every column of the training matrix with a Kolmogorov-Smirnov (KS)
## test against a normal distribution parameterised by the column's own mean
## and standard deviation.
# http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.653.6292&rep=rep1&type=pdf
# https://stackoverflow.com/questions/17901112/using-scipys-stats-kstest-module-for-goodness-of-fit-testing
# Only the KS statistic (element 0 of the result; element 1 is the p-value)
# is kept for each column.
all_feature_test_result = np.asarray([
    stats.kstest(column, 'norm', args=(np.mean(column), np.std(column)))[0]
    for column in wave_coef_matrix_training.T.astype(float)
])

## Select the feature_num columns with the SMALLEST KS statistic.
# NOTE(review): earlier comments spoke of "10 columns least likely to be
# random" and "the n largest p value column"; the code keeps 25 columns with
# the smallest statistic -- confirm this is the intended selection.
# selected_feature_index = all_feature_test_result.argsort()[-10:][::1]
feature_num = 25
selected_feature_index = all_feature_test_result.argsort()[:feature_num]

## do PCA analysis after wavelet analysis
## initial 8 PC
## plot top 2 PCA component in 2d graph
#pca_a = PCA(n_components=8)
#pca_a.fit(wave_coef_matrix)
#eigen_a=pca_a.explained_variance_ratio_
print(std_steps_regular) print(std_level_zchaff_irregular) print(std_level_zchaff_regular) print(std_num_decisions_zchaff_irregular) print(std_num_decisions_zchaff_regular) print(std_avg_length_trials_walksat_irregular) print(std_avg_length_trials_walksat_regular) #NORMALITY TESTS KOLMOGOROV SMIRNOV print( stats.kstest( (pycosat_irregular_metrics[6] - np.mean(pycosat_irregular_metrics[6])) / np.std(pycosat_irregular_metrics[6]), cdf='norm', N=15000)) print( stats.kstest( (pycosat_regular_metrics[6] - np.mean(pycosat_regular_metrics[6])) / np.std(pycosat_regular_metrics[6]), cdf='norm', N=15000)) print( stats.kstest( (pycosat_irregular_metrics[1] - np.mean(pycosat_irregular_metrics[1])) / np.std(pycosat_irregular_metrics[1]), cdf='norm',
float( re.sub( ',', '.', re.sub('[^\w.,]', '', final_df['div_per_100_marriges_'].loc[ix]))) if len(re.findall('\w', str(final_df['div_per_100_marriges_'].loc[ix]))) > 0 else 0 for ix in final_df.index ] final_df['div_per_100_marriges_'] = temp #imputing other variables: for var in final_df: n = len(final_df[final_df[var] == 0]) if n != 0: if kstest(final_df[var].values, 'norm', N=len(final_df))[1] > 0.05: change = [np.mean(final_df[var])] * n final_df[var][final_df[var] == 0] = change else: change = [np.median(final_df[var])] * n final_df[var][final_df[var] == 0] = change min_year = final_df.groupby('year')['All ages (%)'].mean().idxmin() max_year = final_df.groupby('year')['All ages (%)'].mean().idxmax() max_ = final_df[final_df.year == max_year] min_ = final_df[final_df.year == min_year] overall_corr = final_df.drop(columns=['year', 'country']).corr() min_corr = min_.drop(columns=['year', 'country']).corr()
axis=1).tolist()) list_df_pp.append( pd.Series([pair in set_pp_cell for pair in list_pp], index=list_pp)) set_po_cell = set( df_po_cell.apply(lambda x: (x['region1'], x['region2']), axis=1).tolist()) list_df_po.append( pd.Series([pair in set_po_cell for pair in list_po], index=list_po)) df_pp = pd.concat(list_df_pp, axis=1) df_po = pd.concat(list_df_po, axis=1) df_cell = pd.concat([df_pp, df_po], axis=0) df_cell.columns = list_cell # ks test df_pval = pd.DataFrame(np.full(shape=(len(list_cell), len(celltypes)), fill_value=1), index=list_cell, columns=celltypes) for cell in list_cell: cell_pair = df_cell.index[df_cell[cell]] for celltype in celltypes: all_score = dict_cell_scores[celltype] cell_score = all_score[cell_pair] df_pval.loc[cell, celltype] = \ kstest(np.array(cell_score), np.array(all_score), alternative='less')[1] file_pvals = os.path.join(path_cell_interatome, 'interactome_pvals.txt') df_pval.to_csv(file_pvals, sep='\t')
def unit_stability(units_b, units=None, feat_names=('amps',), dist='norm', test='ks'):
    '''
    Computes the probability that the empirical spike feature distribution(s), for specified
    feature(s), for all units, comes from a specific theoretical distribution, based on a
    specified statistical test. Also computes the coefficients of variation of the spike
    feature(s) for all units.

    Parameters
    ----------
    units_b : bunch
        A units bunch containing fields with spike information (e.g. cluster IDs, times,
        features, etc.) for all units. It is read only; it is not modified.
    units : array-like (optional)
        A subset of all units for which to compute the metrics. (If `None`, all units are
        used)
    feat_names : sequence of strings (optional)
        Names of spike features found in `units_b` to use for calculating unit stability.
    dist : string (optional)
        The type of hypothetical null distribution for which the empirical spike feature
        distributions are presumed to belong to.
    test : string (optional)
        The statistical test used to compute the probability that the empirical spike feature
        distributions come from `dist`. Only 'ks' is available.

    Returns
    -------
    p_vals_b : bunch
        A bunch with `feat_names` as keys, containing the p-values (the probabilities that the
        empirical spike feature distribution for each unit comes from `dist` based on `test`)
        for each unit for all `feat_names`.
    cv_b : bunch
        A bunch with `feat_names` as keys, containing the coefficient of variation (standard
        deviation divided by mean) of each unit's empirical spike feature distribution for
        all features.

    Raises
    ------
    KeyError
        If `test` is not a known test name.

    See Also
    --------
    plot.feat_vars

    Examples
    --------
    1) Compute 1) the p-values obtained from running a one-sample ks test on the spike
    amplitudes for each unit, and 2) the coefficients of variation of the empirical spike
    amplitudes distribution for each unit.
        >>> p_vals_b, cv_b = bb.metrics.unit_stability(units_b)
        # Collect the coefficients of variation of `amps` for all units
        >>> cv_vals = np.array(tuple(cv_b['amps'].values()))
    '''

    # Get units. The list is always taken from the first feature; when a subset is requested
    # it is filtered here instead of popping entries out of the caller's `units_b`.
    unit_list = list(units_b[feat_names[0]].keys())
    if units is not None:  # we're using a subset of all units
        unit_list = [unit for unit in unit_list if int(unit) in units]

    # Initialize the output bunches.
    p_vals_b = bb.core.Bunch()
    cv_b = bb.core.Bunch()

    # Set the test as a lambda function (in future, more tests can be added to this dict)
    tests = \
        {
            'ks': lambda x, y: stats.kstest(x, y)
        }
    test_fun = tests[test]

    # For each feature, get each unit's p-value and coefficient of variation, store them in
    # per-feature bunches, then add those bunches to the parent bunches.
    for feat in feat_names:
        p_vals_feat = bb.core.Bunch((unit, 0) for unit in unit_list)
        cv_feat = bb.core.Bunch((unit, 0) for unit in unit_list)
        for unit in unit_list:
            # If the unit has no spike times, create a NaN placeholder and skip it:
            if len(units_b['times'][str(unit)]) == 0:
                p_val = np.nan
                cv = np.nan
            else:
                values = units_b[feat][unit]
                _, p_val = test_fun(values, dist)
                # Coefficient of variation is std / mean (previously var / mean).
                cv = np.std(values) / np.mean(values)
            p_vals_feat[str(unit)] = p_val
            cv_feat[str(unit)] = cv
        p_vals_b[feat] = p_vals_feat
        cv_b[feat] = cv_feat

    return p_vals_b, cv_b
elif distname in right: sm = rvs.mean() sstd = np.sqrt(rvs.var()) par_est = tuple(distfn.fit(rvs, loc=0, scale=1)) else: sm = rvs.mean() sstd = np.sqrt(rvs.var()) par_est = tuple(distfn.fit(rvs, loc=sm, scale=sstd)) print('fit', par_est) arg_est = par_est[:-2] loc_est = par_est[-2] scale_est = par_est[-1] rvs_normed = (rvs - loc_est) / scale_est ks_stat, ks_pval = stats.kstest(rvs_normed, distname, arg_est) print('kstest', ks_stat, ks_pval) quant = 0.1 crit = distfn.ppf(1 - quant * float(rind), loc=loc_est, scale=scale_est, *par_est) tail_prob = stats.t.sf(crit, dgp_arg, scale=dgp_scale) print('crit, prob', quant, crit, tail_prob) #if distname == 'norm': #plothist(rvs,loc_est,scale_est) #args = tuple() results.append([ distname, ks_stat, ks_pval, arg_est, loc_est, scale_est, crit, tail_prob ])
print("#,FY,avg, v, p-value") arpv = np.zeros((200, 3)) #一標本KS検定の結果を格納する for i in range(n_topics): for j in range(3): s = 0 s2 = 0 for k in range(ndata): s += ar[i][j][k] s2 += ar[i][j][k]**2 # 平均 avg = s / ndata # 標準偏差 v = math.sqrt(((s2 - avg**2 * ndata) / (ndata - 1))) pv = stats.kstest(ar[i][j], stats.norm(loc = avg, scale=v).cdf) if (pv[1] < 0.05): # 正規分布 print("{},{},{:.6f},{:.6f},{:.6f},パラメトリック".format(i, j+2015, avg, v, pv[1])) else: print("{},{},{:.6f},{:.6f},{:.6f},ノンパラメトリック".format(i, j+2015, avg, v, pv[1])) arpv[i][j] = pv[1] print("一標本KS検定結果") print("#,15,16,17") for i in range(n_topics): print(f"{i},{arpv[i][0]},{arpv[i][1]},{arpv[i][2]}") print("二標本KS検定") print("#,15vs16,16vs17") for i in range(n_topics): # 2015 vs 2016 # 2016 vs 2017
def fit_summary_plot(df, dfo=None, show=True, bins=15, select=True, **kwargs):
    """Plot observed vs. model histograms for three quantities with KS p-values.

    :param df: model table; columns 'uae_obs_jig', 'rec_obs_jig' and
        'colour_obs' are plotted, and 'selected_jig' is used when *select*
        is true.
    :param dfo: observed table with columns 'mueff_av', 'rec_arcsec' and
        'g_r'. When None it is obtained from
        ``load_sample(select=select, **kwargs)``.
    :param show: call ``plt.show(block=False)`` before returning.
    :param bins: number of histogram bins.
    :param select: keep only the rows of *df* with ``selected_jig == 1``.
    :param kwargs: forwarded to ``load_sample`` when *dfo* is None.
    :return: the created figure.
    """
    if dfo is None:
        dfo = load_sample(select=select, **kwargs)
    if select:
        df = df[df["selected_jig"] == 1].reset_index(drop=True)

    fig = plt.figure(figsize=(6, 6))
    histkwargs = dict(density=True, histtype="step")

    # (observed column, model column, axis label) for each of the three rows.
    panels = (
        ("mueff_av", "uae_obs_jig", "uae"),
        ("rec_arcsec", "rec_obs_jig", "rec"),
        ("g_r", "colour_obs", "g-r"),
    )
    for row, (obs_col, model_col, label) in enumerate(panels, start=1):
        ax = plt.subplot(3, 1, row)
        observed = dfo[obs_col]
        modelled = df[model_col]
        # Shared range so both histograms use identical bin edges.
        rng = (min(observed.min(), modelled.min()),
               max(observed.max(), modelled.max()))
        ax.hist(observed.values, color="k", range=rng, bins=bins,
                label="obs", **histkwargs)
        ax.hist(modelled.values, color="b", range=rng, bins=bins,
                label="model", **histkwargs)
        # p-value of the KS comparison between the two samples.
        ks = kstest(observed.values, modelled.values)[1]
        ax.legend(loc="best")
        ax.set_xlabel(f"{label} (KS pval={ks:.2f})")

    plt.tight_layout()
    if show:
        plt.show(block=False)
    return fig
####################################################################
###################### Numbers from C++ Tests ######################
####################################################################
# Load the numbers and format their mean and variance for the plot label.
cPRNGs = np.loadtxt("prngNums.txt")
mean = "The Mean = " + '%.4f' % np.mean(cPRNGs)
variance = "The Variance = " + '%.5f' % np.var(cPRNGs)

# Cycle Check: position of the first later value equal to the first value;
# stays "m" when the first value never reappears.
first = cPRNGs[0]
cycle = "m"
count = 0
for count, value in enumerate(cPRNGs[1:], start=1):
    if value == first:
        cycle = count
        break

# Sort in place (done only after the cycle check, which needs file order).
cPRNGs.sort()

# Kolmogorov Smirnov Test
# NOTE(review): the reference distribution is the standard normal; if the
# numbers are meant to be uniform, 'uniform' may have been intended -- confirm.
result = stats.kstest(cPRNGs, 'norm')

plt.hist(cPRNGs)
plt.title("C++ STD LCG Test")
plt.xlabel("Value\n\n" + mean + "\n" + variance + "\n" + str(result) +
           "\n\nFigure 1: C++ Test")
plt.ylabel("Frequency")
plt.savefig('histc++1.png', bbox_inches='tight')
plt.show()
_, t, p, a, m, _, _, _ = simulate( rng, N, 0, 1, 1, lambda rng: rng.exponential(0.1) ) T[i, :] = t P[i, :] = p A[i, :] = a M[i, :] = m Tl = T[:, -1] Pl = P[:, -1] Al = A[:, -1] Ml = M[:, -1] # %% Display simulation plt.figure(1, clear=True) plt.hist(Pl) print(st.kstest(Pl, "norm")) plt.title( f"$P_{{{N}}}, p = {st.kstest((Pl-np.mean(Pl))/np.std(Pl), 'norm')[1]:.4}$" ) plt.figure(2, clear=True) plt.hist(Tl) plt.title( f"$T_{{{N}}}, p = {st.kstest((Tl-np.mean(Tl))/np.std(Tl), 'norm')[1]:.4}$" ) plt.figure(3, clear=True) plt.hist(Al) plt.title( f"$A_{{{N}}}, p = {st.kstest((Al-np.mean(Al))/np.std(Al), 'norm')[1]:.4}$" ) plt.figure(4, clear=True) plt.hist(Ml)
def _kstest(self, loc, scale, samples, max_statistic=0.02):
    """Check *samples* against a Laplace distribution with a KS test.

    Uses the Kolmogorov-Smirnov test for goodness of fit, comparing
    *samples* with the cdf of ``laplace(loc, scale)``.

    Args:
        loc: location of the reference Laplace distribution.
        scale: scale of the reference Laplace distribution.
        samples: the values to test.
        max_statistic: the KS statistic must be strictly below this value for
            the test to pass. Defaults to 0.02, the threshold that was
            previously hard-coded.

    Returns:
        True when the test passes, i.e. the KS statistic is below
        *max_statistic*. The p-value is not used.
    """
    reference = sp_stats.laplace(loc, scale=scale)
    ks, _ = sp_stats.kstest(samples, reference.cdf)
    return ks < max_statistic
def test_correct_sampling(sampler_c, model_and_weights, set_pdf_power):
    """Check that the sampler draws from the expected distribution.

    For a ``DiscreteHilbert`` space the histogram of sampled states is
    compared with the expected frequencies through a chi-square test. For a
    ``ContinuousBoson`` space the binned samples are compared with binned
    draws from a multivariate normal through ``kstest``. Each check is
    repeated 6 times and passes when the Fisher-combined p-value, or the
    largest individual p-value, exceeds 0.01.
    """
    sampler = set_pdf_power(sampler_c)
    hi = sampler.hilbert
    n_rep = 6

    def draw(ma, w, state, n_burn, n_keep):
        # Reset the sampler, discard n_burn steps, then keep n_keep steps.
        state = sampler.reset(ma, w, state=state)

        # Burnout phase
        burned, state = sampler.sample(ma, w, state=state, chain_length=n_burn)
        assert burned.shape == (
            n_burn,
            sampler.n_chains,
            hi.size,
        )

        kept, state = sampler.sample(ma, w, state=state, chain_length=n_keep)
        assert kept.shape == (n_keep, sampler.n_chains, hi.size)
        return kept, state

    if isinstance(hi, DiscreteHilbert):
        n_states = hi.n_states

        ma, w = model_and_weights(hi, sampler)

        n_samples = max(40 * n_states, 100)

        # Normalised |array|**machine_pow: the expected probabilities.
        ps = np.absolute(nk.nn.to_array(hi, ma, w, normalize=False))
        ps = ps**sampler.machine_pow
        ps /= ps.sum()

        pvalues = np.zeros(n_rep)
        sampler_state = sampler.init_state(ma, w, seed=SAMPLER_SEED)

        for rep in range(n_rep):
            samples, sampler_state = draw(ma, w, sampler_state,
                                          n_samples // 100, n_samples)

            state_ids = hi.states_to_numbers(
                np.asarray(samples.reshape(-1, hi.size)))

            # fill in the histogram for sampler
            seen, seen_counts = np.unique(state_ids, return_counts=True)
            hist_samp = np.zeros(n_states)
            hist_samp[seen] = seen_counts

            # expected frequencies
            f_exp = state_ids.size * ps

            _, pvalues[rep] = chisquare(hist_samp, f_exp=f_exp)

        _, pval = combine_pvalues(pvalues, method="fisher")
        assert pval > 0.01 or np.max(pvalues) > 0.01

    elif isinstance(hi, ContinuousBoson):
        ma, w = model_and_weights(hi, sampler)

        n_samples = 5000
        n_discard = 2000

        pvalues = np.zeros(n_rep)
        sampler_state = sampler.init_state(ma, w, seed=SAMPLER_SEED)

        for rep in range(n_rep):
            samples, sampler_state = draw(ma, w, sampler_state,
                                          n_discard, n_samples)
            samples = samples.reshape(-1, samples.shape[-1])

            # Zero-mean normal with covariance inv(machine_pow * K^T K),
            # K being the "kernel" parameter.
            kernel = w["params"]["kernel"]
            dist = multivariate_normal(
                mean=np.zeros(samples.shape[-1]),
                cov=np.linalg.inv(
                    sampler.machine_pow * np.dot(kernel.T, kernel)),
            )
            exact_samples = dist.rvs(size=samples.shape[0])

            # Bin both sample sets on the same edges and compare the counts.
            counts, bins = np.histogramdd(samples, bins=10)
            counts_exact, _ = np.histogramdd(exact_samples, bins=bins)

            _, pvalues[rep] = kstest(counts.reshape(-1),
                                     counts_exact.reshape(-1))

        _, pval = combine_pvalues(pvalues, method="fisher")
        assert pval > 0.01 or np.max(pvalues) > 0.01
def pareto_ks(loc, rvs):
    """Return the KS statistic of *rvs* against a Pareto fit with fixed *loc*.

    The estimates come from ``stats.pareto.fit_fr`` called with
    ``frozen=[np.nan, loc, np.nan]``; its first two results are used as the
    first and third distribution arguments, with *loc* in between.
    NOTE(review): presumably ``np.nan`` marks the parameters left free and
    ``fit_fr`` returns only those -- confirm against ``fit_fr``.
    """
    #start_scale = rvs.min() - loc # not used yet
    estimates = stats.pareto.fit_fr(rvs, 1., frozen=[np.nan, loc, np.nan])
    first_est, second_est = estimates[0], estimates[1]
    ks_result = stats.kstest(rvs, 'pareto', (first_est, loc, second_est))
    # Only the statistic (element 0) is returned, not the p-value.
    return ks_result[0]