def test_nc_parameter(self):
    # Parameter values nc <= 0 were not enabled (gh-2402).
    # For nc < 0 and nc == 0, the rv.cdf(0) calls below returned nan.
    rv = stats.nct(5, 0)
    assert_equal(rv.cdf(0), 0.5)
    rv = stats.nct(5, -1)
    assert_almost_equal(rv.cdf(0), 0.841344746069, decimal=10)
def test_variance_gh_issue_2401():
    # Computation of the variance of a non-central t-distribution resulted
    # in a TypeError: ufunc 'isinf' not supported for the input types,
    # and the inputs could not be safely coerced to any supported types
    # according to the casting rule 'safe'
    rv = stats.nct(4, 0)
    assert_equal(rv.var(), 2.0)
def initialize_nct_distribution(df, nc):
    # For large nc the non-central t pdf can overflow and evaluate to nan;
    # in that case fall back to a central t shifted by nc.
    np.seterr(over='ignore', invalid='ignore')
    test_value = stats.nct.pdf(x=nc, df=df, nc=nc)
    if np.isnan(test_value):
        return stats.t(df=df, loc=nc)
    else:
        return stats.nct(df=df, nc=nc)
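# Usage sketch for the helper above (assumes numpy as np and scipy.stats as
# stats, as in the function body). For moderate parameters a frozen nct is
# returned; for extreme nc the pdf may evaluate to nan and the helper then
# degrades to a shifted central t.
rv_ok = initialize_nct_distribution(df=10, nc=2.0)
rv_extreme = initialize_nct_distribution(df=10, nc=1e6)
print(type(rv_ok.dist).__name__)       # 'nct_gen'
print(type(rv_extreme.dist).__name__)  # 'nct_gen' or 't_gen', version-dependent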
def ttest_ind_sample_size(mu1, mu2, s1, s2, r, power,
                          sig_level=0.05, alternative='two-sided', pooled=True):
    n1 = 2  # initialisation
    n2 = n1 * r
    sim_power = 0
    while sim_power < power:
        n1 += 1
        n2 = n1 * r
        if pooled:
            dof = n1 + n2 - 2
            pooled_var = (((s1**2) * (n1 - 1)) + ((s2**2) * (n2 - 1))) / dof
            std_error = np.sqrt(pooled_var * (1 / n1 + 1 / n2))
        else:
            var1 = (s1**2)  # assuming unbiased sample standard deviation
            var2 = (s2**2)
            dof = (var1 / n1 + var2 / n2)**2
            dof /= ((((var1 / n1)**2) / (n1 - 1)) + (((var2 / n2)**2) / (n2 - 1)))
            std_error = np.sqrt(var1 / n1 + var2 / n2)
        ncp = (mu2 - mu1) / std_error
        t_null = scs.t(df=dof, loc=0, scale=1)
        t_alt = scs.nct(df=dof, nc=ncp)
        if alternative == 'smaller':
            cv = t_null.ppf(sig_level)
            sim_power = t_alt.cdf(cv)
        elif alternative == 'larger':
            cv = t_null.ppf(1 - sig_level)
            sim_power = 1 - t_alt.cdf(cv)
        elif alternative == 'two-sided':
            cv = [t_null.ppf(sig_level / 2), t_null.ppf(1 - (sig_level / 2))]
            sim_power = sum([t_alt.cdf(cv[0]), 1 - t_alt.cdf(cv[1])])
    print('ncp: ', ncp)
    print('Critical t: ', cv)
    print('Actual Power: ', sim_power)
    return (np.ceil(n1), np.ceil(n2))
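# Example call (a sketch, assuming scipy.stats is imported as scs and numpy
# as np, matching the function body): size a two-sample test for a
# standardized effect of 0.5 (difference 5, sd 10) with equal allocation.
n1, n2 = ttest_ind_sample_size(mu1=0, mu2=5, s1=10, s2=10, r=1,
                               power=0.8, sig_level=0.05)
print(n1, n2)  # roughly 64 per group for 80% power at the 5% two-sided level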
def __init__(self, mu, sg, df, nc, lc, sc):
    self.norm = sps.norm(mu, sg)
    self.nct = sps.nct(df=df, nc=nc, loc=lc, scale=sc)
    # The nct mode has no closed form; it is bracketed by
    # nc*sqrt(df/(df + 5/2)) and nc*sqrt(df/(df + 1)), so estimate it
    # as the midpoint of the bracket, mapped through loc and scale.
    modeL = nc * np.sqrt(df / (df + 5 / 2))
    modeU = nc * np.sqrt(df / (df + 1))
    self.nct.modeEst = sc * (modeL + modeU) / 2 + lc
    self.norm.max = self.norm.pdf(mu)
    self.nct.max = self.nct.pdf(self.nct.modeEst)
    self.max = self.norm.max * self.nct.max
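# Sanity check of the mode bracket used above (a sketch, assuming
# scipy.stats as sps and numpy as np): the numeric argmax of the nct pdf
# should fall between nc*sqrt(df/(df + 5/2)) and nc*sqrt(df/(df + 1)).
df, nc = 8.0, 2.0
grid = np.linspace(0.0, 5.0, 20001)
numeric_mode = grid[np.argmax(sps.nct.pdf(grid, df=df, nc=nc))]
modeL = nc * np.sqrt(df / (df + 5 / 2))
modeU = nc * np.sqrt(df / (df + 1))
print(modeL <= numeric_mode <= modeU)  # expected: True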
def ttest_paired_sample_size(starting_n, effect_size, power=0.8,
                             sig_level=0.05, alternative='two-sided'):
    n = starting_n  # initialisation
    sim_power = 0
    while sim_power < power:
        n += 1
        dof = n - 1
        ncp = effect_size * np.sqrt(n)
        t_null = scs.t(df=dof, loc=0, scale=1)
        t_alt = scs.nct(df=dof, nc=ncp)
        if alternative == 'smaller':
            cv = t_null.ppf(sig_level)
            sim_power = t_alt.cdf(cv)
        elif alternative == 'larger':
            cv = t_null.ppf(1 - sig_level)
            sim_power = 1 - t_alt.cdf(cv)
        elif alternative == 'two-sided':
            cv = [t_null.ppf(sig_level / 2), t_null.ppf(1 - (sig_level / 2))]
            sim_power = sum([t_alt.cdf(cv[0]), 1 - t_alt.cdf(cv[1])])
    print('ncp: ', ncp)
    print('Critical t: ', cv)
    print('Actual Power: ', sim_power)
    return np.ceil(n)
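# Example call (same assumptions as above: scipy.stats as scs, numpy as np):
# pairs needed to detect a standardized mean difference of 0.5.
n = ttest_paired_sample_size(starting_n=2, effect_size=0.5,
                             power=0.8, sig_level=0.05)
print(n)  # roughly 34 pairs for 80% power at the 5% two-sided level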
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import nct

fig, ax = plt.subplots(1, 1)

df, nc = 14, 0.24
mean, var, skew, kurt = nct.stats(df, nc, moments='mvsk')

# Display the probability density function (``pdf``):
x = np.linspace(nct.ppf(0.01, df, nc), nct.ppf(0.99, df, nc), 100)
ax.plot(x, nct.pdf(x, df, nc), 'r-', lw=5, alpha=0.6, label='nct pdf')

# Alternatively, the distribution object can be called (as a function)
# to fix the shape, location and scale parameters. This returns a "frozen"
# RV object holding the given parameters fixed.
# Freeze the distribution and display the frozen ``pdf``:
rv = nct(df, nc)
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')

# Check accuracy of ``cdf`` and ``ppf``:
vals = nct.ppf([0.001, 0.5, 0.999], df, nc)
np.allclose([0.001, 0.5, 0.999], nct.cdf(vals, df, nc))
# True

# Generate random numbers:
r = nct.rvs(df, nc, size=1000)

# And compare the histogram:
ax.hist(r, density=True, histtype='stepfilled', alpha=0.2)
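# Cross-check of the 'mvsk' moments above (a sketch using the same imports):
# for df > 1 the non-central t has the closed-form mean
#   nc * sqrt(df/2) * Gamma((df-1)/2) / Gamma(df/2),
# computed here via gammaln for numerical stability.
from scipy.special import gammaln

analytic_mean = nc * np.sqrt(df / 2) * np.exp(gammaln((df - 1) / 2)
                                              - gammaln(df / 2))
print(np.isclose(mean, analytic_mean))  # expected: True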
# Fragment 1: burr moments (from the scipy docs example)
c, d = 10.5, 4.3
mean, var, skew, kurt = burr.stats(c, d, moments='mvsk')
x = np.linspace(burr.ppf(0.01, c, d), burr.ppf(0.99, c, d), 100)

# Fragment 2: overlay a fitted generalized logistic on the car-price histogram
# (``b`` is assumed to hold fit results, e.g.
#  b = ss.genlogistic.fit(distribution_car_price) -> (c, loc, scale))
alpha, loc, beta = b[0], b[1], b[2]
data = ss.genlogistic.rvs(alpha, loc=loc, scale=beta, size=5000)
myHist = plt.hist(distribution_car_price, 500, density=True)  # ``normed`` is removed in modern matplotlib
rv = ss.genlogistic(alpha, loc, beta)
x = np.linspace(0, 500000)
h = plt.plot(x, rv.pdf(x), lw=2)
axes = plt.gca()
axes.set_xlim([0, 150000])
plt.show()

# Fragment 3: the same comparison with a fitted nct
# (``a`` is assumed to hold a = ss.nct.fit(distribution_car_price)
#  -> (df, nc, loc, scale))
alpha, loc, beta = b[0], b[1], b[2]
data = ss.genlogistic.rvs(alpha, loc=loc, scale=beta, size=10000)
myHist = plt.hist(distribution_car_price, 500, density=True)
rv = ss.nct(a[0], a[1], a[2], a[3])
x = np.linspace(0, 500000)
h = plt.plot(x, rv.pdf(x), lw=2)
axes = plt.gca()
axes.set_xlim([0, 150000])
plt.show()
                                                9.1193851632305201, 261.3457987967214)
drivingduration_model_dict['exponweib'] = st.exponweib(2.6443841639764942,
                                                       0.89242254172118096,
                                                       10.603640861374947,
                                                       40.28556311444698)
drivingduration_model_dict['gengamma'] = st.gengamma(4.8743515108339581,
                                                     0.61806208678747043,
                                                     9.4649293818479716,
                                                     5.431576919220225)
drivingduration_model_dict['recipinvgauss'] = st.recipinvgauss(
    0.499908918842556, 0.78319699707613699, 28.725450197674746)
drivingduration_model_dict['f'] = st.f(9.8757694313677113, 12.347442183821462,
                                       0.051160749890587665, 73.072591767722287)

carprice_model_dict = ct.OrderedDict()
carprice_model_dict['nct'] = st.nct(7.3139456577106312, 3.7415255108348946,
                                    -46.285705145385577, 7917.0860181436065)
carprice_model_dict['genlogistic'] = st.genlogistic(10.736440967148635,
                                                    3735.7049978006107,
                                                    10095.421377235754)
carprice_model_dict['gumbel_r'] = st.gumbel_r(26995.077239517472,
                                              10774.370808211244)
carprice_model_dict['f'] = st.f(24168.523476867485, 35.805656864712923,
                                -21087.314142557225, 51154.0328397044)
carprice_model_dict['johnsonsu'] = st.johnsonsu(-1.7479864366935538,
                                                1.8675670208081987,
                                                14796.793096897647,
                                                14716.575397771712)
q = np.linspace(0, 1, 100)
e_param = stats.norminvgauss.fit(euro_log)
VaR = stats.norminvgauss(param[0], param[1], param[2], param[3]).ppf(q)
e_var = stats.norminvgauss(e_param[0], e_param[1], e_param[2], e_param[3]).ppf(q)
d, p = stats.kstest(logreturns[2], cdf='norminvgauss',
                    args=(param[0], param[1], param[2], param[3]))
ks.append(p)
#plt.plot(q, VaR, 'r-', label = 'Dogecoin')
#plt.plot(q, e_var, 'b--', label = 'Euro')
#plt.legend()

################LITECOIN###############
param = [1.8693373494312953, -0.08195764686173776,
         0.0017432733966738605, 0.021626187586807188]
q = np.linspace(0, 1, 100)
e_param = stats.nct.fit(euro_log)
VaR = stats.nct(param[0], param[1], param[2], param[3]).ppf(q)
e_var = stats.nct(e_param[0], e_param[1], e_param[2], e_param[3]).ppf(q)
d, p = stats.kstest(logreturns[3], cdf='nct',
                    args=(param[0], param[1], param[2], param[3]))
ks.append(p)
#plt.plot(q, VaR, 'r-', label = 'Litecoin')
#plt.plot(q, e_var, 'b--', label = 'Euro')
#plt.legend()

################NEXUS################
param = [2.269163744926829, -0.22915037600364918,
         0.014003933425093236, 0.048491139607216696]
q = np.linspace(0, 1, 100)
e_param = stats.nct.fit(euro_log)
VaR = stats.nct(param[0], param[1], param[2], param[3]).ppf(q)
e_var = stats.nct(e_param[0], e_param[1], e_param[2], e_param[3]).ppf(q)
d, p = stats.kstest(logreturns[4], cdf='nct',
                    args=(param[0], param[1], param[2], param[3]))
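# The fit-then-KS pattern above, condensed into a standalone sketch
# ("returns" below is synthetic stand-in data, not one of the series
# from the excerpt).
import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
returns = 0.02 * rng.standard_t(df=4, size=1000)  # stand-in for log returns

params = stats.nct.fit(returns)                   # (df, nc, loc, scale)
d, p = stats.kstest(returns, 'nct', args=params)
print(params, d, p)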
def all_dists():
    # Distribution parameters were taken from the scipy.stats official
    # documentation examples.
    # Total - 89
    return {
        "alpha": stats.alpha(a=3.57, loc=0.0, scale=1.0),
        "anglit": stats.anglit(loc=0.0, scale=1.0),
        "arcsine": stats.arcsine(loc=0.0, scale=1.0),
        "beta": stats.beta(a=2.31, b=0.627, loc=0.0, scale=1.0),
        "betaprime": stats.betaprime(a=5, b=6, loc=0.0, scale=1.0),
        "bradford": stats.bradford(c=0.299, loc=0.0, scale=1.0),
        "burr": stats.burr(c=10.5, d=4.3, loc=0.0, scale=1.0),
        "cauchy": stats.cauchy(loc=0.0, scale=1.0),
        "chi": stats.chi(df=78, loc=0.0, scale=1.0),
        "chi2": stats.chi2(df=55, loc=0.0, scale=1.0),
        "cosine": stats.cosine(loc=0.0, scale=1.0),
        "dgamma": stats.dgamma(a=1.1, loc=0.0, scale=1.0),
        "dweibull": stats.dweibull(c=2.07, loc=0.0, scale=1.0),
        "erlang": stats.erlang(a=2, loc=0.0, scale=1.0),
        "expon": stats.expon(loc=0.0, scale=1.0),
        "exponnorm": stats.exponnorm(K=1.5, loc=0.0, scale=1.0),
        "exponweib": stats.exponweib(a=2.89, c=1.95, loc=0.0, scale=1.0),
        "exponpow": stats.exponpow(b=2.7, loc=0.0, scale=1.0),
        "f": stats.f(dfn=29, dfd=18, loc=0.0, scale=1.0),
        "fatiguelife": stats.fatiguelife(c=29, loc=0.0, scale=1.0),
        "fisk": stats.fisk(c=3.09, loc=0.0, scale=1.0),
        "foldcauchy": stats.foldcauchy(c=4.72, loc=0.0, scale=1.0),
        "foldnorm": stats.foldnorm(c=1.95, loc=0.0, scale=1.0),
        # "frechet_r": stats.frechet_r(c=1.89, loc=0.0, scale=1.0),
        # "frechet_l": stats.frechet_l(c=3.63, loc=0.0, scale=1.0),
        "genlogistic": stats.genlogistic(c=0.412, loc=0.0, scale=1.0),
        "genpareto": stats.genpareto(c=0.1, loc=0.0, scale=1.0),
        "gennorm": stats.gennorm(beta=1.3, loc=0.0, scale=1.0),
        "genexpon": stats.genexpon(a=9.13, b=16.2, c=3.28, loc=0.0, scale=1.0),
        "genextreme": stats.genextreme(c=-0.1, loc=0.0, scale=1.0),
        "gausshyper": stats.gausshyper(a=13.8, b=3.12, c=2.51, z=5.18,
                                       loc=0.0, scale=1.0),
        "gamma": stats.gamma(a=1.99, loc=0.0, scale=1.0),
        "gengamma": stats.gengamma(a=4.42, c=-3.12, loc=0.0, scale=1.0),
        "genhalflogistic": stats.genhalflogistic(c=0.773, loc=0.0, scale=1.0),
        "gilbrat": stats.gilbrat(loc=0.0, scale=1.0),
        "gompertz": stats.gompertz(c=0.947, loc=0.0, scale=1.0),
        "gumbel_r": stats.gumbel_r(loc=0.0, scale=1.0),
        "gumbel_l": stats.gumbel_l(loc=0.0, scale=1.0),
        "halfcauchy": stats.halfcauchy(loc=0.0, scale=1.0),
        "halflogistic": stats.halflogistic(loc=0.0, scale=1.0),
        "halfnorm": stats.halfnorm(loc=0.0, scale=1.0),
        "halfgennorm": stats.halfgennorm(beta=0.675, loc=0.0, scale=1.0),
        "hypsecant": stats.hypsecant(loc=0.0, scale=1.0),
        "invgamma": stats.invgamma(a=4.07, loc=0.0, scale=1.0),
        "invgauss": stats.invgauss(mu=0.145, loc=0.0, scale=1.0),
        "invweibull": stats.invweibull(c=10.6, loc=0.0, scale=1.0),
        "johnsonsb": stats.johnsonsb(a=4.32, b=3.18, loc=0.0, scale=1.0),
        "johnsonsu": stats.johnsonsu(a=2.55, b=2.25, loc=0.0, scale=1.0),
        "ksone": stats.ksone(n=1e03, loc=0.0, scale=1.0),
        "kstwobign": stats.kstwobign(loc=0.0, scale=1.0),
        "laplace": stats.laplace(loc=0.0, scale=1.0),
        "levy": stats.levy(loc=0.0, scale=1.0),
        "levy_l": stats.levy_l(loc=0.0, scale=1.0),
        "levy_stable": stats.levy_stable(alpha=0.357, beta=-0.675,
                                         loc=0.0, scale=1.0),
        "logistic": stats.logistic(loc=0.0, scale=1.0),
        "loggamma": stats.loggamma(c=0.414, loc=0.0, scale=1.0),
        "loglaplace": stats.loglaplace(c=3.25, loc=0.0, scale=1.0),
        "lognorm": stats.lognorm(s=0.954, loc=0.0, scale=1.0),
        "lomax": stats.lomax(c=1.88, loc=0.0, scale=1.0),
        "maxwell": stats.maxwell(loc=0.0, scale=1.0),
        "mielke": stats.mielke(k=10.4, s=3.6, loc=0.0, scale=1.0),
        "nakagami": stats.nakagami(nu=4.97, loc=0.0, scale=1.0),
        "ncx2": stats.ncx2(df=21, nc=1.06, loc=0.0, scale=1.0),
        "ncf": stats.ncf(dfn=27, dfd=27, nc=0.416, loc=0.0, scale=1.0),
        "nct": stats.nct(df=14, nc=0.24, loc=0.0, scale=1.0),
        "norm": stats.norm(loc=0.0, scale=1.0),
        "pareto": stats.pareto(b=2.62, loc=0.0, scale=1.0),
        "pearson3": stats.pearson3(skew=0.1, loc=0.0, scale=1.0),
        "powerlaw": stats.powerlaw(a=1.66, loc=0.0, scale=1.0),
        "powerlognorm": stats.powerlognorm(c=2.14, s=0.446, loc=0.0, scale=1.0),
        "powernorm": stats.powernorm(c=4.45, loc=0.0, scale=1.0),
        "rdist": stats.rdist(c=0.9, loc=0.0, scale=1.0),
        "reciprocal": stats.reciprocal(a=0.00623, b=1.01, loc=0.0, scale=1.0),
        "rayleigh": stats.rayleigh(loc=0.0, scale=1.0),
        "rice": stats.rice(b=0.775, loc=0.0, scale=1.0),
        "recipinvgauss": stats.recipinvgauss(mu=0.63, loc=0.0, scale=1.0),
        "semicircular": stats.semicircular(loc=0.0, scale=1.0),
        "t": stats.t(df=2.74, loc=0.0, scale=1.0),
        "triang": stats.triang(c=0.158, loc=0.0, scale=1.0),
        "truncexpon": stats.truncexpon(b=4.69, loc=0.0, scale=1.0),
        "truncnorm": stats.truncnorm(a=0.1, b=2, loc=0.0, scale=1.0),
        "tukeylambda": stats.tukeylambda(lam=3.13, loc=0.0, scale=1.0),
        "uniform": stats.uniform(loc=0.0, scale=1.0),
        "vonmises": stats.vonmises(kappa=3.99, loc=0.0, scale=1.0),
        "vonmises_line": stats.vonmises_line(kappa=3.99, loc=0.0, scale=1.0),
        "wald": stats.wald(loc=0.0, scale=1.0),
        "weibull_min": stats.weibull_min(c=1.79, loc=0.0, scale=1.0),
        "weibull_max": stats.weibull_max(c=2.87, loc=0.0, scale=1.0),
        "wrapcauchy": stats.wrapcauchy(c=0.0311, loc=0.0, scale=1.0),
    }
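# Smoke-test sketch for the table above (assumes the same scipy.stats
# import): draw a couple of variates from every frozen distribution.
dists = all_dists()
for name, rv in dists.items():
    rv.rvs(size=2, random_state=0)
print('sampled from %d distributions' % len(dists))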
dist_continu = [d for d in dir(stats)
                if isinstance(getattr(stats, d), stats.rv_continuous)]
dist_discrete = [d for d in dir(stats)
                 if isinstance(getattr(stats, d), stats.rv_discrete)]
print('number of continuous distributions:', len(dist_continu))
print('number of discrete distributions: ', len(dist_discrete))

## Distributions can be used in one of two ways: either by passing all
## distribution parameters to each method call, or by freezing the parameters
## for the instance of the distribution. As an example, we can get the median
## of the distribution by using the percent point function, ppf, which is the
## inverse of the cdf:
print(stats.nct.ppf(0.5, 10, 2.5))
my_nct = stats.nct(10, 2.5)
print(my_nct.ppf(0.5))

## `help(stats.nct)` prints the complete docstring of the distribution.
## Instead we can print just some basic information:
print(stats.nct.extradoc)  # distribution-specific docs (older scipy only)
print('number of arguments: %d, shape parameters: %s' % (stats.nct.numargs,
                                                         stats.nct.shapes))
print('bounds of distribution lower: %s, upper: %s' % (stats.nct.a,
                                                       stats.nct.b))

## We can list all methods and properties of the distribution with
## `dir(stats.nct)`. Some of the methods are private methods that are
## not named as such, i.e. no leading underscore, for example veccdf or
def get_H1_statistic_distribution(self):
    effect_size, N = self.effect_size, self.N
    ncp = self.get_ncp()  # effect_size * np.sqrt(N)
    return stats.nct(N - 1, ncp)
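# Standalone version of the same idea (a sketch; effect_size, N and the
# one-sided alpha are illustrative, and ncp = effect_size * sqrt(N) as in
# the commented hint above).
import numpy as np
from scipy import stats

effect_size, N, alpha = 0.5, 30, 0.05
ncp = effect_size * np.sqrt(N)
h1 = stats.nct(N - 1, ncp)
crit = stats.t(N - 1).ppf(1 - alpha)  # critical value under H0
power = 1 - h1.cdf(crit)              # H1 mass beyond the critical value
print(round(power, 3))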
def run(self):
    self.collector = []
    # main op
    for contamination in tqdm.tqdm(self.c_grid):
        samps = int(contamination * self.n_samples)
        if samps < 2:
            continue
        # init running metrics
        running_metrics = defaultdict(list)
        for k in self.k_grid:
            clf = LocalOutlierFactor(n_neighbors=k, contamination=contamination)
            clf.fit_predict(self.data)
            X_scores = np.log(-clf.negative_outlier_factor_)
            t0 = X_scores.argsort()  # [::-1]
            top_k = t0[-samps:]
            min_k = t0[:samps]
            x_out = X_scores[top_k]
            x_in = X_scores[min_k]
            mc_out = np.mean(x_out)
            mc_in = np.mean(x_in)
            vc_out = np.var(x_out)
            vc_in = np.var(x_in)
            Tck = (mc_out - mc_in) / np.sqrt(
                self.eps + ((1 / samps) * (vc_out + vc_in)))
            running_metrics['tck'].append(Tck)
            running_metrics['mck_out'].append(mc_out)
            running_metrics['mck_in'].append(mc_in)
            running_metrics['vck_in'].append(vc_in)
            running_metrics['vck_out'].append(vc_out)
        largest_idx = np.array(running_metrics['tck']).argsort()[-1]
        mean_mc_out = np.mean(running_metrics['mck_out'])
        mean_mc_in = np.mean(running_metrics['mck_in'])
        mean_vc_out = np.mean(running_metrics['vck_out'])
        mean_vc_in = np.mean(running_metrics['vck_in'])
        # ncpc - non-centrality parameter
        ncpc = (mean_mc_out - mean_mc_in) / np.sqrt(
            self.eps + ((1 / samps) * (mean_vc_out + mean_vc_in)))
        # dfc - degrees of freedom
        dfc = (2 * samps) - 2
        if dfc <= 0:
            continue
        Z = nct(dfc, ncpc)  # non-central t-distribution
        Kopt = self.k_grid[largest_idx]
        Topt = running_metrics['tck'][largest_idx]
        Z = Z.cdf(Topt)
        self.collector.append([Kopt, Topt, Z, contamination])
    # keep the (k, contamination) pair with the largest cdf value
    max_cdf = 0.
    self.tuned_params = {}
    for Kopt, Topt, Z, contamination in self.collector:
        if Z > max_cdf:
            max_cdf = Z
            self.tuned_params['k'] = Kopt
            self.tuned_params['c'] = contamination
    print("\nTuned LOF Parameters : {}".format(self.tuned_params))
    return