def ibetam(a, b, x):
    """
    Incomplete beta function defined as the Mathematica Beta[x, a, b]:
    Beta[x, a, b] = Integral[t^(a - 1) * (1 - t)^(b - 1), {t, 0, x}]
    This routine only works for (0 < x < 1) & (b > 0) as required by JAM.

    """
    # V1.0: Michele Cappellari, Oxford, 01/APR/2008
    # V2.0: Use Hypergeometric function for negative a or b.
    #       From equation (6.6.8) of Abramowitz & Stegun (1964)
    #       MC, Oxford, 04/APR/2008
    # V3.0: Use recurrence relation of equation (26.5.16)
    #       from Abramowitz & Stegun (1964) for (a < 0) & (b > 0).
    #       See the online book here http://www.nr.com/aands/
    #       After suggestion by Gary Mamon. MC, Oxford, 16/APR/2009

    a = a + 3e-7  # Perturb to avoid singularities in gamma and betainc
    if np.all(a > 0):
        ib = special.betainc(a, b, x)
    else:
        p = int(np.ceil(np.abs(np.min(a))))
        tot = np.zeros((x.size, a.size))
        for j in range(p):  # Do NOT use gamma recurrence relation to avoid instabilities
            tot += special.gamma(j + b + a)/special.gamma(j + 1 + a)*x**(j + a)
        ib = tot*(1 - x)**b/special.gamma(b) + special.betainc(a + p, b, x)

    return ib*special.beta(a, b)
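# Sanity sketch (not part of the original source): for a > 0, ibetam should
# reduce to the non-regularized incomplete beta, i.e.
# special.betainc(a, b, x) * special.beta(a, b). The loose tolerance allows
# for the 3e-7 perturbation applied to `a` inside the routine.
import numpy as np
from scipy import special

a = np.array([0.5, 1.5])
b = 2.0
x = np.array([0.3, 0.7])
expected = special.betainc(a, b, x) * special.beta(a, b)
print(np.allclose(ibetam(a, b, x), expected, rtol=1e-4))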
def integrand(x, N1, X, N2, Y, r, s, lnb):
    # ss.binom.cdf(X, N, p) == betainc(N-X, X+1, 1-p), but betainc accepts
    # non-integer parameters. However, betainc(0, ., .) and betainc(., 0, .)
    # are nan because gamma(0) == 0
    if s <= 0:
        # Decreased, accumulate from 0 to Y
        b = betainc(max(1e-19, N2 - Y), Y + 1, (1 - x)**r)
    else:
        # Increased, accumulate from Y to N2, or 1 - cdf(from 0 to Y-1)
        b = 1 - betainc(N2 - (Y - 1), max(1e-19, Y), (1 - x)**r)
    return b * np.exp(X * np.log(x) + (N1 - X) * np.log(1 - x) - lnb)
def vol_cap(r, a):
    unit_volume = pi**(n/2.) / gamma(n/2. + 1)
    V = unit_volume * r**n
    if a >= 0:
        return V / 2 * betainc((n+1)/2., 0.5, 1 - (a/r)**2)
    else:
        return V - vol_cap(r, -a)
def printIncbet():
    successes = 1
    failures = 0
    for d in na.arange(0, 1, 0.01):
        result = betainc(successes + 1, failures + 1, d)
        print "Result: S:", successes,
        print "F:", failures, "d:", d, "inc: %.5f" % result
def betai(a, b, x):
    """
    Returns the incomplete beta function.

    I_x(a,b) = 1/B(a,b)*(Integral(0,x) of t^(a-1)(1-t)^(b-1) dt)

    where a,b>0 and B(a,b) = G(a)*G(b)/(G(a+b)) where G(a) is the gamma
    function of a.

    The standard broadcasting rules apply to a, b, and x.

    Parameters
    ----------
    a : array_like or float > 0
    b : array_like or float > 0
    x : array_like or float
        x will be clipped to be no greater than 1.0 .

    Returns
    -------
    betai : ndarray
        Incomplete beta function.

    """
    x = np.asarray(x)
    x = np.where(x < 1.0, x, 1.0)  # if x > 1 then return 1.0
    return special.betainc(a, b, x)
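# Quadrature sketch (an assumption-light check, not from the original
# source): for a, b > 0 the regularized incomplete beta returned by betai
# should match direct numerical integration of the integrand.
import numpy as np
from scipy import integrate, special

a, b, x = 2.5, 1.5, 0.4
num, _ = integrate.quad(lambda t: t**(a - 1) * (1 - t)**(b - 1), 0, x)
print(np.isclose(betai(a, b, x), num / special.beta(a, b)))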
def train(self, bandit, max_budget):
    self.S = na.zeros(bandit.narms) * 0.0
    self.F = na.zeros(bandit.narms) * 0.0

    for a_i in bandit.actions:
        for i in range(self.n):
            if len(bandit.log) >= max_budget:
                return
            r = bandit.sample(a_i)
            if r == 1:
                self.S[a_i] += 1.0
            else:
                self.F[a_i] += 1.0

            Pr_mu_less_than_bound = betainc(self.S[a_i] + 1,
                                            self.F[a_i] + 1,
                                            self.upperbound)
            ntrials = self.S[a_i] + self.F[a_i]
            arm_mean = self.S[a_i] / ntrials
            Pr_mu_greater_than_bound = 1 - Pr_mu_less_than_bound

            if Pr_mu_less_than_bound >= self.confidence:
                break
            if Pr_mu_greater_than_bound >= self.confidence:
                print
                print "arm", a_i
                print "arm_mean", arm_mean
                print "s", self.S
                print "f", self.F
                return  # this arm is awesome; leave
            else:
                # continue trying this arm
                pass
def pearsonr(A, B):
    """
    A broadcasting method to compute pearson r and p

    Code adapted from Stack Overflow
    -----------------------------------------------
    Parameters:
        A: matrix A, i*k
        B: matrix B, j*k
    Return:
        rcorr: matrix correlation, i*j
        pcorr: matrix correlation p, i*j
    Example:
        >>> rcorr, pcorr = pearsonr(A, B)
    """
    if isinstance(A, list):
        A = np.array(A)
    if isinstance(B, list):
        B = np.array(B)
    if np.ndim(A) == 1:
        A = np.expand_dims(A, axis=1).T
    if np.ndim(B) == 1:
        B = np.expand_dims(B, axis=1).T
    rcorr = 1.0 - distance.cdist(A, B, 'correlation')
    df = A.T.shape[1] - 2
    r_forp = rcorr * 1.0
    r_forp[r_forp == 1.0] = 0.0
    t_squared = rcorr.T**2 * (df / ((1.0 - rcorr.T) * (1.0 + rcorr.T)))
    pcorr = special.betainc(0.5 * df, 0.5, df / (df + t_squared))
    return rcorr, pcorr
def Dbeta(a=1.5, b=2.5):
    # the beta distribution
    return Distr(
        name='beta[a={0},b={1}]'.format(a, b),
        dom=(0., 1.),
        domv=(1.e-10, 1. - 1.e-10),
        mean=a / (a + b),
        std=sqrt(a * b / (a + b + 1)) / (a + b),
        pdf=lambda x: x**(a - 1.) * (1. - x)**(b - 1.) / beta(a, b),
        cdf=lambda x: betainc(a, b, x),
    )
def p_from_r(r, n):
    # two-sided p-value for a Pearson correlation r from n samples
    r = max(min(r, 1.0), -1.0)
    df = n - 2
    if abs(r) == 1.0:
        prob = 0.0
    else:
        t_squared = r * r * (df / ((1.0 - r) * (1.0 + r)))
        prob = special.betainc(0.5 * df, 0.5, df / (df + t_squared))
    return prob
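# Sanity sketch: p_from_r should reproduce the two-sided p-value from
# scipy.stats.pearsonr for the same data (assumes `special` is imported
# as in the snippet's context).
import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
x, y = rng.normal(size=50), rng.normal(size=50)
r, p = stats.pearsonr(x, y)
print(np.isclose(p_from_r(r, len(x)), p))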
def _sp_subvector_error_out_of_range(radius, dimensions, subdimensions):
    dist = SubvectorLength(dimensions, subdimensions)
    sq_r = radius * radius
    normalization = 1.0 - dist.cdf(radius)
    b = (dimensions - subdimensions) / 2.0
    aligned_integral = beta(subdimensions / 2.0 + 1.0, b) * (1.0 - betainc(
        subdimensions / 2.0 + 1.0, b, sq_r))
    cross_integral = beta((subdimensions + 1) / 2.0, b) * (1.0 - betainc(
        (subdimensions + 1) / 2.0, b, sq_r))
    numerator = (sq_r * normalization + (
        aligned_integral - 2.0 * radius * cross_integral) / beta(
        subdimensions / 2.0, b))
    with np.errstate(invalid='ignore'):
        return np.where(
            numerator > np.MachAr().eps,
            numerator / normalization,
            np.zeros_like(normalization))
def cdf(self, y):
    a = self.a
    b = self.b
    p = self.p
    q = self.q
    z = np.exp(a * np.log(y/b) - a * np.log(1 + (y/b)))
    cdf = betainc(p, q, z)
    return cdf
def ttest_pval(len_array, corr):
    '''
    adapted from
    ~/anaconda3/envs/Py2/lib/python2.7/site-packages/scipy/stats/stats.py
    >> pearsonr() calculates the p-value of the correlation indices using
    a ttest.
    '''
    df = len_array - 2
    t_squared = corr**2 * (df / ((1.0 - corr) * (1.0 + corr)))
    pval = betainc(0.5*df, 0.5, df/(df + t_squared))
    return pval
def _f(main, aux, tau):
    """Little wrapper for incomplete beta function computation because the
    argument order is different from ROOT and there are problems with
    certain sets of arguments."""
    from scipy.special import betainc
    a = main
    b = aux + 1
    x = 1. / (1. + tau)
    result = betainc(a, b, x)
    return result
def cdf(self, y):
    a = self.a
    b = self.b
    c = self.c
    p = self.p
    q = self.q
    z = (y/b)**a
    cdf = betainc(p, q, z)
    return cdf
def _uniform_order_statistic_cdf(i, n, t):
    """
    _uniform_order_statistic_cdf(i, n, t) -> Pr[U_(i+1) < t]

    Let U_1, ..., U_n ~ Uniform[0,1] be n independent random variables
    and let U_(1) < ... < U_(n) denote the same variables in sorted order.
    This function returns the Cumulative Distribution function of U_(i+1),
    i.e. Pr[U_(i+1) < t]

    note that this function also works for numpy array inputs"""
    return betainc(i+1, n-i, t)
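# Monte Carlo sketch (assumes `betainc` is imported as in the snippet's
# context): the empirical CDF of the (i+1)-th order statistic of n
# uniforms should approach _uniform_order_statistic_cdf(i, n, t).
import numpy as np

rng = np.random.default_rng(0)
i, n, t = 2, 5, 0.4
samples = np.sort(rng.random((100_000, n)), axis=1)[:, i]
print(np.isclose((samples < t).mean(),
                 _uniform_order_statistic_cdf(i, n, t), atol=5e-3))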
def log_sf(self, x):
    scalar = not isinstance(x, np.ndarray)
    x = np.atleast_1d(x)
    errs = np.seterr(divide='ignore')
    ret = np.log(special.betainc(x+1, self.r, self.p))
    np.seterr(**errs)
    ret[x < 0] = np.log(1.)
    if scalar:
        return ret[0]
    else:
        return ret
def myrincbeta(x, a, b):
    # compute the regularized incomplete beta function.
    if a < 0:
        cbf = (sps.gamma(a)*sps.gamma(b))/sps.gamma(a+b)
        res = (x**a * (1.0-x)**b) / (a * cbf)
        return myrincbeta(x, a+1.0, b) + res
    else:
        # cbf = (sps.gamma(a)*sps.gamma(b))/sps.gamma(a+b)
        cbf = 1.0  # sps.betainc is the regularized inc. beta fun.
        res = (sps.betainc(a, b, x) / cbf)
        return res
def ttest(m1, e1, n1, m2, e2, n2):
    # Welch's t-test from means and standard errors; returns 1 minus the
    # two-sided p-value
    m1 = float(m1)
    e1 = float(e1)
    m2 = float(m2)
    e2 = float(e2)
    v1 = e1**2
    v2 = e2**2
    t = (m1-m2)/sqrt(v1+v2)
    nu = (v1+v2)**2/(v1**2/(n1-1)+v2**2/(n2-1))
    x = nu/(nu+t**2)
    p = 1.-betainc(nu/2, .5, x)
    return p
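# Cross-check sketch (assumes `sqrt` and `betainc` imports from the
# snippet's context): ttest above is Welch's t-test built from means and
# standard errors, returning 1 minus the two-sided p-value, so it should
# complement scipy.stats.ttest_ind_from_stats.
import numpy as np
from scipy import stats

m1, s1, n1 = 5.0, 2.0, 30
m2, s2, n2 = 4.2, 1.5, 25
res = stats.ttest_ind_from_stats(m1, s1, n1, m2, s2, n2, equal_var=False)
out = ttest(m1, s1 / np.sqrt(n1), n1, m2, s2 / np.sqrt(n2), n2)
print(np.isclose(out, 1 - res.pvalue))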
def F_int_approx(a, b, c, A, x, k, verbose=False):
    # print a, b, c, x
    total = 0
    if x < 10e-10:
        return 0
    for i in range(1, k+1):
        if not special.betainc(A + 1, i+1, x):
            beta_inc = 10e-320
        else:
            beta_inc = special.betainc(A + 1, i+1, x)
        # print A+1, i+1, x, (math.lgamma(a+i) + math.lgamma(b+i) + math.lgamma(c)) - (math.lgamma(a) + math.lgamma(b) + math.lgamma(c+i) + math.log(math.factorial(i))), special.betaln(A + 1, i+1)
        total += \
            math.exp(
                (math.lgamma(a+i) + math.lgamma(b+i) + math.lgamma(c)) -
                (math.lgamma(a) + math.lgamma(b) + math.lgamma(c+i) +
                 math.log(math.factorial(i))) +
                math.log(beta_inc) +
                special.betaln(A + 1, i+1))
    return total
def corrcoef_matrix(matrix):
    # Code originating from http://stackoverflow.com/a/24547964
    # by http://stackoverflow.com/users/2455058/jingchao
    r = np.corrcoef(matrix)
    rf = r[np.triu_indices(r.shape[0], 1)]
    df = matrix.shape[1] - 2
    ts = rf * rf * (df / (1 - rf * rf))
    pf = betainc(0.5 * df, 0.5, df / (df + ts))
    p = np.zeros(shape=r.shape)
    p[np.triu_indices(p.shape[0], 1)] = pf
    p[np.tril_indices(p.shape[0], -1)] = pf
    p[np.diag_indices(p.shape[0])] = np.ones(p.shape[0])
    return r, p
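# Usage sketch (assumes numpy and betainc imports from the snippet's
# context): rows of the input are variables and columns observations; the
# pairwise p-values should agree with scipy.stats.pearsonr.
import numpy as np
from scipy import stats

rng = np.random.default_rng(42)
m = rng.normal(size=(4, 100))
r, p = corrcoef_matrix(m)
r01, p01 = stats.pearsonr(m[0], m[1])
print(np.isclose(r[0, 1], r01), np.isclose(p[0, 1], p01))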
def check_sample_mean(sm, v, n, popmean):
    # from stats.stats.ttest_1samp(a, popmean):
    # Calculates the t-obtained for the independent samples T-test on ONE
    # group of scores a, given a population mean.
    #
    # Returns: t-value, two-tailed prob
    df = n - 1
    svar = ((n-1)*v) / float(df)  # looks redundant
    t = (sm-popmean) / np.sqrt(svar*(1.0/n))
    prob = betainc(0.5*df, 0.5, df/(df + t*t))
    # return t, prob
    npt.assert_(prob > 0.01, 'mean fail, t,prob = %f, %f, m, sm=%f,%f' %
                (t, prob, popmean, sm))
def bdtr(k, n, p):
    # binomial CDF, mirroring the Cephes bdtr()
    if k < 0:
        return np.nan
    if k == n:
        return 1.0
    dn = n - k
    if k == 0:
        dk = np.exp(dn * np.log(1.0 - p))
    else:
        dk = k + 1
        dk = betainc(dn, dk, 1.0 - p)
    return dk
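# Sanity sketch: for integer k this bdtr should agree with the binomial
# CDF from scipy.stats (assumes numpy/betainc imports from the snippet).
import numpy as np
from scipy import stats

k, n, p = 3, 10, 0.35
print(np.isclose(bdtr(k, n, p), stats.binom.cdf(k, n, p)))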
def compute_policy(successes, failures, mu, epsilon, threshold_confidence,
                   accept_confidence, reject_confidence):
    p_in_threshold = (
        betainc(successes + 1, failures + 1, mu + epsilon) -
        betainc(successes + 1, failures + 1, mu - epsilon))
    p_below = betainc(successes + 1, failures + 1, mu)
    p_above = 1 - p_below

    if p_above > accept_confidence:
        result = "a"
    elif p_in_threshold > threshold_confidence:
        result = "t"
    elif p_below > reject_confidence:
        result = "r"
    else:
        result = "c"

    if successes == 0 and failures == 0:
        result = 'c'
    elif successes == 0:
        result = 'r'

    # if p_above > accept_confidence:
    #     result = "a"
    # else:
    #     result = "c"
    # if p_in_threshold > threshold_confidence:
    #     result = "t"
    # if p_below > reject_confidence:
    #     result = "r"
    # else:
    #     result = "c"
    return result
def printThresholds():
    max_idx = 20
    mu = 0.7
    epsilon = 0.2
    threshold_confidence = 0.7
    accept_confidence = 0.7
    reject_confidence = 0.95

    print "".rjust(2),
    for x in na.arange(0, max_idx):
        print ("%d" % x).rjust(2),
    print
    for successes in na.arange(0, max_idx):
        print ("%d" % successes).rjust(2),
        for failures in na.arange(0, max_idx):
            p_in_threshold = (
                betainc(successes + 1, failures + 1, mu + epsilon) -
                betainc(successes + 1, failures + 1, mu - epsilon))
            p_below = betainc(successes + 1, failures + 1, mu)
            p_above = 1 - p_below
            if p_in_threshold > threshold_confidence:
                result = "t"
            elif p_below > reject_confidence:
                result = "r"
            elif p_above > accept_confidence:
                result = "A"
            else:
                result = "c"
            if (successes + failures <= 10):
                print str(result).rjust(2),
            else:
                print " ".rjust(2),
            # print successes, failures, mu, "p(s): %.4f" % (1 - probability),
            # print "p(f): %.4f" % probability
        print
def relative_error(self, confidence=0.98, D=0):
    p = 0
    if D:
        try:
            from scipy import special, optimize
        except ImportError:
            raise Exception("Scipy needed for relative error bounds")
        k = self.k
        u = lambda D, k, e: (k - 1.0) / ((1.0 - e) * D)
        l = lambda D, k, e: (k - 1.0) / ((1.0 + e) * D)
        objective = (
            lambda e, D, k, confidence:
                special.betainc(k, D - k + 1, u(D, k, e)) -
                special.betainc(k, D - k + 1, l(D, k, e)) -
                confidence
        )
        try:
            p = optimize.newton(objective, x0=0.05, args=(D, k, confidence))
        except RuntimeError:
            pass
    else:
        p = math.sqrt(2.0 / (math.pi * (self.k - 2)))
    return p
def bdtrc(k, n, p):
    if k < 0:
        return 1.0
    if k == n:
        return 0.0
    dn = n - k
    if k == 0:
        if p < .01:
            dk = -np.expm1(dn * np.log1p(-p))
        else:
            dk = 1.0 - np.exp(dn * np.log(1.0 - p))
    else:
        dk = k + 1
        dk = betainc(dk, dn, p)
    return dk
def my_t1cdf(x):
    '''
    cumulative distribution function of a t-dist. with 1 degree of freedom
    function p = my_t1cdf(x)
    input
        x = point
    output
        p = cumulative probability
    see also: tcdf
    '''
    x = np.asarray(x, dtype=float)
    xsq = x * x
    # scipy's betainc takes (a, b, x); MATLAB's betainc takes (x, a, b)
    p = betainc(0.5, 0.5, 1 / (1 + xsq)) / 2
    p[x > 0] = 1 - p[x > 0]
    return p
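# Cross-check sketch (assumes numpy/betainc imports from the snippet's
# context): the t(1) distribution is the standard Cauchy, so my_t1cdf
# should agree with scipy.stats.t.cdf(x, df=1).
import numpy as np
from scipy import stats

xs = np.linspace(-5, 5, 11)
print(np.allclose(my_t1cdf(xs), stats.t.cdf(xs, df=1)))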
def resid_anscombe(self, Y, mu):
    '''
    The Anscombe residuals

    Parameters
    ----------
    Y : array-like
        Endogenous response variable
    mu : array-like
        Fitted mean response variable

    Returns
    -------
    resid_anscombe : array
        The Anscombe residuals as defined below.

    Formulas
    --------
    sqrt(n)*(cox_snell(Y)-cox_snell(mu))/(mu**(1/6.)*(1-mu)**(1/6.))

    where cox_snell is defined as
    cox_snell(x) = betainc(2/3., 2/3., x)*beta(2/3., 2/3.)
    where betainc is the regularized incomplete beta function and beta is
    the beta function

    Notes
    -----
    The name 'cox_snell' is idiosyncratic and is simply used for
    convenience following the approach suggested in Cox and Snell (1968).
    Further note that
    cox_snell(x) = x**(2/3.)/(2/3.)*hyp2f1(2/3.,1/3.,5/3.,x)
    where hyp2f1 is the hypergeometric 2f1 function. The Anscombe
    residuals are sometimes defined in the literature using the hyp2f1
    formulation. Both betainc and hyp2f1 can be found in scipy.

    References
    ----------
    Anscombe, FJ. (1953) "Contribution to the discussion of H. Hotelling's
        paper." Journal of the Royal Statistical Society B. 15, 229-30.

    Cox, DR and Snell, EJ. (1968) "A General Definition of Residuals."
        Journal of the Royal Statistical Society B. 30, 248-75.
    '''
    cox_snell = lambda x: special.betainc(2/3., 2/3., x) \
        * special.beta(2/3., 2/3.)
    return np.sqrt(self.n) * (cox_snell(Y) - cox_snell(mu)) / \
        (mu**(1/6.) * (1 - mu)**(1/6.))
def _ccprmod(self, proba, target, B=20):
    """
    Cf MATLAB code CCPRMOD
    Classifier competence based on probabilistic modelling
    cc = ccprmod(d, j, B)

    Input:
        proba - NxC matrix of normalised C class supports produced by the
                classifier for N objects
        target - Nx1 vector of indices of the correct classes for N objects
        B - number of points used in the calculation of the competence,
            higher values result in a more accurate estimation (optional,
            default B=20)

    Output:
        competences - Nx1 vector of the classifier competences
    """
    n_sample, n_classes = proba.shape

    # Generating points
    x = np.linspace(0, 1, B)
    x = repmat(x, n_sample, n_classes)

    # Calculating parameters of the beta pdfs
    a = np.zeros(x.shape)
    b = np.zeros(x.shape)
    betaincj = np.zeros(x.shape)
    for c in range(n_classes):
        a[:, c*B: (c+1)*B] = repmat(n_classes*proba[:, c].reshape(-1, 1), 1, B)
    b = n_classes - a
    a[a == 0] = 1e-9
    b[b == 0] = 1e-9
    x[x == 0] = 1e-9
    betaincj = betainc(a, b, x)

    # calculating competences
    cc = np.zeros((n_sample, 1))
    for n in range(n_sample):
        t = range(target[n]*B, (target[n]+1)*B)
        bc = betaincj[n, t]
        setdiff = list(set(range(n_classes*B)) - set(t))
        bi = betaincj[n, setdiff]
        bi = np.reshape(bi, (n_classes-1, B))
        cc[n] = sum((bc[1:] - bc[:-1]) *
                    np.prod((bi[:, :-1] + bi[:, 1:])/2, axis=0))
    return cc
def fv_test(x0, x1):
    # taken from IDL library (FV_TEST, after the Numerical Recipes ftest)
    nx0 = len(x0)
    nx1 = len(x1)
    v0 = np.var(x0)
    v1 = np.var(x1)
    if v0 > v1:
        f = v0/v1
        df0 = nx0 - 1  # numerator degrees of freedom
        df1 = nx1 - 1  # denominator degrees of freedom
    else:
        f = v1/v0
        df0 = nx1 - 1
        df1 = nx0 - 1
    prob = 2.0*betainc(0.5*df1, 0.5*df0, df1/(df1 + df0*f))
    if prob > 1:
        return (f, 2.0 - prob)
    else:
        return (f, prob)
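# Cross-check sketch (relies on the degree-of-freedom assignment above):
# the two-tailed variance-ratio probability can be reproduced with the
# F-distribution survival function from scipy.stats.
import numpy as np
from scipy import stats

rng = np.random.default_rng(1)
x0, x1 = rng.normal(size=40), rng.normal(scale=2.0, size=35)
f, prob = fv_test(x0, x1)
if np.var(x0) > np.var(x1):
    expected = 2.0 * stats.f.sf(f, len(x0) - 1, len(x1) - 1)
else:
    expected = 2.0 * stats.f.sf(f, len(x1) - 1, len(x0) - 1)
print(np.isclose(prob, min(expected, 2.0 - expected)))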
def pearsonr(x, y, eps=1e-5):
    r"""
    Calculate a Pearson correlation coefficient and the p-value for testing
    non-correlation.

    The Pearson correlation coefficient measures the linear relationship
    between two datasets. Strictly speaking, Pearson's correlation requires
    that each dataset be normally distributed, and not necessarily zero-mean.
    Like other correlation coefficients, this one varies between -1 and +1
    with 0 implying no correlation. Correlations of -1 or +1 imply an exact
    linear relationship. Positive correlations imply that as x increases, so
    does y. Negative correlations imply that as x increases, y decreases.

    The p-value roughly indicates the probability of an uncorrelated system
    producing datasets that have a Pearson correlation at least as extreme
    as the one computed from these datasets. The p-values are not entirely
    reliable but are probably reasonable for datasets larger than 500 or so.

    Parameters
    ----------
    x : (N,) array_like
        Input
    y : (N,) array_like
        Input

    Returns
    -------
    r : float
        Pearson's correlation coefficient
    p-value : float
        2-tailed p-value

    Notes
    -----
    The correlation coefficient is calculated as follows:

    .. math::

        r_{pb} = \frac{\sum (x - m_x) (y - m_y)}
                      {\sqrt{\sum (x - m_x)^2 \sum (y - m_y)^2}}

    where :math:`m_x` is the mean of the vector :math:`x` and :math:`m_y` is
    the mean of the vector :math:`y`.

    References
    ----------
    http://www.statsoft.com/textbook/glosp.html#Pearson%20Correlation

    Examples
    --------
    >>> from scipy import stats
    >>> a = np.array([0, 0, 0, 1, 1, 1, 1])
    >>> b = np.arange(7)
    >>> stats.pearsonr(a, b)
    (0.8660254037844386, 0.011724811003954654)

    >>> stats.pearsonr([1,2,3,4,5], [5,6,7,8,7])
    (0.83205029433784372, 0.080509573298498519)

    """
    # x and y should have same length.
    x = np.asarray(x)
    y = np.asarray(y)
    n = x.shape[-1]
    mx = x.mean(axis=-1, keepdims=True)
    my = y.mean(axis=-1, keepdims=True)
    xm, ym = x - mx, y - my
    r_num = np.sum(xm * ym, axis=-1)
    # r_den = np.sqrt(sum_of_squares(xm) * sum_of_squares(ym))
    xm = np.sum(xm**2, axis=-1)
    ym = np.sum(ym**2, axis=-1)
    r_den = np.sqrt(xm * ym)
    idx = np.where(r_den == 0)[0]
    r_den[idx] = eps
    r = r_num / r_den
    r[idx] = 0.0

    # Presumably, if abs(r) > 1, then it is only some small artifact of
    # floating point arithmetic.
    r = np.clip(r, -1.0, 1.0)
    df = n - 2

    idx = np.where(abs(r) == 1.0)[0]
    r[idx] += eps
    t_squared = r**2 * (df / ((1.0 - r) * (1.0 + r)))
    prob = special.betainc(
        0.5*df, 0.5, np.fmin(np.asarray(df / (df + t_squared)), 1.0)
    )
    prob[idx] = 0.0
    r[idx] -= eps
    return r, prob
def p(k, F):
    g = 2 * F * F + 1
    a = g / (1 + g)
    return np.power(g, -.5 * k) * np.power(a, k) / (
        k * beta(.5 * k, .5 * k + 1)) + 1 - betainc(.5 * k, .5 * k + 1, a)
def cdf(dist, a, b, *args, **kwargs):
    G = dist.cdf(*args, **kwargs)
    return sp.betainc(a, b, G)
def _betai(a, b, x):
    x = np.asarray(x)
    x = np.where(x < 1.0, x, 1.0)  # if x > 1 then return 1.0
    return betainc(a, b, x)
def _pearsonr(x: xr.DataArray, y: xr.DataArray, monitor: Monitor) -> xr.Dataset:
    """
    Calculates Pearson correlation coefficients and p-values for testing
    non-correlation of lon/lat/time xarray datasets for each lon/lat point.

    Heavily influenced by scipy.stats.pearsonr

    The Pearson correlation coefficient measures the linear relationship
    between two datasets. Strictly speaking, Pearson's correlation requires
    that each dataset be normally distributed, and not necessarily zero-mean.
    Like other correlation coefficients, this one varies between -1 and +1
    with 0 implying no correlation. Correlations of -1 or +1 imply an exact
    linear relationship. Positive correlations imply that as x increases, so
    does y. Negative correlations imply that as x increases, y decreases.

    The p-value roughly indicates the probability of an uncorrelated system
    producing datasets that have a Pearson correlation at least as extreme
    as the one computed from these datasets. The p-values are not entirely
    reliable but are probably reasonable for datasets larger than 500 or so.

    :param x: lon/lat/time xr.DataArray
    :param y: xr.DataArray of the same spatiotemporal extents and resolution as x.
    :param monitor: Monitor to use for monitoring the calculation
    :return: A dataset containing the correlation coefficients and p_values on
             the lon/lat grid of x and y.

    References
    ----------
    http://www.statsoft.com/textbook/glosp.html#Pearson%20Correlation
    """
    with monitor.starting("Calculate Pearson correlation", total_work=6):
        n = len(x['time'])

        xm, ym = x - x.mean(dim='time'), y - y.mean(dim='time')
        xm_ym = xm * ym
        r_num = xm_ym.sum(dim='time')
        xm_squared = xr.ufuncs.square(xm)
        ym_squared = xr.ufuncs.square(ym)
        r_den = xr.ufuncs.sqrt(xm_squared.sum(dim='time') *
                               ym_squared.sum(dim='time'))
        r_den = r_den.where(r_den != 0)
        r = r_num / r_den

        # Presumably, if abs(r) > 1, then it is only some small artifact of
        # floating point arithmetic.
        # At this point r should be a lon/lat dataArray, so it should be safe
        # to load it in memory explicitly. This may take time as it will
        # kick-start deferred processing.
        # Comparing with NaN produces warnings that can be safely ignored
        default_warning_settings = np.seterr(invalid='ignore')
        with monitor.child(1).observing("task 1"):
            negativ_r = r.values < -1.0
        with monitor.child(1).observing("task 2"):
            r.values[negativ_r] = -1.0
        with monitor.child(1).observing("task 3"):
            positiv_r = r.values > 1.0
        with monitor.child(1).observing("task 4"):
            r.values[positiv_r] = 1.0
        np.seterr(**default_warning_settings)
        r.attrs = {'description': 'Correlation coefficients between'
                   ' {} and {}.'.format(x.name, y.name)}

        df = n - 2
        t_squared = xr.ufuncs.square(r) * \
            (df / ((1.0 - r.where(r != 1)) * (1.0 + r.where(r != -1))))
        prob = df / (df + t_squared)
        with monitor.child(1).observing("task 5"):
            prob_values_in = prob.values
        with monitor.child(1).observing("task 6"):
            prob.values = betainc(0.5 * df, 0.5, prob_values_in)
        prob.attrs = {'description': 'Rough indicator of probability of an'
                      ' uncorrelated system producing datasets that have a'
                      ' Pearson correlation at least as extreme as the one'
                      ' computed from these datasets. Not entirely reliable,'
                      ' but reasonable for datasets larger than 500 or so.'}

        retset = xr.Dataset({'corr_coef': r, 'p_value': prob})
    return retset
def zero_beta(x, *args):
    q, r, normal_val = args
    zero_beta = np.absolute(spec.betainc(q, r, x) - normal_val)
    return zero_beta
def rolling_pr_rmsd(timestamps, data, window_size, center, min_periods):
    """
    DEPRECATED: use the faster version in metrics._fast instead!
    Only here for testing.

    Computation of rolling Pearson R.

    Parameters
    ----------
    timestamps : float64
        Time stamps as julian dates.
    data : numpy.ndarray
        Time series data in 2d array.
    window_size : float
        Window size in fraction of days.
    center : bool
        Set window at the center.
    min_periods : int
        Minimum number of observations in window required for computation.

    Results
    -------
    pr_arr : numpy.array
        Pearson R and p-value.
    """
    pr_arr = np.empty((timestamps.size, 2), dtype=np.float32)
    rmsd_arr = np.empty(timestamps.size, dtype=np.float32)
    ddof = 0

    for i in range(timestamps.size):
        time_diff = timestamps - timestamps[i]

        if center:
            inside_window = np.abs(time_diff) <= window_size
        else:
            inside_window = (time_diff <= 0) & (time_diff > -window_size)

        idx = np.nonzero(inside_window)[0]
        n_obs = inside_window.sum()

        if n_obs == 0 or n_obs < min_periods:
            pr_arr[i, :] = np.nan
        else:
            sub1 = data[idx[0]:idx[-1] + 1, 0]
            sub2 = data[idx[0]:idx[-1] + 1, 1]

            # pearson r
            pr_arr[i, 0] = np.corrcoef(sub1, sub2)[0, 1]

            # p-value
            if np.abs(pr_arr[i, 0]) == 1.0:
                pr_arr[i, 1] = 0.0
            else:
                df = n_obs - 2.
                t_squared = pr_arr[i, 0]*pr_arr[i, 0] * \
                    (df / ((1.0 - pr_arr[i, 0]) * (1.0 + pr_arr[i, 0])))
                x = df / (df + t_squared)
                x = np.ma.where(x < 1.0, x, 1.0)
                pr_arr[i, 1] = betainc(0.5 * df, 0.5, x)

            # rmsd
            rmsd_arr[i] = np.sqrt(
                np.sum((sub1 - sub2)**2) / (sub1.size - ddof))

    return pr_arr, rmsd_arr
def cumulative_probability(self, x):
    # scipy.special.betainc is already the *regularized* incomplete beta
    # I_x(a, b), so no further division by beta(a, b) is needed
    return betainc(self.a, self.b, x)
def OLS_AR1Corr(TheData, TheMDI, TheConfRange):  # ,Lowee=Lowee,Highee=Highee):
    ''' Calculates the linear trend using Ordinary Least Squares regression '''
    ''' Can cope with specified missing data indicator TheMDI '''
    ''' TheData: a numpy array of single time series data - can have missing data = TheMDI '''
    ''' TheMDI: a number used to identify missing data (not NaN) '''
    ''' TheConfRange: a number between 0 and 1 for the desired confidence interval e.g. 0.9 for 90th pct CI '''
    ''' TheSlope[0]: Outputs the slope at a rate of unit per time step '''
    ''' TheSlope[1:3]: Outputs the 5th and 95th pctile standard error confidence intervals
        (90pct confidence intervals) around the slope corrected for AR(1) correlation '''
    ''' TheSlope[3]: Outputs the 1 sigma standard error '''
    ''' TheSlope[4]: Outputs the +/- Confidence Interval for the given p-value '''
    ''' TheSlope[5]: Outputs the AR(1) correlation of regression residuals '''
    ''' TheSlope[6]: Outputs the effective degrees of freedom '''
    ''' TheSlope[7]: Outputs the p-value of the trend using two-sided students t test - can we reject H0 of no trend '''
    ''' Santer et al., 2008 - methodology '''
    ''' If Lowee and/or Highee are set they will come out as changed values '''
    ''' This is intended to be identical to Alexey Kaplan's IDL code which is almost identical to Santer et al '''
    ''' I have tested this with TESTDATA/data4S2008comp.dat and the IDL code and get identical values '''
    ''' The Kaplan code invokes a different method for missing data: '''
    ''' Data are compressed to only non-missing and then processed '''
    ''' I will test both running my method with missing data and compressing then running '''
    ''' The Kaplan method adds two caveats which I will add here: '''
    ''' If autocorrelation is -ve then set to 0 '''
    ''' If the effective Deg of Freedom < 3 then Inf or NaN are returned '''
    ''' Something else about the regression residuals using indices but I don't understand '''

    # Check the desired confidence range is between 0-1
    if ((TheConfRange < 0.) | (TheConfRange > 1.)):
        raise Exception("invalid confidence range - should be between 0 and 1")

    # Set up empty list for returning the output values
    # - slope per unit time,
    # - lower (10th pct) CI,
    # - upper (90th pct) CI
    # - 1 sigma SE
    # - The +/- confidence interval for the given p value
    # - AR(1) correlation in the residuals
    # - the effective degrees of freedom
    # - the p-value for the trend
    TheSlope = np.array([0., 0., 0., 0., 0., 0., 0., 0.])

    # Convert the data to a pandas dataframe?
    # First set any missing values to NaNs
    TheDataNANs = np.copy(TheData)  # copying just to be safe
    gots = np.where(TheDataNANs == TheMDI)

    # ADD A CATCH FOR No. Data points < 3 as in KAPLAN
    # (actually I'm setting it to 50% missing!!!)
    if ((float(len(gots[0])) / float(len(TheData))) > 0.5):
        TheSlope[0:8] = TheMDI
        print('Fewer than 50% valid data points')
        # pdb.set_trace()
        return TheSlope  # Tested

    if (len(gots[0]) > 0):
        TheDataNANs[np.where(TheDataNANs == TheMDI)] = float('NaN')

    DataDF = pd.DataFrame(TheDataNANs, index=np.arange(len(TheDataNANs)),
                          columns=['variable'], dtype=float, copy=True)
    # This needs to be a copy otherwise changes to TheData lead to changes
    # to DataDF.variable[]

    # As we are using formula= then we don't need to specify a column of 1s
    # and an intercept is calculated
    olsmod = smf.ols(formula='variable ~ np.arange(len(TheDataNANs))',
                     data=DataDF, missing='drop')  # drop the NaNs
    olsres = olsmod.fit()
    # olsmod.summary() prints all
    # olsmod.params prints the slopes for each item 0 = intercept, 1 = variable
    # olsmod.predict() prints the predicted values using the model fit,
    # including intercept
    # olsmod.bse prints the standard errors (1 sigma using n-2 degrees of
    # freedom) 0 = intercept, 1 = variable
    TheSlope[0] = olsres.params[1]
    # print('Decadal Trend: ',np.round(TheSlope[0]*120,4))
    # pdb.set_trace()

    # Now get the original 1 sigma standard error which uses n-2 degrees of
    # freedom and then calculate the corrected one
    # First we need to use masked arrays to make sure we account for missing data
    TheDataMSK = np.ma.masked_equal(TheData, TheMDI)
    # better hope that this captures them all and nothing silly with floats

    # Now get the time series of regression residuals for each data point
    # First create a masked array of missing data
    TheResids = np.ma.masked_equal(np.repeat(TheMDI, len(TheDataMSK)), TheMDI)
    # Get a pointer array to the non-missing data but need to test if there
    # are any missing first
    if (np.ma.count(TheDataMSK) == len(TheData)):
        print('no missing')
        # then we do not need to faff with the missing data
        # Subtract the predicted values for each time point from the actual values
        TheResids = TheDataMSK - olsres.predict()
        MaskofPoints = np.arange(len(TheData))
    else:
        # we do need to faff with the missing data
        print('got a missing')
        MaskofPoints = np.where(np.ma.getmask(TheDataMSK) == False)
        # Subtract the predicted values for each time point from the actual
        # values; if there are missing values then these won't be predicted
        # so we need to fill back in
        TheResids[MaskofPoints] = TheDataMSK[MaskofPoints] - olsres.predict()

    # We need the AR(1) values of the regression residuals and should
    # probably test to make sure the data are autocorrelated
    # Using np.ma.corrcoef works even if all data are present
    # This ignores missing data points so isn't ideal - better to take the
    # longest continuous period of data?
    Lag1AR = np.ma.corrcoef(TheResids[0:-1], TheResids[1:])[0][1]
    TheSlope[5] = Lag1AR
    # print('Autocorrelation at lag 1: ',np.round(Lag1AR,4))
    # pdb.set_trace()

    # ADD A CATCH FOR NEGATIVE AR(1) - as in Kaplan
    # ERROR - If AR(1) is negative then it should be given a value of 0 so
    # it has no effect on reducing deg of freedom
    # previously I had set the trends to MDI - IDIOT!!!
    if (Lag1AR < 0.):
        Lag1AR = 0.
        # TheSlope[0:5] = TheMDI
        # TheSlope[6] = TheMDI
        # print('Negative AR(1)')
        # return TheSlope # Tested

    # This is the original number of samples of time NOT INCLUDING MISSING
    # DATA POINTS
    nORIG = np.ma.count(TheDataMSK)  # number of data points

    # Now get the effective number of samples dependent on the degree of
    # autocorrelation at lag 1
    nEFF = nORIG * ((1 - Lag1AR) / (1 + Lag1AR))
    TheSlope[6] = nEFF
    # print('Original no. time points: ',nORIG)
    # print('Effective no. time points: ',np.round(nEFF,4))
    # pdb.set_trace()

    # ADD A CATCH FOR nEFF < 3 as in KAPLAN
    if (nEFF < 3):
        TheSlope[1:6] = TheMDI
        TheSlope[7] = TheMDI
        # print('Fewer than 3 effective degrees of freedom: ',nEFF)
        return TheSlope  # Tested

    # Now get the variance of the regression residuals s_e^2
    s_eSQ = (1 / (nEFF - 2)) * np.ma.sum(TheResids**2)
    # and just for comparison get it for the original number of samples
    s_eSQORIG = (1 / (nORIG - 2)) * np.ma.sum(TheResids**2)
    # print('Decade Original variance of regression residuals: ',np.round(s_eSQORIG*120,4))
    # print('Decade Effective variance of regression residuals: ',np.round(s_eSQ*120,4))
    # pdb.set_trace()

    # Now calculate the 1 sigma standard error
    s_1sig = (s_eSQ / np.sum((MaskofPoints - np.mean(MaskofPoints))**2))**0.5
    # and just for comparison get it for the original number of samples
    s_1sigORIG = (s_eSQORIG / np.sum(
        (MaskofPoints - np.mean(MaskofPoints))**2))**0.5
    TheSlope[3] = s_1sig
    # print('Decade Original 1 sigma standard error: ',np.round(s_1sigORIG*120,4))
    # print('Decade Effective 1 sigma standard error: ',np.round(s_1sig*120,4))
    # pdb.set_trace()

    # Now calculate the p-value to test whether the H0 (no trend) is
    # rejected (if p-value < 0.05)
    t_students_2tail = TheSlope[0] / s_1sig
    integration_lev = (nEFF - 2.0) / ((nEFF - 2.0) + t_students_2tail**2.)
    TheSlope[7] = betainc((nEFF - 2.0) / 2., 0.5, integration_lev)
    # pdb.set_trace()

    # Now find the 90th percentile confidence intervals by integrating the
    # area under the assumed curve and populate TheSlope array with the
    # lower and upper bound
    # I INCORRECTLY assumed that this is slope - 2*s_1sig and slope +
    # 2*s_1sig - this would actually be 95pct confidence intervals
    # approximately
    # This uses the inverse of students t CDF (quantile function) using the
    # bisection method of the incomplete beta function (scipy.special.betainc)
    # When later the slope may be multiplied to get decadal trend the
    # standard errors should be multiplied likewise
    ConfInt = CI_tINV(s_1sig, TheConfRange, nEFF)
    TheSlope[4] = ConfInt
    # print('Confidence interval for the p-value ', ThePvalue,' :',np.round(ConfInt*120,4))
    TheSlope[1] = TheSlope[0] - ConfInt
    TheSlope[2] = TheSlope[0] + ConfInt
    # print('Decade AR(1) corrected 90th pct standard error confidence intervals: ',np.round(TheSlope[1]*120,4),np.round(TheSlope[2]*120,4))
    # pdb.set_trace()

    return TheSlope


# ReadData
def _testBetaInc(self, dtype):
    try:
        from scipy import special  # pylint: disable=g-import-not-at-top
        np_dt = dtype.as_numpy_dtype

        # Test random values
        a_s = np.abs(np.random.randn(10, 10) * 30).astype(np_dt)  # in (0, infty)
        b_s = np.abs(np.random.randn(10, 10) * 30).astype(np_dt)  # in (0, infty)
        x_s = np.random.rand(10, 10).astype(np_dt)  # in (0, 1)
        with self.test_session(use_gpu=self.use_gpu):
            tf_a_s = tf.constant(a_s, dtype=dtype)
            tf_b_s = tf.constant(b_s, dtype=dtype)
            tf_x_s = tf.constant(x_s, dtype=dtype)
            tf_out = tf.betainc(tf_a_s, tf_b_s, tf_x_s).eval()
            scipy_out = special.betainc(a_s, b_s, x_s).astype(np_dt)
            # the scipy version of betainc uses a double-only implementation.
            # TODO(ebrevdo): identify reasons for (sometime) precision loss
            # with doubles
            tol = 1e-4 if dtype == tf.float32 else 5e-5
            self.assertAllCloseAccordingToType(scipy_out, tf_out,
                                               rtol=tol, atol=tol)

        # Test out-of-range values (most should return nan output)
        combinations = list(itertools.product([-1, 0, 0.5, 1.0, 1.5], repeat=3))
        a_comb, b_comb, x_comb = np.asarray(list(zip(*combinations)),
                                            dtype=np_dt)
        with self.test_session(use_gpu=self.use_gpu):
            tf_comb = tf.betainc(a_comb, b_comb, x_comb).eval()
            scipy_comb = special.betainc(a_comb, b_comb, x_comb).astype(np_dt)
            self.assertAllCloseAccordingToType(scipy_comb, tf_comb)

        # Test broadcasting between scalars and other shapes
        with self.test_session(use_gpu=self.use_gpu):
            self.assertAllCloseAccordingToType(
                special.betainc(0.1, b_s, x_s).astype(np_dt),
                tf.betainc(0.1, b_s, x_s).eval(), rtol=tol, atol=tol)
            self.assertAllCloseAccordingToType(
                special.betainc(a_s, 0.1, x_s).astype(np_dt),
                tf.betainc(a_s, 0.1, x_s).eval(), rtol=tol, atol=tol)
            self.assertAllCloseAccordingToType(
                special.betainc(a_s, b_s, 0.1).astype(np_dt),
                tf.betainc(a_s, b_s, 0.1).eval(), rtol=tol, atol=tol)
            self.assertAllCloseAccordingToType(
                special.betainc(0.1, b_s, 0.1).astype(np_dt),
                tf.betainc(0.1, b_s, 0.1).eval(), rtol=tol, atol=tol)
            self.assertAllCloseAccordingToType(
                special.betainc(0.1, 0.1, 0.1).astype(np_dt),
                tf.betainc(0.1, 0.1, 0.1).eval(), rtol=tol, atol=tol)

        with self.assertRaisesRegexp(ValueError, "Shapes .* are not compatible"):
            tf.betainc(0.5, [0.5], [[0.5]])

        with self.test_session(use_gpu=self.use_gpu):
            with self.assertRaisesOpError("Shapes of .* are inconsistent"):
                a_p = tf.placeholder(dtype)
                b_p = tf.placeholder(dtype)
                x_p = tf.placeholder(dtype)
                tf.betainc(a_p, b_p, x_p).eval(
                    feed_dict={a_p: 0.5, b_p: [0.5], x_p: [[0.5]]})

    except ImportError as e:
        tf.logging.warn("Cannot test special functions: %s" % str(e))
def f(x): return betainc(a, b, x)
def cdf(aa, bb, x): return sp.betainc(aa, bb, x) * sp.beta(aa, bb)
def betainc(x, a, b): return sc_special.betainc(a, b, x)
def qso_engine(time, data, error, ltau=3., lvar=-1.7, sys_err=0.,
               return_model=False):
    """Calculates the fit quality of a damped random walk to a qso lightcurve.

    The formalism is from Rybicki & Press (1994; arXiv:comp-gas/9405004)

    Data are modelled with a covariance function
        Lij = 0.5*var*tau*exp(-|time_i-time_j|/tau) .

    Input:
        time - measurement times, typically days
        data - measured magnitudes
        error - uncertainty in measured magnitudes

    Output (dictionary):
        chi2/nu - classical variability measure
        chi2_qso/nu - for goodness of fit given fixed parameters
        chi2_qso/nu_extra - for parameter fitting, add to chi2/nu
        chi^2/nu_NULL - expected chi2/nu for non-qso variable
        signif_qso - significance chi^2/nu<chi^2/nu_NULL (rule out false alarm)
        signif_not_qso - significance chi^2/nu>1 (rule out qso)
        signif_vary - significance that source is variable
        class - resulting source type (ambiguous, not_qso, qso)
        model - time series prediction for each datum given all others
                (iff return_model==True)
        dmodel - model uncertainty, including uncertainty in data

    Notes:
        T = L^(-1)
        Data variance is D
        Full covariance C^(-1) = (L+D)^(-1) = T [T+D^(-1)]^(-1) D^(-1)
        Code takes advantage of the tridiagonality of T and T+D^(-1).
    """
    out_dict = {}
    out_dict['chi2_qso/nu'] = 999
    out_dict['chi2_qso/nu_extra'] = 0.
    out_dict['signif_qso'] = 0.
    out_dict['signif_not_qso'] = 0.
    out_dict['signif_vary'] = 0.
    out_dict['chi2_qso/nu_NULL'] = 0.
    out_dict['chi2/nu'] = 0.
    out_dict['nu'] = 0
    out_dict['model'] = []
    out_dict['dmodel'] = []
    out_dict['class'] = 'ambiguous'

    lvar0 = np.log10(0.5) + lvar + ltau

    ln = len(data)
    dt = abs(time[1:] - time[:-1])

    # first make sure all dt>0
    g = np.where(dt > 0.)[0]
    lg = len(g)
    # must have at least 2 data points
    if lg <= 0:
        return out_dict

    if return_model:
        model = 1. * data
        dmodel = -1. * error

    if lg < ln:
        dt = dt[g]
        gg = np.zeros(lg + 1, dtype='int64')
        gg[1:] = g + 1
        dat = data[gg]
        wt = 1. / (sys_err**2 + error[gg]**2)
        ln = lg + 1
    else:
        dat = 1. * data
        wt = 1. / (sys_err**2 + error**2)

    out_dict['nu'] = ln - 1.
    varx = np.var(dat)
    dat0 = (dat * wt).sum() / wt.sum()
    out_dict['chi2/nu'] = ((dat - dat0)**2 * wt).sum() / out_dict['nu']

    # define tridiagonal matrix T = L^(-1)
    # sparse matrix form: ab[u + i - j, j] == a[i,j] i<=j, (here u=1)
    T = np.zeros((2, ln), dtype='float64')
    arg = dt * np.exp(-np.log(10) * ltau)
    ri = np.exp(-arg)
    ei = 1. / (1. / ri - ri)
    T[0, 1:] = -ei
    T[1, :-1] = 1. + ri * ei
    T[1, 1:] += ri * ei
    T[1, ln - 1] += 1.
    T0 = np.median(T[1, :])
    T /= T0

    # equation for chi2_qso is [ (dat-x0) T Tp^(-1) D^(-1) (dat-x0) ] ,
    # where Tp=T+D^(-1) and D^(-1)=wt
    fac = np.exp(np.log(10) * lvar0) / T0
    Tp = 1. * T
    Tp[1, :] += wt * fac

    # solve Tp*z=y for z (y=wt*dat)
    # This works for scipy __version__>='0.9.0' on anathem (20120809)
    b1 = (wt * dat).reshape((1, ln))
    b2 = b1.T
    # (Tpc,z) = solveh_banded(Tp,b2)
    z = solveh_banded(Tp, b2)
    Tpc = cholesky_banded(Tp)
    # the solveh_banded() function used to return the cholesky matrix,
    # now we get it separately
    z = z.T
    z = z[0, :]

    c1 = wt.reshape((1, ln))
    c2 = c1.T
    # (Tpc,z0) = solveh_banded(Tp,c2)
    z0 = solveh_banded(Tp, c2)
    # HAS NOT CHANGED# Tpc2 = cholesky_banded(Tp)
    z0 = z0.T
    z0 = z0[0, :]

    # finally, get u=T*z
    u = T[1, :] * z
    u[1:] += T[0, 1:] * z[:-1]
    u[:-1] += T[0, 1:] * z[1:]
    u0 = T[1, :] * z0
    u0[1:] += T[0, 1:] * z0[:-1]
    u0[:-1] += T[0, 1:] * z0[1:]

    # magnitude offset x0, error = 1./sqrt(u0sum)
    u0sum = u0.sum()
    x0 = u.sum() / u0sum

    # fit statistic
    out_dict['chi2_qso/nu'] = np.dot(dat - x0, u - u0 * x0) / out_dict['nu']

    # -2*log(likelihood) = chi2_qso + ldet_C + log(u0sum)
    # first term: use chi2_qso/nu for goodness of fit with fixed parameters;
    # all terms: use chi2_qso/nu + chi2_qso/nu_extra for fitting with
    # variable parameters

    # get log of determinant for use later
    Tc = cholesky_banded(T)
    ldet_Tp = 2 * np.log(Tpc[1, :]).sum()
    ldet_T = 2 * np.log(Tc[1, :]).sum()
    ldet_C = ldet_Tp - ldet_T - np.log(wt).sum()
    out_dict['chi2_qso/nu_extra'] = (ldet_C + np.log(u0sum)) / out_dict['nu']

    # get trace of C^(-1) for significance calculation
    Tpm = chol_inverse_diag(Tpc)
    diagC = T[1, :] * wt * Tpm[1, :]
    diagC[:-1] += T[0, 1:] * wt[0:-1] * Tpm[0, 1:]
    diagC[1:] += T[0, 1:] * wt[1:] * Tpm[0, 1:]
    TrC = diagC.sum()

    # significance in sigma units (large means false alarm unlikely)
    # (expected value of chi2_qso under the NULL hypothesis is TrC*varx)
    out_dict['chi2_qso/nu_NULL'] = TrC * varx / out_dict['nu']
    a = ln / 2.
    x = (out_dict['chi2_qso/nu'] + 1.e-8) / (
        out_dict['chi2_qso/nu_NULL'] + out_dict['chi2_qso/nu'] + 1.e-8)
    prob = betainc(a, a, x)
    if prob <= 0:
        lprob = a * np.log(x) - np.log(a) + gammaln(2 * a) - 2 * gammaln(a)
    else:
        lprob = np.log(prob)
    out_dict['signif_qso'] = lprob2sigma(lprob)

    a = ln / 2.
    x = 1. / (1. + out_dict['chi2_qso/nu'])
    prob = betainc(a, a, x)
    if prob <= 0:
        lprob = a * np.log(x) - np.log(a) + gammaln(2 * a) - 2 * gammaln(a)
    else:
        lprob = np.log(prob)
    out_dict['signif_not_qso'] = lprob2sigma(lprob)

    x = out_dict['chi2/nu'] * out_dict['nu']
    prob = gammaincc(0.5 * out_dict['nu'], 0.5 * x)
    if prob <= 0:
        lprob = (0.5 * out_dict['nu'] - 1) * np.log(x) - 0.5 * x - \
            0.5 * out_dict['nu'] * np.log(2) - gammaln(0.5 * out_dict['nu'])
    else:
        lprob = np.log(prob)
    out_dict['signif_vary'] = lprob2sigma(lprob)

    if out_dict['signif_vary'] > 3:
        if out_dict['signif_qso'] > 3:
            out_dict['class'] = 'qso'
        elif out_dict['signif_not_qso'] > 3:
            out_dict['class'] = 'not_qso'

    # best-fit model for the lightcurve
    if return_model:
        model[gg] = dat - (u - u0 * x0) / diagC
        dmodel[gg] = 1. / np.sqrt(diagC)
        out_dict['model'] = model
        out_dict['dmodel'] = dmodel

    return out_dict
def f(x):
    y = n / (x**2 + n)
    if x > 0:
        return 0.5 * special.betainc(n / 2, 0.5, y)
    else:
        return 1.0 - 0.5 * special.betainc(n / 2, 0.5, y)
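# Sketch check (assumes the surrounding scope provides the degrees of
# freedom n, as in the snippet): f above equals the Student-t survival
# function, so it should match scipy.stats.t.sf(x, df=n).
import numpy as np
from scipy import stats

n = 7  # hypothetical value for the enclosing-scope variable
print(np.allclose([f(v) for v in (-2.0, 0.5, 3.0)],
                  stats.t.sf([-2.0, 0.5, 3.0], df=n)))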
def _cdf(self, x, n, pr):
    k = floor(x)
    vals = (special.betainc(n, k + 1, pr) - special.betainc(n, 1, pr))
    return vals / (1. - pr ** n)
def Binc(a, b, x): return betainc(a, b, x) * gamma(a) * gamma(b) / gamma(a + b)
def cdf(self, x):
    """Evaluates the CDF along the values ``x``."""
    y = 0.5 * betainc(self.m / 2.0, 0.5, np.sin(np.pi * x)**2)
    return np.where(x < 0.5, y, 1 - y)
def ccprmod(supports, idx_correct_label, B=20):
    """Python implementation of the ccprmod.m (Classifier competence based
    on probabilistic modelling) function. Matlab code is available at:
    http://www.mathworks.com/matlabcentral/mlc-downloads/downloads/submissions/28391/versions/6/previews/ccprmod.m/index.html

    Parameters
    ----------
    supports: array of shape = [n_samples, n_classes]
              containing the supports obtained by the base classifier for
              each class.

    idx_correct_label: array of shape = [n_samples]
                       containing the index of the correct class.

    B : int (Default = 20)
        number of points used in the calculation of the competence, higher
        values result in a more accurate estimation.

    Returns
    -------
    C_src : array of shape = [n_samples]
            representing the classifier competences at each data point

    Examples
    --------
    >>> supports = [[0.3, 0.6, 0.1],[1.0/3, 1.0/3, 1.0/3]]
    >>> idx_correct_label = [1,0]
    >>> ccprmod(supports,idx_correct_label)
    ans = [0.784953394056843, 0.332872292262951]

    References
    ----------
    T. Woloszynski, M. Kurzynski, A probabilistic model of classifier
    competence for dynamic ensemble selection, Pattern Recognition 44 (2011)
    2656–2668.
    """
    if not isinstance(B, int):
        raise TypeError(
            'Parameter B should be an integer. Currently B is {0}'.format(
                type(B)))

    if B <= 0 or B is None:
        raise ValueError(
            'The parameter B should be higher than 0. Currently B is {0}'.
            format(B))

    supports = np.asarray(supports)
    idx_correct_label = np.array(idx_correct_label)
    supports[supports > 1] = 1

    N, C = supports.shape
    x = np.linspace(0, 1, B)
    x = np.matlib.repmat(x, N, C)

    a = npm.zeros(x.shape)

    for c in range(C):
        a[:, c * B:(c + 1) * B] = C * supports[:, c:c + 1]

    b = C - a

    # For extreme cases, with a or b equal to 0, add a small constant:
    eps = 1e-20
    a[a == 0] = eps
    b[b == 0] = eps
    betaincj = betainc(a, b, x)

    C_src = np.zeros(N)
    for n in range(N):
        t = range((idx_correct_label[n]) * B, (idx_correct_label[n] + 1) * B)
        bc = betaincj[n, t]
        bi = betaincj[n, list(set(range(0, (C * B))) - set(t))]
        bi = npm.transpose(npm.reshape(bi, (B, C - 1), order='F'))
        C_src[n] = np.sum(
            np.multiply((bc[0, 1:] - bc[0, 0:-1]),
                        np.prod((bi[:, 0:-1] + bi[:, 1:]) / 2, 0)))

    return C_src
def process(alfa, beta, cov_margin, keep_all_edited, line):
    """
    Calculates, for a single line in a VCF formatted file, the confidence
    score based on depth of coverage and edit fraction %.
    :param line: string
        single vcf formatted line.
    :return confidence: float
        confidence value of the line
    :return return_string: basestring
        full vcf formatted line with confidence
    """
    (chr, pos, dot, ref, alt, qual, filter, info,
     format, cond) = line.split("\t")[:10]
    if chr[0] == "#":
        print line,
        return

    # retrieve total number of reads mapping to position
    infos = info.split(";")
    (dp, i16) = infos[:2]
    assert dp[:2] == "DP"
    num_reads = int(dp[3:])

    """
    # retrieve numbers of A's and G's on forward and reverse strand
    assert i16[:3] == "I16", i16
    (a_fwd, a_rev, g_fwd, g_rev) = (int(x) for x in i16[4:].split(",")[:4])
    print("warning: i16 not available")
    """
    dp4 = re.findall("DP4\=([\d\,]+)", info)[0]
    (a_fwd, a_rev, g_fwd, g_rev) = (int(x) for x in dp4.split(","))

    a = a_fwd + a_rev
    g = g_fwd + g_rev
    num_reads = a + g
    edit_frac = g / float(num_reads)

    # calc smoothed counts and confidence
    G = g + alfa
    A = a + beta
    theta = G / float(G + A)

    ######## MOST IMPORTANT LINE ########
    # calculates the confidence of theta as
    # P( theta < cov_margin | A, G) ~ Beta_theta(G, A)
    confidence = 1 - betainc(G, A, cov_margin)

    # keep 100% edited sites or toss
    if A == 0 and not keep_all_edited:
        confidence = 0
        region = 'POSSIBLE_SNP'
    else:
        region = 'PASS'

    # print line in CONF format
    return_string = ("\t".join([chr, pos, str(num_reads), ref, alt, ""]) +
                     "\t".join(str(round(y, 9)) for y in [
                         confidence, theta, edit_frac
                     ]) +
                     "\t".join(["", region, info, format, cond]) + "\n")
    return return_string
def qso_engine(time, data, error, ltau=3., lvar=-1.7, sys_err=0.,
               return_model=False):
    """Calculates the fit quality of a damped random walk to a qso lightcurve.
    Written by N. Butler ([email protected]), Feb. 2010.
    Version 1.0

    The formalism is from Rybicki & Press (1994; arXiv:comp-gas/9405004)

    Data are modelled with a covariance function
        Lij = 0.5*var*tau*exp(-|time_i-time_j|/tau) .

    Input:
        time - measurement times, typically days
        data - measured magnitudes
        error - uncertainty in measured magnitudes

    Output (dictionary):
        chi2/nu - classical variability measure
        chi2_qso/nu - for goodness of fit given fixed parameters
        chi2_qso/nu_extra - for parameter fitting, add to chi2/nu
        chi^2/nu_NULL - expected chi2/nu for non-qso variable
        signif_qso - significance chi^2/nu<chi^2/nu_NULL (rule out false alarm)
        signif_not_qso - significance chi^2/nu>1 (rule out qso)
        signif_vary - significance that source is variable
        class - resulting source type (ambiguous, not_qso, qso)
        model - time series prediction for each datum given all others
                (iff return_model==True)
        dmodel - model uncertainty, including uncertainty in data

    Notes:
        T = L^(-1)
        Data variance is D
        Full covariance C^(-1) = (L+D)^(-1) = T [T+D^(-1)]^(-1) D^(-1)
        Code takes advantage of the tridiagonality of T and T+D^(-1)."""

    out_dict = {}
    out_dict['chi2_qso/nu'] = 999
    out_dict['chi2_qso/nu_extra'] = 0.
    out_dict['signif_qso'] = 0.
    out_dict['signif_not_qso'] = 0.
    out_dict['signif_vary'] = 0.
    out_dict['chi2_qso/nu_NULL'] = 0.
    out_dict['chi2/nu'] = 0.
    out_dict['nu'] = 0
    out_dict['model'] = []
    out_dict['dmodel'] = []
    out_dict['class'] = 'ambiguous'

    lvar0 = log10(0.5) + lvar + ltau

    ln = len(data)
    dt = abs(time[1:] - time[:-1])

    # first make sure all dt>0
    g = where(dt > 0.)[0]
    lg = len(g)
    # must have at least 2 data points
    if (lg <= 0):
        return out_dict

    if (return_model):
        model = 1. * data
        dmodel = -1. * error

    if (lg < ln):
        dt = dt[g]
        gg = zeros(lg + 1, dtype='int64')
        gg[1:] = g + 1
        dat = data[gg]
        wt = 1. / (sys_err**2 + error[gg]**2)
        ln = lg + 1
    else:
        dat = 1. * data
        wt = 1. / (sys_err**2 + error**2)

    out_dict['nu'] = ln - 1.
    varx = var(dat)
    dat0 = (dat * wt).sum() / wt.sum()
    out_dict['chi2/nu'] = ((dat - dat0)**2 * wt).sum() / out_dict['nu']

    # define tridiagonal matrix T = L^(-1)
    # sparse matrix form: ab[u + i - j, j] == a[i,j] i<=j, (here u=1)
    T = zeros((2, ln), dtype='float64')
    arg = dt * exp(-log(10) * ltau)
    ri = exp(-arg)
    ei = ri / (1. - ri) / (1 + ri)
    T[0, 1:] = -ei
    T[1, :-1] = 1. + ri * ei
    T[1, 1:] += ri * ei
    T[1, ln - 1] += 1.
    T0 = median(T[1, :])
    T /= T0

    # equation for chi2_qso is [ (dat-x0) T Tp^(-1) D^(-1) (dat-x0) ] ,
    # where Tp=T+D^(-1) and D^(-1)=wt
    fac = exp(log(10) * lvar0) / T0
    Tp = 1. * T
    Tp[1, :] += wt * fac

    # solve Tp*z=y for z (y=wt*dat)
    b1 = (wt * dat).reshape((1, ln))
    z = transpose(solveh_banded(Tp, transpose(b1)))
    z = z[0, :]
    c1 = wt.reshape((1, ln))
    z0 = transpose(solveh_banded(Tp, transpose(c1)))
    z0 = z0[0, :]
    # original version which troubles solveh_banded:
    # (Tpc,z) = solveh_banded(Tp,(wt*dat).reshape((1,ln))); z = z[0,:]
    # (Tpc,z0) = solveh_banded(Tp,wt.reshape((1,ln))); z0 = z0[0,:]

    # finally, get u=T*z
    u = T[1, :] * z
    u[1:] += T[0, 1:] * z[:-1]
    u[:-1] += T[0, 1:] * z[1:]
    u0 = T[1, :] * z0
    u0[1:] += T[0, 1:] * z0[:-1]
    u0[:-1] += T[0, 1:] * z0[1:]

    # magnitude offset x0, error = 1./sqrt(u0sum)
    u0sum = u0.sum()
    x0 = u.sum() / u0sum

    # fit statistic
    out_dict['chi2_qso/nu'] = dot(dat - x0, u - u0 * x0) / out_dict['nu']

    # -2*log(likelihood) = chi2_qso + ldet_C + log(u0sum)
    # first term: use chi2_qso/nu for goodness of fit with fixed parameters;
    # all terms: use chi2_qso/nu + chi2_qso/nu_extra for fitting with
    # variable parameters

    # get log of determinant for use later
    Tc = cholesky_banded(T)
    Tpc = cholesky_banded(Tp)
    ldet_Tp = 2 * log(Tpc[1, :]).sum()
    ldet_T = 2 * log(Tc[1, :]).sum()
    ldet_C = ldet_Tp - ldet_T - log(wt).sum()
    out_dict['chi2_qso/nu_extra'] = (ldet_C + log(u0sum)) / out_dict['nu']

    # get trace of C^(-1) for significance calculation
    Tpm = chol_inverse_diag(Tpc)
    diagC = T[1, :] * wt * Tpm[1, :]
    diagC[:-1] += T[0, 1:] * wt[0:-1] * Tpm[0, 1:]
    diagC[1:] += T[0, 1:] * wt[1:] * Tpm[0, 1:]
    TrC = diagC.sum()

    # significance in sigma units (large means false alarm unlikely)
    # (expected value of chi2_qso under the NULL hypothesis is TrC*varx)
    out_dict['chi2_qso/nu_NULL'] = TrC * varx / out_dict['nu']
    a = ln / 2.
    x = (out_dict['chi2_qso/nu'] + 1.e-8) / (
        out_dict['chi2_qso/nu_NULL'] + out_dict['chi2_qso/nu'] + 1.e-8)
    prob = betainc(a, a, x)
    if (prob <= 0):
        lprob = a * log(x) - log(a) + gammaln(2 * a) - 2 * gammaln(a)
    else:
        lprob = log(prob)
    out_dict['signif_qso'] = lprob2sigma(lprob)

    a = ln / 2.
    x = 1. / (1. + out_dict['chi2_qso/nu'])
    prob = betainc(a, a, x)
    if (prob <= 0):
        lprob = a * log(x) - log(a) + gammaln(2 * a) - 2 * gammaln(a)
    else:
        lprob = log(prob)
    out_dict['signif_not_qso'] = lprob2sigma(lprob)

    x = out_dict['chi2/nu'] * out_dict['nu']
    prob = gammaincc(0.5 * out_dict['nu'], 0.5 * x)
    if (prob <= 0):
        lprob = (0.5 * out_dict['nu'] - 1) * log(x) - 0.5 * x - \
            0.5 * out_dict['nu'] * log(2) - gammaln(0.5 * out_dict['nu'])
    else:
        lprob = log(prob)
    out_dict['signif_vary'] = lprob2sigma(lprob)

    if (out_dict['signif_vary'] > 3):
        if (out_dict['signif_qso'] > 3):
            out_dict['class'] = 'qso'
        elif (out_dict['signif_not_qso'] > 3):
            out_dict['class'] = 'not_qso'

    # best-fit model for the lightcurve
    if (return_model):
        model[gg] = dat - (u - u0 * x0) / diagC
        dmodel[gg] = 1. / sqrt(diagC)
        out_dict['model'] = model
        out_dict['dmodel'] = dmodel

    return out_dict
def f1(k, n, p): return np.log1p(-special.betainc(k + 1, n, 1 - p))
def IB1_cdf(x, b, p, q):
    z = b / x
    return 1 - betainc(p, q, z)
def __call__(self, t): return betainc(self.in_curve, self.out_curve, t)
def betainc1(a, b, x):
    if a == 0:
        return 1
    return betainc(a, b, x)
def _cdf(self, x, n, p):
    k = floor(x)
    return special.betainc(n, k+1, p)
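# Identity sketch: the _cdf above matches the negative-binomial CDF,
# since nbinom.cdf(k; n, p) == betainc(n, k + 1, p).
import numpy as np
from scipy import special, stats

n, p, k = 5, 0.3, 7
print(np.isclose(special.betainc(n, k + 1, p), stats.nbinom.cdf(k, n, p)))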
"""
Visualizes the vector field in one 2D-plane of the phase-statemachine
state space
"""
from numpy import *
from matplotlib.pylab import *
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
from scipy.special import betainc

x0 = dot(linspace(0, 1, 500)[:, newaxis], ones((1, 500)))
x1 = x0.T

n = 25
x0diag = betainc(5, 5, linspace(-0.01, 1.01, n))
isolinevalues = linspace(0, 1, 10)


def f_proposed1(x0, x1, x2=0.001):
    """
    proposed function:

    Matrix X: X = x^nx1 . 1^1xn

    lambda = X @ X.T * 8 * (X*X + (X*X).T) / (X + X.T + 0.01)**4
    """
def f(x):
    z = a * x / (a * x + b)
    return 1.0 - special.betainc(a / 2.0, b / 2.0, z)
def cox_snell(x):
    return special.betainc(2 / 3., 2 / 3., x) * special.beta(2 / 3., 2 / 3.)
def ibeta(a, b, x): return special.betainc(a, b, x) * special.beta(a, b)
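# Quadrature sketch (not from the original source): ibeta converts scipy's
# regularized betainc into the unregularized incomplete beta B(x; a, b);
# cross-check by direct integration.
import numpy as np
from scipy import integrate, special

a, b, x = 1.2, 3.4, 0.6
num, _ = integrate.quad(lambda t: t**(a - 1) * (1 - t)**(b - 1), 0, x)
print(np.isclose(ibeta(a, b, x), num))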