def __call__(self, x):
    """Convert the values of `x` into CDF-based p-values under ``rdist``.

    `x` is arranged so that each row of the working array is one vector to
    evaluate: a 1-D input is treated as a single vector, a 2-D input is
    transposed when the sample dimension ``sd`` equals 0.  Every vector is
    scaled to unit L2 norm and passed through the CDF of
    ``scipy.stats.rdist(n - 1, 0, 1)``, ``n`` being the vector length.

    If ``self.fpp`` is not None, an adaptive step first estimates which
    entries belong to the null-distribution: the vector is histogrammed
    into ``self.nbins`` bins and, as long as the empirical mass found in
    the theoretical two-sided ``fpp`` tails exceeds ``fpp``, the entries
    lying at or beyond the outermost bin centers are dropped, the rest is
    re-normalized and the distribution is re-parametrized for the reduced
    length.  If the tail mass grows instead of shrinking, the state of the
    previous iteration is restored.  The resulting distribution is then
    applied to the complete normalized vector.

    Parameters
    ----------
    x : array-like
      One- or two-dimensional data.  It is not modified: normalization
      works on a copy (this also allows integer input).

    Returns
    -------
    ndarray
      CDF values.  Same shape as `x` for 2-D input, ``(len(x), 1)`` for
      1-D input.

    Raises
    ------
    NotImplementedError
      If `x` has more than two dimensions.
    """
    # NOTE(review): the text this was restored from contained a damaged
    # statement "from import scipy" (module path missing).  Only ``stats``
    # is used below; if that line imported a project-specific scipy shim,
    # restore it here.
    import scipy.stats as stats

    # some local bindings
    distribution = self.distribution
    # NOTE(review): the damaged text read "sd = fpp = self.fpp", which would
    # bind the sample dimension to the fpp value.  ``self.sd`` is assumed,
    # since `sd` is compared against 0 and reported via "sd=%d" below --
    # confirm against the enclosing class.
    sd = self.sd
    fpp = self.fpp
    nbins = self.nbins

    x = np.asanyarray(x)
    ndims = len(x.shape)

    # (very) old numpy had different format of returned bins --
    # there were not edges but center points.  We care here about
    # center points, so we will transform edge points into center
    # points for newer versions of numpy
    numpy_center_points = externals.versions['numpy'] < (1, 1)

    # XXX May be just utilize OverAxis transformer?
    if ndims > 2:
        raise NotImplementedError(
            "TODO: add support for more than 2 dimensions")
    elif ndims == 1:
        x, sd = x[:, np.newaxis], 0

    # lets transpose for convenience
    if sd == 0:
        x = x.T

    # Output p-values of x in null-distribution
    pvalues = np.zeros(x.shape)
    nulldist_number, positives_recovered = [], []

    # finally go through all data
    nd = x.shape[1]
    if __debug__:
        if nd < x.shape[0]:
            warning("Number of features in DistPValue lower than number of"
                    " items -- may be incorrect sd=%d was provided" % sd)
    for i, xx in enumerate(x):
        dist = stats.rdist(nd - 1, 0, 1)
        # Normalize a copy: an in-place ``/=`` would write through the view
        # into the caller's array and fails for integer dtypes.
        xx = xx / np.linalg.norm(xx)

        if fpp is not None:
            if __debug__:
                debug('TRAN_', "starting adaptive adjustment i=%d" % i)

            # Adaptive adjustment for false negatives:
            Nxx, xxx, pN_emp_prev = len(xx), xx, 1.0
            Nxxx = Nxx
            # What features belong to Null-distribution
            indexes = np.arange(Nxx)
            while True:
                # plain counts; the `normed` keyword is gone from current
                # numpy and its False value was the default anyway
                hist, bins = np.histogram(xxx, bins=nbins)
                pdf = hist.astype(float) / Nxxx
                if not numpy_center_points:
                    # if we obtain edge points for bins -- take centers
                    bins = 0.5 * (bins[0:-1] + bins[1:])

                # theoretical CDF at the bin centers
                dist_cdf = dist.cdf(bins)

                # what bins fall into theoretical 'positives' in both tails
                p = (0.5 - np.abs(dist_cdf - 0.5) < fpp / 2.0)

                # amount in empirical tails -- if we match theoretical, we
                # should have total >= p
                pN_emp = np.sum(pdf[p])
                if __debug__:
                    debug('TRAN_',
                          "empirical p=%.3f for theoretical p=%.3f"
                          % (pN_emp, fpp))

                if pN_emp <= fpp:
                    # we are done
                    break

                if pN_emp > pN_emp_prev:
                    if __debug__:
                        debug('TRAN_',
                              "Diverging -- thus keeping last result "
                              "with p=%.3f" % pN_emp_prev)
                    # we better restore previous result; the *_prev names
                    # exist here because pN_emp cannot exceed the initial
                    # pN_emp_prev of 1.0 on the first pass
                    indexes, xxx, dist = indexes_prev, xxx_prev, dist_prev
                    break

                pN_emp_prev = pN_emp
                # very silly way for now -- just proceed by 1 bin
                keep = np.logical_and(xxx > bins[0], xxx < bins[-1])
                if __debug__:
                    debug('TRAN_', "Keeping %d out of %d elements"
                          % (np.sum(keep), Nxxx))

                # Preserve them if we need to "roll back"
                indexes_prev, xxx_prev, dist_prev = indexes, xxx, dist

                # we should remove those which we assume to be positives and
                # which should not belong to Null-dist
                xxx, indexes = xxx[keep], indexes[keep]
                # L2 renorm it
                xxx = xxx / np.linalg.norm(xxx)
                Nxxx = len(xxx)
                dist = stats.rdist(Nxxx - 1, 0, 1)

            Nindexes = len(indexes)
            Nrecovered = Nxx - Nindexes

            nulldist_number.append(Nindexes)
            positives_recovered.append(Nrecovered)

            if __debug__:
                if distribution == 'rdist':
                    assert (dist.args[0] == Nindexes - 1)

                debug('TRAN',
                      "Positives recovery finished with %d out of %d "
                      "entries in Null-distribution, thus %d positives "
                      "were recovered" % (Nindexes, Nxx, Nrecovered))

        # And now we need to perform our duty -- assign p-values
        pvalues[i, :] = dist.cdf(xx)

    # XXX we might add an option to transform it to z-scores?
    result = pvalues

    # charge conditional attributes
    # XXX might want to populate them for non-adaptive handling as well
    # NOTE(review): the assignment targets were missing from the damaged
    # text ("= nulldist_number = positives_recovered"); ``self.ca.<name>``
    # is assumed -- confirm against the enclosing class.
    self.ca.nulldist_number = nulldist_number
    self.ca.positives_recovered = positives_recovered

    # transpose if needed
    if sd == 0:
        result = result.T

    return result
def all_dists():
    """Return frozen ``scipy.stats`` continuous distributions keyed by name.

    Every value is a frozen distribution created with ``loc=0.0`` and
    ``scale=1.0`` plus the shape parameters listed below.

    Returns
    -------
    dict
      Maps a distribution name (str) to its frozen ``scipy.stats`` object.
      The ``"gilbrat"`` key is kept for backward compatibility even when
      the installed SciPy only provides the distribution as ``gibrat``.
    """
    # Recent SciPy releases renamed the misspelled ``gilbrat`` to ``gibrat``
    # and dropped the old name; resolve whichever one is available so the
    # whole table does not fail with AttributeError.
    gibrat = getattr(stats, "gibrat", None)
    if gibrat is None:
        gibrat = stats.gilbrat

    # dists param were taken from scipy.stats official
    # documentation examples
    # Total - 89 (presumably counting the two disabled frechet_* entries)
    return {
        "alpha": stats.alpha(a=3.57, loc=0.0, scale=1.0),
        "anglit": stats.anglit(loc=0.0, scale=1.0),
        "arcsine": stats.arcsine(loc=0.0, scale=1.0),
        "beta": stats.beta(a=2.31, b=0.627, loc=0.0, scale=1.0),
        "betaprime": stats.betaprime(a=5, b=6, loc=0.0, scale=1.0),
        "bradford": stats.bradford(c=0.299, loc=0.0, scale=1.0),
        "burr": stats.burr(c=10.5, d=4.3, loc=0.0, scale=1.0),
        "cauchy": stats.cauchy(loc=0.0, scale=1.0),
        "chi": stats.chi(df=78, loc=0.0, scale=1.0),
        "chi2": stats.chi2(df=55, loc=0.0, scale=1.0),
        "cosine": stats.cosine(loc=0.0, scale=1.0),
        "dgamma": stats.dgamma(a=1.1, loc=0.0, scale=1.0),
        "dweibull": stats.dweibull(c=2.07, loc=0.0, scale=1.0),
        "erlang": stats.erlang(a=2, loc=0.0, scale=1.0),
        "expon": stats.expon(loc=0.0, scale=1.0),
        "exponnorm": stats.exponnorm(K=1.5, loc=0.0, scale=1.0),
        "exponweib": stats.exponweib(a=2.89, c=1.95, loc=0.0, scale=1.0),
        "exponpow": stats.exponpow(b=2.7, loc=0.0, scale=1.0),
        "f": stats.f(dfn=29, dfd=18, loc=0.0, scale=1.0),
        "fatiguelife": stats.fatiguelife(c=29, loc=0.0, scale=1.0),
        "fisk": stats.fisk(c=3.09, loc=0.0, scale=1.0),
        "foldcauchy": stats.foldcauchy(c=4.72, loc=0.0, scale=1.0),
        "foldnorm": stats.foldnorm(c=1.95, loc=0.0, scale=1.0),
        # "frechet_r": stats.frechet_r(c=1.89, loc=0.0, scale=1.0),
        # "frechet_l": stats.frechet_l(c=3.63, loc=0.0, scale=1.0),
        "genlogistic": stats.genlogistic(c=0.412, loc=0.0, scale=1.0),
        "genpareto": stats.genpareto(c=0.1, loc=0.0, scale=1.0),
        "gennorm": stats.gennorm(beta=1.3, loc=0.0, scale=1.0),
        "genexpon": stats.genexpon(a=9.13, b=16.2, c=3.28,
                                   loc=0.0, scale=1.0),
        "genextreme": stats.genextreme(c=-0.1, loc=0.0, scale=1.0),
        "gausshyper": stats.gausshyper(a=13.8, b=3.12, c=2.51, z=5.18,
                                       loc=0.0, scale=1.0),
        "gamma": stats.gamma(a=1.99, loc=0.0, scale=1.0),
        "gengamma": stats.gengamma(a=4.42, c=-3.12, loc=0.0, scale=1.0),
        "genhalflogistic": stats.genhalflogistic(c=0.773, loc=0.0,
                                                 scale=1.0),
        "gilbrat": gibrat(loc=0.0, scale=1.0),
        "gompertz": stats.gompertz(c=0.947, loc=0.0, scale=1.0),
        "gumbel_r": stats.gumbel_r(loc=0.0, scale=1.0),
        "gumbel_l": stats.gumbel_l(loc=0.0, scale=1.0),
        "halfcauchy": stats.halfcauchy(loc=0.0, scale=1.0),
        "halflogistic": stats.halflogistic(loc=0.0, scale=1.0),
        "halfnorm": stats.halfnorm(loc=0.0, scale=1.0),
        "halfgennorm": stats.halfgennorm(beta=0.675, loc=0.0, scale=1.0),
        "hypsecant": stats.hypsecant(loc=0.0, scale=1.0),
        "invgamma": stats.invgamma(a=4.07, loc=0.0, scale=1.0),
        "invgauss": stats.invgauss(mu=0.145, loc=0.0, scale=1.0),
        "invweibull": stats.invweibull(c=10.6, loc=0.0, scale=1.0),
        "johnsonsb": stats.johnsonsb(a=4.32, b=3.18, loc=0.0, scale=1.0),
        "johnsonsu": stats.johnsonsu(a=2.55, b=2.25, loc=0.0, scale=1.0),
        "ksone": stats.ksone(n=1e03, loc=0.0, scale=1.0),
        "kstwobign": stats.kstwobign(loc=0.0, scale=1.0),
        "laplace": stats.laplace(loc=0.0, scale=1.0),
        "levy": stats.levy(loc=0.0, scale=1.0),
        "levy_l": stats.levy_l(loc=0.0, scale=1.0),
        "levy_stable": stats.levy_stable(alpha=0.357, beta=-0.675,
                                         loc=0.0, scale=1.0),
        "logistic": stats.logistic(loc=0.0, scale=1.0),
        "loggamma": stats.loggamma(c=0.414, loc=0.0, scale=1.0),
        "loglaplace": stats.loglaplace(c=3.25, loc=0.0, scale=1.0),
        "lognorm": stats.lognorm(s=0.954, loc=0.0, scale=1.0),
        "lomax": stats.lomax(c=1.88, loc=0.0, scale=1.0),
        "maxwell": stats.maxwell(loc=0.0, scale=1.0),
        "mielke": stats.mielke(k=10.4, s=3.6, loc=0.0, scale=1.0),
        "nakagami": stats.nakagami(nu=4.97, loc=0.0, scale=1.0),
        "ncx2": stats.ncx2(df=21, nc=1.06, loc=0.0, scale=1.0),
        "ncf": stats.ncf(dfn=27, dfd=27, nc=0.416, loc=0.0, scale=1.0),
        "nct": stats.nct(df=14, nc=0.24, loc=0.0, scale=1.0),
        "norm": stats.norm(loc=0.0, scale=1.0),
        "pareto": stats.pareto(b=2.62, loc=0.0, scale=1.0),
        "pearson3": stats.pearson3(skew=0.1, loc=0.0, scale=1.0),
        "powerlaw": stats.powerlaw(a=1.66, loc=0.0, scale=1.0),
        "powerlognorm": stats.powerlognorm(c=2.14, s=0.446,
                                           loc=0.0, scale=1.0),
        "powernorm": stats.powernorm(c=4.45, loc=0.0, scale=1.0),
        "rdist": stats.rdist(c=0.9, loc=0.0, scale=1.0),
        "reciprocal": stats.reciprocal(a=0.00623, b=1.01,
                                       loc=0.0, scale=1.0),
        "rayleigh": stats.rayleigh(loc=0.0, scale=1.0),
        "rice": stats.rice(b=0.775, loc=0.0, scale=1.0),
        "recipinvgauss": stats.recipinvgauss(mu=0.63, loc=0.0, scale=1.0),
        "semicircular": stats.semicircular(loc=0.0, scale=1.0),
        "t": stats.t(df=2.74, loc=0.0, scale=1.0),
        "triang": stats.triang(c=0.158, loc=0.0, scale=1.0),
        "truncexpon": stats.truncexpon(b=4.69, loc=0.0, scale=1.0),
        "truncnorm": stats.truncnorm(a=0.1, b=2, loc=0.0, scale=1.0),
        "tukeylambda": stats.tukeylambda(lam=3.13, loc=0.0, scale=1.0),
        "uniform": stats.uniform(loc=0.0, scale=1.0),
        "vonmises": stats.vonmises(kappa=3.99, loc=0.0, scale=1.0),
        "vonmises_line": stats.vonmises_line(kappa=3.99,
                                             loc=0.0, scale=1.0),
        "wald": stats.wald(loc=0.0, scale=1.0),
        "weibull_min": stats.weibull_min(c=1.79, loc=0.0, scale=1.0),
        "weibull_max": stats.weibull_max(c=2.87, loc=0.0, scale=1.0),
        "wrapcauchy": stats.wrapcauchy(c=0.0311, loc=0.0, scale=1.0),
    }
def __call__(self, x):
    """Convert the values of `x` into CDF-based p-values under ``rdist``.

    `x` is arranged so that each row of the working array is one vector to
    evaluate: a 1-D input is treated as a single vector, a 2-D input is
    transposed when the sample dimension ``sd`` equals 0.  Every vector is
    scaled to unit L2 norm and passed through the CDF of
    ``scipy.stats.rdist(n - 1, 0, 1)``, ``n`` being the vector length.

    If ``self.fpp`` is not None, an adaptive step first estimates which
    entries belong to the null-distribution: the vector is histogrammed
    into ``self.nbins`` bins and, as long as the empirical mass found in
    the theoretical two-sided ``fpp`` tails exceeds ``fpp``, the entries
    lying at or beyond the outermost bin centers are dropped, the rest is
    re-normalized and the distribution is re-parametrized for the reduced
    length.  If the tail mass grows instead of shrinking, the state of the
    previous iteration is restored.  The resulting distribution is then
    applied to the complete normalized vector.

    Parameters
    ----------
    x : array-like
      One- or two-dimensional data.  It is not modified: normalization
      works on a copy (this also allows integer input).

    Returns
    -------
    ndarray
      CDF values.  Same shape as `x` for 2-D input, ``(len(x), 1)`` for
      1-D input.

    Raises
    ------
    NotImplementedError
      If `x` has more than two dimensions.
    """
    # NOTE(review): the text this was restored from contained a damaged
    # statement "from import scipy" (module path missing).  Only ``stats``
    # is used below; if that line imported a project-specific scipy shim,
    # restore it here.
    import scipy.stats as stats

    # some local bindings
    distribution = self.distribution
    # NOTE(review): the damaged text read "sd = fpp = self.fpp", which would
    # bind the sample dimension to the fpp value.  ``self.sd`` is assumed,
    # since `sd` is compared against 0 and reported via "sd=%d" below --
    # confirm against the enclosing class.
    sd = self.sd
    fpp = self.fpp
    nbins = self.nbins

    x = np.asanyarray(x)
    ndims = len(x.shape)

    # (very) old numpy had different format of returned bins --
    # there were not edges but center points.  We care here about
    # center points, so we will transform edge points into center
    # points for newer versions of numpy
    numpy_center_points = externals.versions['numpy'] < (1, 1)

    # XXX May be just utilize OverAxis transformer?
    if ndims > 2:
        raise NotImplementedError(
            "TODO: add support for more than 2 dimensions")
    elif ndims == 1:
        x, sd = x[:, np.newaxis], 0

    # lets transpose for convenience
    if sd == 0:
        x = x.T

    # Output p-values of x in null-distribution
    pvalues = np.zeros(x.shape)
    nulldist_number, positives_recovered = [], []

    # finally go through all data
    nd = x.shape[1]
    if __debug__:
        if nd < x.shape[0]:
            warning("Number of features in DistPValue lower than number of"
                    " items -- may be incorrect sd=%d was provided" % sd)
    for i, xx in enumerate(x):
        dist = stats.rdist(nd - 1, 0, 1)
        # Normalize a copy: an in-place ``/=`` would write through the view
        # into the caller's array and fails for integer dtypes.
        xx = xx / np.linalg.norm(xx)

        if fpp is not None:
            if __debug__:
                debug('TRAN_', "starting adaptive adjustment i=%d" % i)

            # Adaptive adjustment for false negatives:
            Nxx, xxx, pN_emp_prev = len(xx), xx, 1.0
            Nxxx = Nxx
            # What features belong to Null-distribution
            indexes = np.arange(Nxx)
            while True:
                # plain counts; the `normed` keyword is gone from current
                # numpy and its False value was the default anyway
                hist, bins = np.histogram(xxx, bins=nbins)
                pdf = hist.astype(float) / Nxxx
                if not numpy_center_points:
                    # if we obtain edge points for bins -- take centers
                    bins = 0.5 * (bins[0:-1] + bins[1:])

                # theoretical CDF at the bin centers
                dist_cdf = dist.cdf(bins)

                # what bins fall into theoretical 'positives' in both tails
                p = (0.5 - np.abs(dist_cdf - 0.5) < fpp / 2.0)

                # amount in empirical tails -- if we match theoretical, we
                # should have total >= p
                pN_emp = np.sum(pdf[p])
                if __debug__:
                    debug('TRAN_',
                          "empirical p=%.3f for theoretical p=%.3f"
                          % (pN_emp, fpp))

                if pN_emp <= fpp:
                    # we are done
                    break

                if pN_emp > pN_emp_prev:
                    if __debug__:
                        debug('TRAN_',
                              "Diverging -- thus keeping last result "
                              "with p=%.3f" % pN_emp_prev)
                    # we better restore previous result; the *_prev names
                    # exist here because pN_emp cannot exceed the initial
                    # pN_emp_prev of 1.0 on the first pass
                    indexes, xxx, dist = indexes_prev, xxx_prev, dist_prev
                    break

                pN_emp_prev = pN_emp
                # very silly way for now -- just proceed by 1 bin
                keep = np.logical_and(xxx > bins[0], xxx < bins[-1])
                if __debug__:
                    debug('TRAN_', "Keeping %d out of %d elements"
                          % (np.sum(keep), Nxxx))

                # Preserve them if we need to "roll back"
                indexes_prev, xxx_prev, dist_prev = indexes, xxx, dist

                # we should remove those which we assume to be positives and
                # which should not belong to Null-dist
                xxx, indexes = xxx[keep], indexes[keep]
                # L2 renorm it
                xxx = xxx / np.linalg.norm(xxx)
                Nxxx = len(xxx)
                dist = stats.rdist(Nxxx - 1, 0, 1)

            Nindexes = len(indexes)
            Nrecovered = Nxx - Nindexes

            nulldist_number.append(Nindexes)
            positives_recovered.append(Nrecovered)

            if __debug__:
                if distribution == 'rdist':
                    assert (dist.args[0] == Nindexes - 1)

                debug('TRAN',
                      "Positives recovery finished with %d out of %d "
                      "entries in Null-distribution, thus %d positives "
                      "were recovered" % (Nindexes, Nxx, Nrecovered))

        # And now we need to perform our duty -- assign p-values
        pvalues[i, :] = dist.cdf(xx)

    # XXX we might add an option to transform it to z-scores?
    result = pvalues

    # charge conditional attributes
    # XXX might want to populate them for non-adaptive handling as well
    # NOTE(review): the assignment targets were missing from the damaged
    # text ("= nulldist_number = positives_recovered"); ``self.ca.<name>``
    # is assumed -- confirm against the enclosing class.
    self.ca.nulldist_number = nulldist_number
    self.ca.positives_recovered = positives_recovered

    # transpose if needed
    if sd == 0:
        result = result.T

    return result
# Example walk-through of the ``rdist`` distribution API.
# NOTE(review): `rdist`, `np` and the matplotlib axes `ax` are not defined in
# this fragment -- presumably they are imported/created earlier; confirm.

# Shape parameter passed to every ``rdist`` call below.
c = 0.9
# Request the four moments selected by 'mvsk'.
mean, var, skew, kurt = rdist.stats(c, moments='mvsk')

# Display the probability density function (``pdf``):
# evaluate it on 100 evenly spaced points between ppf(0.01) and ppf(0.99).
x = np.linspace(rdist.ppf(0.01, c), rdist.ppf(0.99, c), 100)
ax.plot(x, rdist.pdf(x, c), 'r-', lw=5, alpha=0.6, label='rdist pdf')

# Alternatively, the distribution object can be called (as a function)
# to fix the shape, location and scale parameters. This returns a "frozen"
# RV object holding the given parameters fixed.

# Freeze the distribution and display the frozen ``pdf``:
rv = rdist(c)
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')

# Check accuracy of ``cdf`` and ``ppf``:
# feeding the ppf values back through cdf should reproduce the inputs.
vals = rdist.ppf([0.001, 0.5, 0.999], c)
np.allclose([0.001, 0.5, 0.999], rdist.cdf(vals, c))  # True

# Generate random numbers:
r = rdist.rvs(c, size=1000)

# And compare the histogram:
ax.hist(r, density=True, histtype='stepfilled', alpha=0.2)