def norminvcdf(qin):
    """Inverse standard-normal CDF that stays finite at the ends of [0, 1].

    A probability of exactly 1 is mapped to ``norm.ppf(1 - 1e-16)`` and every
    other probability is nudged upward by a tiny epsilon so that 0 does not
    map to ``-inf``.

    Parameters
    ----------
    qin : sequence of float
        Probabilities in [0, 1].  Must support ``len()``.

    Returns
    -------
    numpy.ndarray
        One quantile per input element.  For a one-element input equal to 1
        the clamped quantile is returned as a scalar.
    """
    eps = 1.e-320
    upper = norm.ppf(1. - 1.e-16)  # finite stand-in for ppf(1) == +inf

    def _ppf(q):
        return upper if q == 1. else norm.ppf(q + eps)

    if len(qin) == 1:
        # asarray lets a plain one-element list through; list + float raises.
        return _ppf(np.asarray(qin, dtype=float))
    # A list (not a lazy ``map`` object) is required on Python 3; otherwise
    # np.array wraps the iterator itself in a 0-d object array.
    return np.array([_ppf(q) for q in qin])
def dprime_yes_no(H, FA):
    """
    Compute *d'* for a one-interval 'yes/no' task from the hit and
    false-alarm rates.

    Parameters
    ----------
    H : float
        Hit rate, between 0 and 1.
    FA : float
        False-alarm rate, between 0 and 1.

    Returns
    -------
    dprime : float
        ``z(H) - z(FA)``, where ``z`` is the inverse standard-normal CDF.

    Raises
    ------
    ValueError
        If either rate lies outside [0, 1].

    Examples
    --------
    >>> dp = dprime_yes_no(0.7, 0.2)

    References
    ----------
    .. [1] Green, D. M., & Swets, J. A. (1988). *Signal Detection Theory and
           Psychophysics*. Los Altos, California: Peninsula Publishing.
    .. [2] Macmillan, N. A., & Creelman, C. D. (2004). *Detection Theory: A
           User's Guide (2nd ed.)*. London: Lawrence Erlbaum Associates.
    """
    # Validate H first, then FA, so the first offending argument is reported.
    for label, rate in (("H", H), ("FA", FA)):
        if rate < 0 or rate > 1:
            raise ValueError("%s must be between 0 and 1" % label)

    z_hit = norm.ppf(H)
    z_false_alarm = norm.ppf(FA)
    return z_hit - z_false_alarm
def compute_null_stats(self, elec_pair_phase_diff, recalled, elec_pair_stats):
    """Z-score the real recalled-minus-not-recalled differences against a null.

    ``calc_circ_stats`` is run ``self.n_perms`` times (third argument ``True``)
    to build a null distribution.  For both the Rayleigh z and the resultant
    vector length, the real difference between the ``*_rec`` and ``*_nrec``
    entries of ``elec_pair_stats`` is ranked against the null differences and
    the rank is converted to a z-score with ``norm.ppf``.

    Returns
    -------
    tuple
        ``(z_scores_for_rayleigh_z, z_scores_for_rvl)``.
    """
    res = Parallel(n_jobs=12, verbose=5)(
        delayed(calc_circ_stats)(elec_pair_phase_diff, recalled, True)
        for _ in range(self.n_perms))

    # Float literal on purpose: with integer n_perms, ``1 / n_perms`` is 0
    # under Python 2 division, which would leave ranks of exactly 0 or 1
    # untouched and turn them into -inf / +inf z-scores.
    step = 1.0 / self.n_perms

    def _rank_z(rec_key, nrec_key):
        """Rank the real rec-nrec difference within the null and z-score it."""
        null_rec = np.stack([x[rec_key] for x in res], 0)
        null_nrec = np.stack([x[nrec_key] for x in res], 0)
        null_delta = null_rec - null_nrec
        real_delta = elec_pair_stats[rec_key] - elec_pair_stats[nrec_key]
        rank = np.mean(real_delta > null_delta, axis=0)
        # Pull the extremes one permutation step inside (0, 1).
        rank[rank == 0] += step
        rank[rank == 1] -= step
        return norm.ppf(rank)

    delta_mem_zs = _rank_z('elec_pair_z_rec', 'elec_pair_z_nrec')
    delta_mem_rvls = _rank_z('elec_pair_rvl_rec', 'elec_pair_rvl_nrec')
    return delta_mem_zs, delta_mem_rvls
def draw_tile(metadata, config, target_path):
    """Decode a 20x20 grid of 2-D latent points and save it as one image.

    The decoder is built from ``config``, loaded with the trailing entries of
    ``metadata['param_values']``, and evaluated on latent coordinates taken
    from standard-normal quantiles of an even grid on (0.001, 0.999).  Each
    output row is reshaped to 28x28 and written to
    ``target_path + 'tile.png'``.

    Written with ``print(...)``, ``range`` and ``str.ljust`` so the function
    runs on both Python 2 and Python 3.
    """
    decoder = config.build_decoder()
    decoder_layers = nn.layers.get_all_layers(decoder.l_out)
    print(" decoder layer output shapes:")
    nparams = len(nn.layers.get_all_params(decoder.l_out))
    # The decoder's parameters are the last ``nparams`` stored values.
    nn.layers.set_all_param_values(decoder.l_out, metadata['param_values'][-nparams:])
    for layer in decoder_layers:
        name = layer.__class__.__name__
        print(" %s %s" % (name.ljust(32), nn.layers.get_output_shape(layer)))

    side = 20
    cell = 28
    mesh = np.linspace(0.001, 0.999, side)
    quantiles = norm.ppf(mesh)  # computed once instead of per grid cell
    z = np.zeros((side * side, 2), dtype='float32')
    for i in range(side):
        for j in range(side):
            z[side * i + j, :] = (quantiles[i], quantiles[j])

    sample = theano.function([decoder.l_z.input_var],
                             nn.layers.get_output(decoder_layers[-1]))
    digits = sample(z)

    tile = np.zeros((side * cell, side * cell), dtype='float32')
    for i in range(side):
        for j in range(side):
            d = np.reshape(digits[side * i + j, :], (cell, cell))
            tile[i * cell:(i + 1) * cell, j * cell:(j + 1) * cell] = d
    plt.imsave(target_path + 'tile.png', tile, cmap=matplotlib.cm.Greys)
def bca(data, alphas, statarray, statfunction, ostat, reps):
    """Return the BCa (bias-corrected and accelerated) bootstrap indices.

    Borrowed heavily from the scikits.bootstrap code.

    Parameters
    ----------
    data : tuple of arrays
        The data sets; ``statfunction`` is called as ``statfunction(*data)``.
    alphas : numpy.ndarray
        Percentile levels of the requested confidence limits.
    statarray : array
        Bootstrap replicates of the statistic.
    statfunction : callable
        The statistic being bootstrapped.
    ostat : scalar or array
        The statistic evaluated on the original data.
    reps : int
        Number of bootstrap replicates.

    Returns
    -------
    numpy.ndarray of int
        Adjusted indices computed as ``round((reps - 1) * adjusted_level)``.
    """
    # The bias correction value.
    z0 = norm.ppf((1.0 * np.sum(statarray < ostat, axis=0)) / reps)

    # Statistics of the jackknife distribution.
    jackindexes = jackknife_indexes(data[0])  # the scikits.bootstrap helper
    jstat = [statfunction(*(x[indexes] for x in data)) for indexes in jackindexes]
    jmean = np.mean(jstat, axis=0)

    # Acceleration value.
    a = np.sum((jmean - jstat)**3, axis=0) / (6.0 * np.sum((jmean - jstat)**2, axis=0)**1.5)
    if np.any(np.isnan(a)):
        nanind = np.nonzero(np.isnan(a))
        # Adjacent literals instead of backslash continuations, so the source
        # indentation does not end up inside the emitted message.
        warnings.warn(
            "Some acceleration values were undefined. "
            "This is almost certainly because all values "
            "for the statistic were equal. Affected "
            "confidence intervals will have zero width and "
            "may be inaccurate (indexes: {}). "
            "Other warnings are likely related.".format(nanind))

    zs = z0 + norm.ppf(alphas).reshape(alphas.shape + (1,) * z0.ndim)
    avals = norm.cdf(z0 + zs / (1 - a * zs))
    nvals = np.round((reps - 1) * avals)
    # NaN levels (from an undefined acceleration) become index 0.
    nvals = np.nan_to_num(nvals).astype('int')
    return nvals
def plot3(A,B,S,T,D, tMAP=None):
    """Plot the four conditional densities evaluated at a hard-coded MAP point.

    ``A``, ``B``, ``S`` and ``T`` must expose ``.cond.rv(theta)`` returning an
    object with ``.pdf``; ``S`` and ``T`` additionally expose ``.prior.p1``.
    ``D`` supplies ``Y`` and ``n``.

    NOTE(review): the ``tMAP`` argument is overwritten below and therefore
    ignored, exactly as before; confirm whether callers expect it to be used.
    """
    States = D.Y
    pMAP = [9.75961269, 0.0583687877, 78.4901534, 78.1696975]
    alpha, beta, sig2, ome2 = pMAP
    tMAP = (alpha, beta, sig2, ome2, States, D.Y)

    x1 = np.linspace(norm.ppf(0.01), norm.ppf(0.99), 100)
    x2 = np.linspace(invgamma.ppf(0.01, S.prior.p1/2), invgamma.ppf(0.99, S.prior.p1/2), 100)
    x3 = np.linspace(invgamma.ppf(0.01, T.prior.p1/2), invgamma.ppf(0.99, T.prior.p1/2), 100)

    def tmp(_theta):
        # Debug value; the argument is accepted but unused (tuple-parameter
        # lambdas are a syntax error on Python 3).
        return 1/(1/A.prior.var + D.n/S.prior.p2)

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print(tMAP)
    print(tmp(tMAP))
    print(A.cond.var(tMAP))
    print(A.cond.m(tMAP))
    print(A.cond.rv(tMAP))

    panels = (
        (x1, A, 'a prior', 'alpha conditional at MAP'),
        (x1, B, 'b prior', 'beta conditional at MAP'),
        (x2, S, 'sig2 prior', 'sig2 conditional at MAP'),
        # Previously labelled 'sig2' a second time although it plots T, whose
        # value in the MAP tuple is ome2.
        (x3, T, 'ome2 prior', 'ome2 conditional at MAP'),
    )
    for grid, component, label, title in panels:
        plt.plot(grid, component.cond.rv(tMAP).pdf(grid), 'r-', lw=5, alpha=0.6, label=label)
        plt.title(title)
        plt.show()
def __init__(self, MAX_DUR, usePersistentProbs, distributionType=1):
    """Allocate the duration log-likelihood lookup table and fill it.

    Parameters
    ----------
    MAX_DUR : int
        Number of rows of the lookup table; also stored as ``R_MAX``.
    usePersistentProbs
        Passed through to ``_constructLogLiksTable``.
    distributionType : int
        1 - normal, 2 - gamma (TODO: implement), 3 - exponential.
    """
    self.distributionType = distributionType

    # Number of tabulated durations around the centre; forced to be odd.
    self.numDurs = int(2 * deviationInSec * NUMFRAMESPERSEC)
    if not self.numDurs % 2:
        self.numDurs += 1

    # maxDur x currDur lookup table of probs
    self.R_MAX = MAX_DUR

    # how much of a phoneme may be longer than its score-assigned max_dur
    self.MAX_ALLOWED_DURATION_RATIO = 1
    if distributionType == 1:
        self.MAX_ALLOWED_DURATION_RATIO = 2

    # Floor division: numDurs is odd, so the half-width is exact, and the
    # array dimension stays an int on Python 3 (``/`` would give a float,
    # which numpy rejects as a shape).
    half_width = (self.numDurs - 1) // 2
    self.lookupTableLogLiks = numpy.empty((MAX_DUR, self.R_MAX + half_width + 1))
    self.lookupTableLogLiks.fill(-Infinity)

    if distributionType == 1:
        # 1st and 99th percentiles of the standard normal.
        self.minVal = norm.ppf(0.01)
        self.maxVal = norm.ppf(0.99)

    self._constructLogLiksTable(usePersistentProbs)
def adjust_thresholds(self, val_data, alpha, batchsize, nrepeats=10000, maxrepeats=1e6):
    """Set acceptance thresholds for batch-mean statistics at level ``alpha``.

    Two families of ``(lower, upper)`` thresholds are stored on ``self``:

    * ``thr_Z``, ``thr_logZ``, ``thr_symZ``, ``thr_symlogZ`` -- normal
      quantiles using ``valmean``/``valstd`` (or the log variants) with the
      scale divided by ``sqrt(batchsize)``.
    * ``thr_mean``, ``thr_logmean``, ``thr_symmean``, ``thr_symlogmean`` --
      empirical quantiles of the batch mean (and mean of logs) over resampled
      batches from ``make_batch(val_data, batchsize)``.  They are disabled
      (set to ``(-inf, inf)``) when more than ``maxrepeats`` resamples would
      be needed.

    Parameters
    ----------
    val_data : validation data handed to ``make_batch``.
    alpha : float
        Significance level.
    batchsize : int
        Batch size the statistics are computed over.
    nrepeats : int
        Requested number of resampled batches; raised to at least
        ``ceil(2 / alpha)``.
    maxrepeats : number
        Upper limit on ``nrepeats`` beyond which the empirical tests are off.
    """
    self.thr_Z = normal_dist.ppf(alpha, loc=self.valmean, scale=self.valstd/np.sqrt(batchsize)), np.inf  # no upper bound, only lower
    self.thr_logZ = normal_dist.ppf(alpha, loc=self.vallogmean, scale=self.vallogstd/np.sqrt(batchsize)), np.inf  # no upper bound, only lower
    self.thr_symZ = (normal_dist.ppf(0.5*alpha, loc=self.valmean, scale=self.valstd/np.sqrt(batchsize)),
                     normal_dist.ppf(1.-0.5*alpha, loc=self.valmean, scale=self.valstd/np.sqrt(batchsize)))
    self.thr_symlogZ = (normal_dist.ppf(0.5*alpha, loc=self.vallogmean, scale=self.vallogstd/np.sqrt(batchsize)),
                        normal_dist.ppf(1.-0.5*alpha, loc=self.vallogmean, scale=self.vallogstd/np.sqrt(batchsize)))

    # Builtin int: the ``np.int`` alias no longer exists in current NumPy.
    nrepeats = max(nrepeats, int(np.ceil(2./alpha)))
    if nrepeats <= maxrepeats:
        mean_stat, logmean_stat = [], []
        for _ in range(nrepeats):
            batch = make_batch(val_data, batchsize)
            mean_stat.append(np.mean(batch))
            logmean_stat.append(np.mean(np.log(batch)))
        mean_stat = np.sort(mean_stat)
        logmean_stat = np.sort(logmean_stat)

        index = int(np.floor(alpha*nrepeats))  # number of permitted outliers
        self.thr_mean = mean_stat[index], np.inf
        self.thr_logmean = logmean_stat[index], np.inf
        self.thr_symmean = mean_stat[(index-1)//2], mean_stat[-index//2]
        # Both bounds come from the log statistics; the upper bound used to be
        # read from ``mean_stat`` by mistake.
        self.thr_symlogmean = logmean_stat[(index-1)//2], logmean_stat[-index//2]
    else:
        # disable tests
        self.thr_mean = -np.inf, np.inf
        self.thr_logmean = -np.inf, np.inf
        self.thr_symmean = -np.inf, np.inf
        self.thr_symlogmean = -np.inf, np.inf
def d_prime(hits, false_alarms, n, nafc=1):
    """
    Calculate the sensitivity index d'.

    Parameters
    ----------
    hits : float
        The number of hits when detecting a signal.
    false_alarms : float
        The number of false alarms.
    n : int
        The number of trials in target and no-target trials.
    nafc : int, optional
        The number of alternative choices in the task. A value of ``1``
        implies a Yes/No task, which is the only supported case.
        Defaults to 1.

    Returns
    -------
    d : float
        The calculated d' value, z(hit_rate) - z(fa_rate).

    Raises
    ------
    NotImplementedError
        If ``nafc`` is anything other than 1.

    Example
    -------
    >>> from pphelper import sdt
    >>> sdt.d_prime(20, 10, 25)
    1.094968336708714
    """
    if nafc == 1:
        rates = _calculate_hit_and_fa_rates(hits, false_alarms, n)
        hit_rate, fa_rate = rates
        z_hit = norm.ppf(hit_rate)
        z_fa = norm.ppf(fa_rate)
        return z_hit - z_fa

    raise NotImplementedError('Only 1-AFC implemented so far.')
def plot_quantiles(path = None):
    """
    Illustrate the definition of a quantile on standard-normal samples.

    The left panel shows the normal CDF evaluated at 1000 random draws with
    vertical red lines at the 5/25/50/75/95% quantiles; the right panel shows
    the draws by index with the same quantiles as horizontal lines.

    The figure is saved to ``path`` when one is given, otherwise it is shown.
    """
    samples = np.random.randn(1000,)
    probabilities = [.05, .25, .5, .75, .95]
    fig = plt.figure(figsize=(10, 3))

    # Left panel: CDF with the quantile positions on the x axis.
    cdf_ax = fig.add_subplot(121)
    cdf_ax.plot(samples, norm.cdf(samples), marker = "o", linestyle = "", markersize = 1.5)
    cdf_ax.set_xlim(-2.5, 2.5)
    cdf_ax.set_title("Cumulative distribution function")
    cdf_ax.set_xlabel("Values")
    cdf_ax.set_ylabel("Probability")
    for prob in probabilities:
        cdf_ax.axvline(x = norm.ppf(prob), color = "red")

    # Right panel: the raw draws with the quantiles on the y axis.
    draws_ax = fig.add_subplot(122)
    draws_ax.plot(samples, color = "black")
    draws_ax.set_title("Vector samples from Gauusian distribution")
    draws_ax.set_xlabel("Index")
    draws_ax.set_ylabel("Values")
    for prob in probabilities:
        draws_ax.axhline(y = norm.ppf(prob), color = "red")

    if not path:
        plt.show()
    else:
        plt.savefig(path ,dpi = 300, bbox_inches='tight')
def generate(estimator):
    """Decode a 15x15 grid of 2-D latent samples and display it as one image.

    The latent coordinates are standard-normal quantiles of an even grid on
    [0.05, 0.95].  ``estimator.generate`` is fed all grid points in a single
    unshuffled batch, and each item's ``'results'`` entry is reshaped to
    28x28 and placed in the tile matching its grid position.  Plotting is
    skipped silently when matplotlib cannot be imported.
    """
    from scipy.stats import norm

    n = 15  # Figure row size
    side = 28
    figure = np.zeros((side * n, side * n))

    # Latent grid: normal quantiles along each axis.
    x_axis = norm.ppf(np.linspace(0.05, 0.95, n))
    y_axis = norm.ppf(np.linspace(0.05, 0.95, n))
    samples = np.array([[x, y] for x in x_axis for y in y_axis], dtype=np.float32)

    input_fn = plx.processing.numpy_input_fn(
        {'samples': samples}, batch_size=n * n, shuffle=False)
    x_reconstructed = estimator.generate(input_fn)
    results = [item['results'] for item in x_reconstructed]

    # Row-major placement: flat index i * n + j goes to tile (i, j).
    for flat_index in range(n * n):
        i, j = divmod(flat_index, n)
        digit = results[flat_index].reshape(side, side)
        figure[i * side: (i + 1) * side, j * side: (j + 1) * side] = digit

    try:
        import matplotlib.pyplot as plt
        plt.figure(figsize=(10, 10))
        plt.imshow(figure, cmap='Greys_r')
        plt.show()
    except ImportError:
        pass
def get_NGS(audio):
    """Compute a non-Gaussianity score (NGS) for an audio segment.

    The score is ``1 - SS(g) / SS(gamma)``, where ``SS`` is the sum of squared
    deviations from the mean, ``g`` holds standard-normal quantiles on an even
    probability grid and ``gamma`` holds the quantiles of a normal
    distribution with the segment's mean and standard deviation on the same
    grid.

    NOTE(review): the original author marked this routine as unverified
    ("producing a NGS of 0.999something for almost all coughs").  Two clear
    slips are fixed here -- the square was applied to the mean rather than to
    the deviation, and the slice dropped one more trailing element than the
    comment described -- but ``gamma`` is still derived only from the mean and
    standard deviation of ``audio``, not from its empirical quantiles, so the
    result should be validated against a reference implementation before use.

    Parameters
    ----------
    audio : array-like
        The samples of the segment.

    Returns
    -------
    float
        The score described above.
    """
    PLOT = 0

    # Number of samples in segment
    N = len(audio)

    # Reference: N-point sorted Gaussian sample (only used for plotting).
    gauss_sorted = np.sort(np.random.randn(N))
    p_gauss = 1. * np.arange(N) / (N-1)
    # Drop the first and last elements: ppf(0) and ppf(1) are -inf and +inf.
    g = norm.ppf(p_gauss)[1:-1]

    mu = np.mean(audio)
    sig = np.std(audio)
    data_sorted = np.sort(audio)

    # Probability grid for the audio data.
    p = 1. * np.arange(N) / (N-1)
    # Inverse CDF on that grid, again without the infinite end points.
    gamma = norm.ppf(p, loc=mu, scale=sig)[1:-1]

    NGS = 1.0 - (np.sum((g - np.mean(g))**2) / np.sum((gamma - np.mean(gamma))**2))

    if PLOT == 1:
        plt.figure(1)
        plt.subplot(2,1,1)
        plt.plot(gauss_sorted, p_gauss)
        plt.ylabel("$p$")
        plt.xlabel("$x$")
        plt.subplot(2,1,2)
        plt.plot(p_gauss[1:-1], g)
        plt.xlabel("$p$")
        plt.ylabel("")
        plt.show()

        plt.figure(2)
        plt.title("CDF and PPF with NGS = %f"%NGS)
        plt.subplot(2,1,1)
        plt.plot(data_sorted, p)
        plt.ylabel("$p$")
        plt.xlabel("$x$")
        plt.subplot(2,1,2)
        plt.plot(p[1:-1], gamma)
        plt.show()

    return NGS
def get_power(effect_size, N, p1, p2, significance, two_sided):
    """Power of a z-test on the difference between two event rates.

    Assumes equal group sizes (each group has ``N/2`` observations).  The
    ``p2`` argument is ignored: it is recomputed as ``p1 - effect_size``.
    ``significance`` is used as a confidence level, i.e. the critical value
    is taken at ``1 - (1 - significance)/2`` (two-sided) or at
    ``1 - (1 - significance)`` (one-sided).

    Returns
    -------
    tuple
        ``(power, Z_crit)``.
    """
    p2 = p1 - effect_size

    # The random variable is the difference of the two rates, so its variance
    # is the sum of the two Bernoulli variances.
    sigma = np.sqrt(p1*(1-p1) + p2*(1-p2))

    tail = (1-significance)/2 if two_sided else (1-significance)
    Z_crit = norm.ppf(1 - tail)

    # One observation of the difference per control/test pair, hence the
    # per-group size N/2 rather than N.
    shift = effect_size * np.sqrt(N/2)/sigma

    power2 = 1 - norm.cdf(Z_crit - shift)
    if two_sided:
        # Add the mass rejected in the opposite tail.
        power2 = power2 + norm.cdf(-Z_crit - shift)
    return power2, Z_crit
def rank(self, x, y):
    """Score every column of ``x`` against the binary target ``y``.

    The metric is selected by ``self.type``:

    * ``'bns'`` -- bi-normal separation, ``|z(tpr) - z(fpr)|`` with both rates
      clipped to [0.0005, 0.9995];
    * ``'acc'`` -- ``|tpr - fpr|``;
    * ``'ig'``  -- information gain of the column with respect to ``y``.

    Parameters
    ----------
    x : table with a ``.columns`` attribute, indexable by column name.
    y : binary target exposing ``value_counts()`` with entries for 0 and 1.

    Returns
    -------
    list of (score, column)
        Sorted by descending score.

    Raises
    ------
    ValueError
        If ``self.type`` is not one of the metrics above.
    """
    cnts = y.value_counts()
    pos = float(cnts[1])
    neg = float(cnts[0])
    n = pos + neg

    def plogp(part, total):
        # 0 * log(0) is taken as 0, the usual entropy convention.
        if part == 0:
            return 0.0
        frac = part / total
        return -frac * math.log(frac)

    def e(a, b):
        # Entropy (in nats) of a two-way split with counts a and b.
        total = a + b
        if total == 0:
            return 0.0
        return plogp(a, total) + plogp(b, total)

    scores = []
    for c in x.columns:
        true_positives = float(np.count_nonzero(np.logical_and(x[c], y)))
        false_positives = float(np.count_nonzero(np.logical_and(x[c], np.logical_not(y))))
        # Computed for every metric: 'acc' previously referenced these while
        # they were only ever assigned in the 'bns' branch.
        tpr = true_positives / pos
        fpr = false_positives / neg

        if self.type == 'bns':
            # Clip away from 0 and 1 so the inverse CDF stays finite.
            tpr = min(.9995, max(0.0005, tpr))
            fpr = min(.9995, max(0.0005, fpr))
            score = abs(norm.ppf(tpr) - norm.ppf(fpr))
        elif self.type == 'acc':
            score = abs(tpr - fpr)
        elif self.type == 'ig':
            selected = (true_positives + false_positives) / n
            score = e(pos, neg) - (
                selected * e(true_positives, false_positives)
                + (1 - selected) * e(pos - true_positives, neg - false_positives))
        else:
            raise ValueError("unknown ranking type: %r" % (self.type,))
        scores.append((score, c))

    scores.sort(reverse=True)
    return scores
def binormal_separation(matrix, index):
    """
    Calculate the bi-normal separation between a segment extension and each
    word it contains.

    Parameters
    ----------
    matrix: karl.Matrix
        The matrix in which the association is to be calculated.
    index: list of booleans or list of integers
        Indexes the segments forming the extension with which the association
        is to be measured.

    Returns
    -------
    list of (score, item)
        Pairs of ``|z(tpr) - z(fpr)|`` and the corresponding item returned by
        ``_get_abcd``, sorted by descending score.
    """
    in_with, out_with, in_without, out_without, unifs = _get_abcd(matrix, index)

    # Rates from the 2x2 counts: a / (a + c) and b / (b + d).
    tpr = in_with / np.array(in_with + in_without, dtype=float)
    fpr = out_with / np.array(out_with + out_without, dtype=float)

    z_tpr = norm.ppf(tpr)
    z_fpr = norm.ppf(fpr)
    separation = abs(z_tpr - z_fpr)

    ranked = list(zip(separation, unifs))
    ranked.sort(reverse=True)
    return ranked
def dualDigiPriceAnalytics(p1, p2):
    """Plot the dual digital option price against the correlation rho.

    The price for a given ``rho`` is the probability mass of a standard
    bivariate normal with correlation ``rho`` over the quadrant above the two
    thresholds ``norm.ppf(1 - p1)`` and ``norm.ppf(1 - p2)``, obtained by
    numerical double integration.  Prices are evaluated on 61 values of
    ``rho`` in [-0.99, 0.99]; the elapsed time is printed and the curve shown.

    Uses ``np`` for pi/sqrt/exp/inf instead of the deprecated top-level
    ``scipy`` aliases, and a ``print(...)`` call that works on Python 2 and 3.
    """
    start = time()
    x1k = norm.ppf(1 - p1)
    x2k = norm.ppf(1 - p2)

    def jointDensity(y, x, rho):
        # Standard bivariate normal density with correlation rho.
        scale = 1 / (2 * np.pi * np.sqrt(1 - rho * rho))
        expo = np.exp(-(y ** 2 + x ** 2 - 2 * rho * y * x) / (2 * (1 - rho ** 2)))
        return (scale * expo)

    def calPrice(rho):
        # Outer variable runs from x2k to inf, inner variable from x1k to inf.
        return dblquad(jointDensity, x2k, np.inf,
                       lambda x: x1k, lambda x: np.inf, args=(rho,))[0]

    rhoVec = np.linspace(-.99, .99, num=61)
    opsPrice = [calPrice(rho) for rho in rhoVec]
    end = time()
    # Two spaces after the colon reproduce ``print "time: ", value``.
    print("time:  %s" % (end - start))

    plt.plot(rhoVec, opsPrice)
    plt.title("Dual digital option price and correlation rho")
    plt.xlabel("Correlation (rho)")
    plt.ylabel("Option Price ($)")
    plt.show()
    return
def test (self):
    """Run the two-sided Mann-Kendall style test on the stored series.

    Requires ``self.n > 10``; otherwise a message is printed and
    ``(None, None, None)`` is returned.

    Returns
    -------
    h : bool
        True when ``|z|`` exceeds the two-sided critical value at
        ``self.alpha``.
    m : str or None
        ``'+'`` for a significant positive z, ``'-'`` for a significant
        negative z, ``None`` when the result is not significant.
    p : float
        Two-sided p-value.
    """
    if not self.n > 10:
        print('Test can only be run on a series of more than 10')
        return None, None, None

    self._set_S()
    self._computer_var_S()

    # Continuity-corrected z statistic.
    if self.S > 0:
        z = (self.S - 1)/np.sqrt(self.var_S)
    elif self.S == 0:
        z = 0
    elif self.S < 0:
        z = (self.S + 1)/np.sqrt(self.var_S)

    # calculate the p_value
    p = 2*(1-norm.cdf(abs(z)))
    h = abs(z) > norm.ppf(1-self.alpha/2)

    # Direction follows the sign of z.  The previous code compared both
    # directions against the same positive one-sided critical value.
    if not h:
        m = None
    elif z > 0:
        m = '+'
    else:
        m = '-'
    return h, m, p
def sampleSize_twoGroups(d, alpha=0.05, beta=0.2, sigma1=1, sigma2=1):
    '''Sample size per group for comparing two groups.

    Parameters
    ----------
    d : float
        Difference between the groups that should be detected.
    alpha : float
        Significance level (two-sided).
    beta : float
        Type-II error rate; the test power is ``1 - beta``.
    sigma1, sigma2 : float
        Standard deviations of the two groups.

    Returns
    -------
    int
        The rounded number of subjects required in each group.  The value is
        also reported on stdout, as before.
    '''
    z_alpha = norm.ppf(1 - alpha/2.)
    z_beta = norm.ppf(1 - beta)
    n = int(round((z_alpha + z_beta)**2 * (sigma1**2 + sigma2**2) / d**2))

    print('In order to detect a change of {0} between groups with an SD of {1} and {2},'.format(d, sigma1, sigma2))
    print('with significance {0} and test-power {1}, you need in each group at least {2:d} subjects.'.format(alpha, 100*(1-beta), n))
    # Returned so callers can use the result without parsing the output.
    return n
def inverse_local(local_prob, hyper):
    """Map three equally sized groups of probabilities to normal quantiles.

    ``local_prob`` is split into three consecutive thirds; the k-th third is
    passed through the inverse normal CDF with location ``hyper[k]`` and the
    module-level ``scatter`` as scale.

    Returns
    -------
    numpy.ndarray
        The three quantile vectors stacked horizontally.
    """
    # Floor division keeps the slice bounds integral on Python 3.
    n_group = len(local_prob) // 3
    a = norm.ppf(local_prob[0:n_group], hyper[0], scatter)
    b = norm.ppf(local_prob[n_group:2*n_group], hyper[1], scatter)
    c = norm.ppf(local_prob[2*n_group:], hyper[2], scatter)
    local = np.hstack((a, b, c))
    return local
def real_position_to_abstract(self, p0):
    """
    Convert a real position into abstract (standard-normal quantile)
    coordinates.

    :param p0: two-element position; the first entry is shifted by 1500 and
        divided by 3000, the second is divided by 2000, before each is passed
        through the inverse standard-normal CDF.
    :return: list with the two transformed coordinates.
    """
    first_fraction = (p0[0] + 1500) / 3000.
    second_fraction = p0[1] / 2000.
    abstract_first = norm.ppf(first_fraction)
    abstract_second = norm.ppf(second_fraction)
    return [abstract_first, abstract_second]
def sampleSize_oneGroup(d, alpha=0.05, beta=0.2, sigma=1):
    '''Sample size for a single group.

    Parameters
    ----------
    d : float
        Change that should be detected.
    alpha : float
        Significance level (two-sided).
    beta : float
        Type-II error rate; the test power is ``1 - beta``.
    sigma : float
        Standard deviation of the group.

    Returns
    -------
    int
        The rounded number of subjects required.  The value is also reported
        on stdout, as before.
    '''
    z_alpha = norm.ppf(1 - alpha/2.)
    z_beta = norm.ppf(1 - beta)
    n = int(round((z_alpha + z_beta)**2 * sigma**2 / d**2))

    print('In order to detect a change of {0} in a group with an SD of {1},'.format(d, sigma))
    print('with significance {0} and test-power {1}, you need at least {2:d} subjects.'.format(alpha, 100*(1-beta), n))
    # Returned so callers can use the result without parsing the output.
    return n
def bcpcl(T,T_p,N_sigma):
    '''
    Calculates the bias corrected percent confidence limits.

    -- Suppose that we have observed data (y1, y2, ..., yn) and use it to
       estimate a population parameter Q (e.g. the true mean of the entire
       population).
    -- T is a statistic that estimates Q, e.g. the mean of (y1, ..., yn).
    -- Suppose that we create m bootstrap samples from the observed sample.
       T_p_j is then the j-th bootstrap observation of T.

    T = [float] e.g. biweight location for (y1, y2, ..., yn)
    T_p = [vector array] biweight locations for the bootstrap samples
    N_sigma = the number of sigma to report the confidence limits for,
        e.g. for 95% confidence limits N_sigma=2

    Return (lower, upper) confidence limits.  The indices into the sorted
    bootstrap values are clamped to the valid range, so extreme levels yield
    the smallest/largest bootstrap value rather than wrapping around or
    raising IndexError.
    '''
    # Percentile confidence interval is defined as 100%(1-a),
    # thus for 1 sigma a=0.32
    a = 1 - erf(N_sigma / numpy.sqrt(2))

    # Order the bootstrap sample values smallest to largest.
    T_p = numpy.sort(numpy.asarray(T_p))

    # Number of bootstrap samples
    m = numpy.size(T_p)

    # Bias correction term; float() guards against integer division.
    mask = T_p < T
    z_0 = norm.ppf(numpy.sum(mask) / float(m))

    # Calculate the a1 and a2 values
    a1 = norm.cdf(2 * z_0 + norm.ppf(a / 2))
    a2 = norm.cdf(2 * z_0 + norm.ppf(1 - a / 2))

    # Lower and upper indices, clamped to [0, m - 1].  Builtin int replaces
    # the removed ``numpy.int`` alias.
    id_L = min(max(int(m * a1) - 1, 0), m - 1)
    id_U = min(max(int(m * a2), 0), m - 1)

    # Find the lower and upper confidence values
    T_L = T_p[id_L]
    T_U = T_p[id_U]
    return T_L, T_U
def main():
    """Load a pickled model and display the 2-D latent manifold it decodes.

    Command line: ``model`` (path of the pickled model) and ``dset`` (only
    ``mnist``).  A 20x20 grid of latent points, taken at the cell centres of
    the unit square and passed through the inverse Gaussian CDF, is decoded
    and shown as one tiled image.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('model', help='model file to load')
    parser.add_argument('dset', choices=['mnist'])
    args = parser.parse_args()

    # NOTE(security): unpickling runs arbitrary code from the file; only load
    # model files from a trusted source.
    with open(args.model, 'rb') as f:
        model = pickle.load(f)

    if args.dset == 'mnist':
        S = (28, 28)
        M = 20

    manifold = np.zeros((S[0]*M, S[1]*M), dtype=theano.config.floatX)

    for z1 in range(M):
        for z2 in range(M):
            print("%s %s" % (z1, z2))
            z = np.zeros((1, 2))
            # pass unit square through inverse Gaussian CDF
            z[0, 0] = norm.ppf(z1 * 1.0/M + 1.0/(M * 2))
            z[0, 1] = norm.ppf(z2 * 1.0/M + 1.0/(M * 2))
            z = np.array(z, dtype=theano.config.floatX)
            x_hat = model.decode(z)
            x_hat = x_hat.reshape(S)
            manifold[z1 * S[0]:(z1 + 1) * S[0],
                     z2 * S[1]:(z2 + 1) * S[1]] = x_hat

    plt.imshow(manifold, cmap='Greys_r')
    plt.axis('off')
    plt.show()
def z_effect(counts, power, alpha=0.05):
    """Estimate the effect size for a given power using the z distribution.

    Based on the equations in Liu, X.S. (2014) *Statistical power analysis for
    the social and behavioral sciences: basic and advanced techniques.*
    New York: Routledge. 378 pg.

    The equation assumes a positive effect size and a two-tailed test.

    Parameters
    ----------
    counts : array
        The number of observations for each power depth.
    power : array
        The statistical power at the depth specified by `counts`.
    alpha : float, optional
        The critical value used to calculate the power.

    Returns
    -------
    ndarray
        A standard measure of the difference between the underlying
        populations.  Entries where the power equals 1, or where the result
        is infinite, are NaN.
    """
    power = np.atleast_2d(power)

    z_power = z.ppf(power)
    z_alpha = z.ppf(1 - alpha/2)
    z_diff = z_power + z_alpha

    eff = np.sqrt(np.square(z_diff) / counts)

    # Saturated power carries no information about the effect size.
    saturated = power == 1
    eff[saturated] = np.nan
    infinite = np.isinf(eff)
    eff[infinite] = np.nan
    return eff
def cmc(g, xdists, u_to_x, T, seed, maxitr):
    """
    Crude Monte Carlo estimate of a failure probability.

    ``maxitr`` standard-normal sample vectors (one component per entry of
    ``xdists``) are drawn, transformed with ``u_to_x(u, xdists, T)`` and fed
    to ``g``.  Negative values of ``g`` count as failures.

    A ``seed`` of -1 requests an unseeded generator.

    Returns a dict with ``'vars'`` (``xdists``), ``'beta'``, ``'Pf'`` (mean of
    the failure indicator), ``'stderr'`` and ``'stdcv'``.
    """
    # Seed the random number generator if required
    prng = RandomState() if seed == -1 else RandomState(seed)

    # Standard normal samples centred at the origin, one column per sample.
    ndim = len(xdists)
    u = prng.multivariate_normal(zeros(ndim), eye(ndim), size=maxitr).T
    g_mc = g(u_to_x(u, xdists, T))

    # Turn the g-function output into a pass/fail indicator, in place.
    passed = g_mc > 0
    failed = g_mc < 0
    g_mc[passed] = 0
    g_mc[failed] = 1

    mu_pf = g_mc.mean()
    quantile = norm.ppf(mu_pf)
    beta = -quantile if mu_pf < 0.5 else quantile

    # Convergence metrics (standard deviation, standard error, CoV of s.e.)
    std_pf = g_mc.std(ddof=1)  # sample standard deviation
    se_pf = std_pf/sqrt(maxitr)
    cv_pf = se_pf/mu_pf

    summary = {'vars': xdists, 'beta': beta, 'Pf': mu_pf,
               'stderr': se_pf, 'stdcv': cv_pf}
    return summary
def mediation(var1,var2,var3, alpha=0.05, n_samples=10000, type='pearson', epsilon=0.001):
    """Bootstrap (BCa) confidence interval for a mediation statistic.

    Parameters
    ----------
    var1, var2, var3 : arrays
        The three variables; they are indexed with bootstrap/jackknife index
        arrays and passed to the statistic as ``statfunction(var1, var2, var3)``.
    alpha : float or iterable
        A scalar gives the two-sided interval at levels ``alpha/2`` and
        ``1 - alpha/2``; an iterable is used directly as percentile levels.
        NOTE(review): the result label is built with ``round(1 - alpha, 2)``,
        which looks like it only supports a scalar ``alpha`` -- confirm.
    n_samples : int
        Number of bootstrap samples.
    type : str
        ``'pearson'``, ``'kendalltau'`` or ``'spearman'`` (case-insensitive).
    epsilon : float
        Unused.

    Returns
    -------
    dict
        ``"Estimate"`` (the statistic on the actual data) and
        ``"<level>% Confidence Interval"`` (the bootstrap values at the BCa
        indices).

    Raises
    ------
    ValueError
        If ``type`` is not one of the supported names.
    """
    kind = type.lower()
    if kind == "pearson":
        statfunction = mediation_pearson
    elif kind == "kendalltau":
        statfunction = mediation_kendalltau
    elif kind == "spearman":
        statfunction = mediation_spearman
    else:
        # Previously fell through with ``statfunction`` unbound.
        raise ValueError(
            "unknown mediation type %r; expected 'pearson', 'kendalltau' "
            "or 'spearman'" % (type,))

    # Deal with the alpha values
    if np.iterable(alpha):
        alphas = np.array(alpha)
    else:
        alphas = np.array([alpha/2, 1-alpha/2])

    data = (var1, var2, var3)

    # We don't need to generate actual samples; that would take more memory.
    # Instead, generate just the indexes and apply the statistic to them.
    bootindexes = bootstrap_indexes(len(var1), n_samples)
    stat = np.array([statfunction(*(x[indexes] for x in data)) for indexes in bootindexes])
    stat.sort(axis=0)

    # Bias-Corrected Accelerated Method

    # The value of the statistic function applied just to the actual data;
    # computed once and reused for the returned estimate.
    ostat = statfunction(*data)

    # The bias correction value.
    z0 = norm.ppf((1.0*np.sum(stat < ostat, axis=0)) / n_samples)

    # Statistics of the jackknife distribution
    jackindexes = jackknife_indexes(data[0])
    jstat = [statfunction(*(x[indexes] for x in data)) for indexes in jackindexes]
    jmean = np.mean(jstat, axis=0)

    # Acceleration value
    a = np.sum((jmean - jstat)**3, axis=0) / (6.0 * np.sum((jmean - jstat)**2, axis=0)**1.5)

    zs = z0 + norm.ppf(alphas).reshape(alphas.shape+(1,)*z0.ndim)
    avals = norm.cdf(z0 + zs/(1-a*zs))
    nvals = np.round((n_samples-1)*avals)

    if np.any(nvals==0) or np.any(nvals==n_samples-1):
        warnings.warn("Some values used extremal samples; results are probably unstable.", InstabilityWarning)
    elif np.any(nvals<10) or np.any(nvals>=n_samples-10):
        warnings.warn("Some values used top 10 low/high samples; results may be unstable.", InstabilityWarning)

    nvals = np.nan_to_num(nvals).astype('int')

    label = "%.2f%% Confidence Interval" % round(1- alpha,2)
    if nvals.ndim == 1:
        # All nvals are the same. Simple broadcasting
        return {"Estimate": ostat, label: stat[nvals]}
    else:
        # Nvals are different for each data point. Not simple broadcasting.
        # Each set of nvals along axis 0 corresponds to the data at the same
        # point in other axes.
        return {"Estimate": ostat,
                label: stat[(nvals, np.indices(nvals.shape)[1:].squeeze())]}
def est_sdt(f, h, m, r, rule='yn'):
    """Calculate maximum-likelihood estimates of sensitivity and bias.

    Counts of exactly zero, or equal to the number of trials of their kind,
    are moved by half a trial so the rates never reach 0 or 1.

    Args:
        f: False alarms.
        h: Hits.
        m: Misses.
        r: Correct rejections.
        rule: Name of decision rule; ``'2afc'`` divides both estimates by
            the square root of 2.

    Returns:
        [(d1, c1) ...]

    """
    def _nudge(count, total):
        # Same two checks, in the same order, for both kinds of count.
        if count == 0:
            count += 0.5
        if count == total:
            count -= 0.5
        return count

    estimates = []
    for false_alarms, hits, misses, rejections in zip(f, h, m, r):
        n_noise = float(false_alarms + rejections)
        n_signal = float(hits + misses)

        fhat = _nudge(false_alarms, n_noise) / float(n_noise)
        hhat = _nudge(hits, n_signal) / float(n_signal)

        d = norm.ppf(hhat) - norm.ppf(fhat)
        c = -0.5 * (norm.ppf(hhat) + norm.ppf(fhat))
        if rule == '2afc':
            d /= np.sqrt(2)
            c /= np.sqrt(2)
        estimates.append((d, c))
    return estimates
def q2qnbinom(counts, input_mean, output_mean, dispersion):
    """Quantile-to-quantile mapping for a negative binomial.

    Each count is located on a distribution with mean ``input_mean`` and
    mapped to the same quantile of a distribution with mean ``output_mean``.
    The mapping is carried out twice, once with a normal and once with a gamma
    distribution, each parameterised to have the given mean and the variance
    ``mean * (1 + dispersion * mean)``, and the two results are averaged.

    Note: ``input_mean`` and ``output_mean`` are modified in place (entries
    where either mean is below 1e-14 are increased by 0.25).

    NOTE(review): the ``[0, :]`` indexing suggests the boolean-masked results
    are 2-D with a single row (np.matrix semantics) -- confirm the expected
    input types with the caller.
    """
    # Entries where either mean is (numerically) zero get a small offset.
    zero = logical_or(input_mean < 1e-14, output_mean < 1e-14)
    input_mean[zero] = input_mean[zero] + 0.25
    output_mean[zero] = output_mean[zero] + 0.25
    # r = 1 + dispersion * mean and v = mean * r for the input ...
    ri = 1 + multiply(np.matrix(dispersion).T, input_mean)
    vi = multiply(input_mean, ri)
    # ... and for the output.
    rO = 1 + multiply(np.matrix(dispersion).T, output_mean)
    vO = multiply(output_mean, rO)
    # Split into counts at/above the input mean and counts below it, so the
    # tail probability is always taken on the side the count lies on.
    i = counts >= input_mean
    low = logical_not(i)
    p1 = empty(counts.shape, dtype=np.float64)
    p2 = p1.copy()
    q1, q2 = p1.copy(), p1.copy()
    if i.any():
        # Upper side: log survival function, then the matching output quantile.
        p1[i] = norm.logsf(counts[i], loc=input_mean[i], scale=np.sqrt(vi[i]))[0, :]
        p2[i] = gamma.logsf(counts[i], (input_mean / ri)[i], scale=ri[i])[0, :]
        q1[i] = norm.ppf(1 - np.exp(p1[i]), output_mean[i], np.sqrt(vO[i]))[0, :]
        q2[i] = gamma.ppf(1 - np.exp(p2[i]), np.divide(output_mean[i], rO[i]), scale=rO[i])[0, :]
    if low.any():
        # Lower side: log CDF, then the matching output quantile.
        p1[low] = norm.logcdf(counts[low], loc=input_mean[low], scale=np.sqrt(vi[low]))[0, :]
        p2[low] = gamma.logcdf(counts[low], input_mean[low] / ri[low], scale=ri[low])[0, :]
        q1[low] = norm.ppf(np.exp(p1[low]), loc=output_mean[low], scale=np.sqrt(vO[low]))[0, :]
        q2[low] = gamma.ppf(np.exp(p2[low]), output_mean[low] / rO[low], scale=rO[low])[0, :]
    # Average of the normal-based and gamma-based quantiles.
    return (q1 + q2) / 2
def t_to_z(mr, dof):
    """Convert the nonzero t-values of an image to z-values.

    Both signs are converted through the lower tail of the t distribution
    (positive values are negated first and the resulting z is negated back).
    Zero voxels stay zero.

    Parameters
    ----------
    mr : image object providing ``get_data()``, ``shape``, ``get_affine()``
        and ``get_header()``.
    dof : degrees of freedom of the t distribution.

    Returns
    -------
    A ``nib.nifti1.Nifti1Image`` with the z-values, using the affine and
    header of ``mr``.
    """
    data = mr.get_data()

    # Work on the nonzero voxels only.
    nonzero = data[data != 0]
    Z = np.zeros(len(nonzero))

    threshold = np.zeros(len(nonzero))
    at_or_below = nonzero <= threshold
    above = nonzero > threshold

    # t <= 0: direct lower-tail conversion.
    Z[at_or_below] = norm.ppf(t.cdf(nonzero[at_or_below], df=dof))
    # t > 0: convert the mirrored value, then flip the sign back.
    Z[above] = -norm.ppf(t.cdf(-nonzero[above], df=dof))

    # Write the z-values back into a volume of the original shape.
    z_volume = np.zeros(mr.shape)
    z_volume[mr.get_data() != 0] = Z
    return nib.nifti1.Nifti1Image(z_volume, affine=mr.get_affine(), header=mr.get_header())
def simulate():
    """Plot the density and its running sum for a normal distribution with
    ``loc=5, scale=18`` on [-100, 100], then print five probability/quantile
    results for that distribution."""
    params = dict(loc=5, scale=18)

    line = np.linspace(-100, 100, 201)
    X = norm.pdf(line, **params)

    # Top panel: PDF; bottom panel: cumulative sum of the PDF values.
    pyplot.subplot(211)
    pyplot.plot(line, X)
    pyplot.title('PDF')
    CDF = np.cumsum(X)
    pyplot.subplot(212)
    pyplot.title('CDF')
    pyplot.plot(line, CDF)
    pyplot.show()

    # 1. P(X<8)
    below_8 = norm.cdf(8, **params)
    print('P(X<8): ', below_8)

    # 2. P(X>-2)
    above_minus_2 = 1 - norm.cdf(-2, **params)
    print('P(X>-2): ', above_minus_2)

    # 3. x such that P(X>x) = 0.05
    upper_5_percent = norm.ppf(0.95, **params)
    print('x such that P(X>x) = 0.05: ', upper_5_percent)

    # 4. P(0<=X<4)
    between_0_and_4 = norm.cdf(4, **params) - norm.cdf(0, **params)
    print('P(0<=X<4: ', between_0_and_4)

    # 5. x such that P(abs(X) > abs(x)) = 0.05
    two_sided_point = norm.ppf(0.975, **params)
    print('x such that P(abs(X) > abs(x)) = 0.05: ', two_sided_point)
import numpy as np from scipy.stats import norm from scipy import special # Standard (Normal) Gaussian mu, sigma = 0, 1 def gaussian(x, mu, sigma): normal = (1.0 / np.sqrt(2 * np.pi * sigma**2)) distribution = np.exp(-(x - mu)**2 / (2 * sigma**2)) return normal * distribution # generate range of x values from -2.3 to 2.3 x = np.linspace(norm.ppf(0.01) * 2, norm.ppf(0.99) * 2, 100) # generate pdf (standard normal distribution) pdf = gaussian(x, mu, sigma) # Central Limit Theorem # Generate Samples from a Normal Distribution mu_original, sigma_original = 5, 10 sampleSizes = [100, 1000, 10000] fig = 1 for sample in sampleSizes: trails = np.arange(1, sample) # need to generate CLT statistic statistic = [] for n in trails: # s can be generated from any distribution s = (np.sum(np.random.normal(mu_original, sigma_original, n)) - n*mu_original) / \
def hall_sheather(n, q, alpha=.05):
    """Hall-Sheather bandwidth for sample size ``n`` at quantile ``q``.

    ``alpha`` sets the two-sided normal critical value that enters the
    bandwidth.
    """
    z_q = norm.ppf(q)
    z_alpha = norm.ppf(1. - alpha / 2.)

    numerator = 1.5 * norm.pdf(z_q)**2.
    denominator = 2. * z_q**2. + 1.
    ratio = numerator / denominator

    bandwidth = n**(-1. / 3) * z_alpha**(2./3) * ratio**(1./3)
    return bandwidth
def bofinger(n, q):
    """Bofinger bandwidth for sample size ``n`` at quantile ``q``.

    NOTE(review): the density is evaluated at twice the normal quantile of
    ``q``; this is kept exactly as found -- confirm against the reference
    formula.
    """
    z_q = norm.ppf(q)

    numerator = 9. / 2 * norm.pdf(2 * z_q)**4
    denominator = (2 * z_q**2 + 1)**2
    ratio = numerator / denominator

    bandwidth = n**(-1. / 5) * ratio**(1. / 5)
    return bandwidth
tempReturns = np.log( tempPrices.iloc[1:, 1:4].astype(float).dropna() / tempPrices.iloc[1:, 1:4].shift(1).astype(float).dropna()) tempReturnsLast = tempReturns.iloc[-1, :] cov_matrix = tempReturns.cov() # tempReturns[tempReturns['Brent].cov() mean_returns = tempReturns.mean() tempRev = revenue[revenue['Date'] == date].reset_index(drop=True) weights = np.array( [tempRev['wtiWeight'], tempRev['brentWeight'], tempRev['gasWeight']]) port_mean = mean_returns.dot(weights).item() * 252 portMean.append(port_mean) port_stdev = np.sqrt(weights.T.dot(cov_matrix).dot(weights)).item() portStd.append(port_stdev) #one_day_var = norm.ppf(conf_level, port_mean, port_stdev) oneDayvar.append(norm.ppf(conf_level, port_mean, port_stdev)) annualVar.append( norm.ppf(conf_level, port_mean, port_stdev) * np.sqrt(252)) portReturn.append(tempReturnsLast.dot(weights).item()) #Component VaR #tempReturns.dot(weights).cumprod().plot() #.Portfolio.plot() df = pd.DataFrame({ 'dateId': dateId, 'portMean': portMean, 'portStd': portStd, 'oneDayvar': oneDayvar, 'annualVar': annualVar, 'portReturn': portReturn
import matplotlib.pyplot as plt # Constants EQUITY_INDEX_CUTOFFS = [0, 0.03, 0.07, 0.1, 0.15, 0.3, 1] # Parameters n = 1000 #names in credit index rho = 0.1 num_sims = 1000 prob_default = 0.25 # For equity tranche 0-20%, mezzanine 20%-80%: # tranche_cutoffs = [0, 0.2, 1 ] tranche_cutoffs = EQUITY_INDEX_CUTOFFS tranche_to_watch = 4 #1 is equity, 2 mezz, etc # derived parameters z_score_of_default = norm.ppf(prob_default) beta = rho**0.5 alpha = (1 - rho)**0.5 max_defaults_protected = int(round(n * tranche_cutoffs[tranche_to_watch - 1])) wiped_out_defaults = int(round(n * tranche_cutoffs[tranche_to_watch])) names_in_tranche = wiped_out_defaults - max_defaults_protected # run simulation trial_results = [] names_remaining_in_tranche = [] for _ in range(num_sims): M = random.gauss(0, 1) K = 0 # number of names defaulting for _ in range(n): R_i = beta * M + alpha * random.gauss(0, 1) if R_i < z_score_of_default:
def approx_exp_max_sharpe(mean_sharpe, var_sharpe, nb_trials):
    """Expected Maximum Sharpe Ratio.

    Approximates the expected maximum over ``nb_trials`` Sharpe ratios with
    mean ``mean_sharpe`` and variance ``var_sharpe`` as the mean plus the
    standard deviation times a weighted sum of two normal quantiles, with the
    Euler-Mascheroni constant as weight.
    """
    euler = np.euler_gamma
    z_first = norm.ppf(1 - 1 / nb_trials)
    z_second = norm.ppf(1 - 1 / (nb_trials * np.e))
    expected_max_z = (1 - euler) * z_first + euler * z_second
    return mean_sharpe + np.sqrt(var_sharpe) * expected_max_z
def estimate_ate(self, X, p, treatment, y, segment=None, return_ci=False):
    """Estimate the Average Treatment Effect (ATE).

    For every treatment group an outcome model (``self.model_tau``) is fit on
    the features with the treatment indicator appended as an extra column,
    used to predict the outcome under control and under treatment, and the
    predictions are passed to ``simple_tmle`` together with the propensity
    scores.  The interval is ``ate -/+ se * z`` with ``z`` the normal quantile
    at ``1 - self.ate_alpha / 2``.

    Args:
        X (np.matrix or np.array or pd.Dataframe): a feature matrix
        p (np.ndarray or pd.Series or dict): an array of propensity scores of float (0,1) in the single-treatment
            case; or, a dictionary of treatment groups that map to propensity vectors of float (0,1)
        treatment (np.array or pd.Series): a treatment vector
        y (np.array or pd.Series): an outcome vector
        segment (np.array, optional): An optional segment vector of int. If given, the ATE and its CI will be
            estimated for each segment.
        return_ci (bool, optional): Whether to return confidence intervals.
            Not referenced in this method body: the bounds are always returned.

    Returns:
        (tuple): The ATE and its confidence interval (LB, UB) for each treatment, t and segment, s
    """
    check_treatment_vector(treatment, self.control_name)
    X, treatment, y = convert_pd_to_np(X, treatment, y)
    # Treatment groups are all distinct treatment values except the control.
    self.t_groups = np.unique(treatment[treatment != self.control_name])
    self.t_groups.sort()

    check_p_conditions(p, self.t_groups)
    # Normalise p to a dict keyed by treatment group.
    if isinstance(p, np.ndarray):
        treatment_name = self.t_groups[0]
        p = {treatment_name: convert_pd_to_np(p)}
    elif isinstance(p, dict):
        p = {
            treatment_name: convert_pd_to_np(_p)
            for treatment_name, _p in p.items()
        }

    ate = []
    ate_lb = []
    ate_ub = []

    for i, group in enumerate(self.t_groups):
        logger.info('Estimating ATE for group {}.'.format(group))
        # 1 for rows in this treatment group, 0 otherwise.
        w_group = (treatment == group).astype(int)
        p_group = p[group]

        if self.calibrate_propensity:
            logger.info('Calibrating propensity scores.')
            p_group = calibrate(p_group, w_group)

        # Predicted outcomes under control (indicator 0) and treatment (1).
        yhat_c = np.zeros_like(y, dtype=float)
        yhat_t = np.zeros_like(y, dtype=float)
        if self.cv:
            # Out-of-fold predictions: fit on the training rows of each split
            # and predict its validation rows.
            for i_fold, (i_trn, i_val) in enumerate(self.cv.split(X, y), 1):
                logger.info(
                    'Training an outcome model for CV #{}'.format(i_fold))
                self.model_tau.fit(
                    np.hstack((X[i_trn], w_group[i_trn].reshape(-1, 1))),
                    y[i_trn])

                yhat_c[i_val] = self.model_tau.predict(
                    np.hstack((X[i_val], np.zeros((len(i_val), 1)))))

                yhat_t[i_val] = self.model_tau.predict(
                    np.hstack((X[i_val], np.ones((len(i_val), 1)))))
        else:
            # No cross-validation: fit and predict on the full data.
            self.model_tau.fit(np.hstack((X, w_group.reshape(-1, 1))), y)

            yhat_c = self.model_tau.predict(
                np.hstack((X, np.zeros((len(y), 1)))))
            yhat_t = self.model_tau.predict(
                np.hstack((X, np.ones((len(y), 1)))))

        if segment is None:
            logger.info('Training the TMLE learner.')
            _ate, se = simple_tmle(y, w_group, yhat_c, yhat_t, p_group)
            _ate_lb = _ate - se * norm.ppf(1 - self.ate_alpha / 2)
            _ate_ub = _ate + se * norm.ppf(1 - self.ate_alpha / 2)
        else:
            assert segment.shape[0] == X.shape[
                0] and segment.ndim == 1, 'Segment must be the 1-d np.array of int.'
            segments = np.unique(segment)

            _ate = []
            _ate_lb = []
            _ate_ub = []
            for s in sorted(segments):
                logger.info(
                    'Training the TMLE learner for segment {}.'.format(s))
                # Rows of this segment whose control prediction lies below the
                # 99th percentile of all control predictions.
                filt = (segment == s) & (yhat_c < np.quantile(yhat_c, q=.99))
                _ate_s, se = simple_tmle(y[filt], w_group[filt],
                                         yhat_c[filt], yhat_t[filt],
                                         p_group[filt])
                _ate_lb_s = _ate_s - se * norm.ppf(1 - self.ate_alpha / 2)
                _ate_ub_s = _ate_s + se * norm.ppf(1 - self.ate_alpha / 2)

                _ate.append(_ate_s)
                _ate_lb.append(_ate_lb_s)
                _ate_ub.append(_ate_ub_s)

        # One entry per treatment group: a scalar, or a per-segment list.
        ate.append(_ate)
        ate_lb.append(_ate_lb)
        ate_ub.append(_ate_ub)

    return np.array(ate), np.array(ate_lb), np.array(ate_ub)
def mk_test(x, time, confidence_interval=False, alpha=0.05):
    """
    This function is derived from code originally posted by Sat Kumar Tomer
    ([email protected])
    See also: http://vsp.pnnl.gov/help/Vsample/Design_Trend_Mann_Kendall.htm

    The purpose of the Mann-Kendall (MK) test (Mann 1945, Kendall 1975, Gilbert
    1987) is to statistically assess if there is a monotonic upward or downward
    trend of the variable of interest over time. A monotonic upward (downward)
    trend means that the variable consistently increases (decreases) through
    time, but the trend may or may not be linear. The MK test can be used in
    place of a parametric linear regression analysis, which can be used to test
    if the slope of the estimated linear regression line is different from
    zero. The regression analysis requires that the residuals from the fitted
    regression line be normally distributed; an assumption not required by the
    MK test, that is, the MK test is a non-parametric (distribution-free) test.
    Hirsch, Slack and Smith (1982, page 107) indicate that the MK test is best
    viewed as an exploratory analysis and is most appropriately used to
    identify stations where changes are significant or of large magnitude and
    to quantify these findings.

    Input:
        x:     a vector of data (any sequence convertible to a numpy array)
        time:  a vector of observation times, same length as x; the values
               should be distinct, since pairwise time differences are used
               as divisors
        confidence_interval: if true, additionally call
               ``bootstr_confidence(sorted_slopes, 100)`` and return its fifth
               result as ``std_conf``
        alpha: significance level (0.05 default)

    Output:
        trend: tells the trend (increasing, decreasing or no trend)
        h: True (if trend is present) or False (if trend is absence)
        p: p value of the significance test
        z: normalized test statistics
        slope: median of the pairwise slopes (x[j] - x[k]) / (time[j] - time[k])
        std_conf: value taken from ``bootstr_confidence`` (0 when
               ``confidence_interval`` is false)

    Examples
    --------
      >>> x = np.random.rand(100)
      >>> trend, h, p, z, slope, std_conf = mk_test(x, np.arange(100))
    """
    # Work on arrays: the tie count needs element-wise comparison, which a plain list does not
    # provide (``some_list == value`` is a single bool).
    x = np.asarray(x)
    time = np.asarray(time)
    n = len(x)

    # All pairs (k, j) with k < j, in the same order as a nested ``for k / for j`` loop.
    k_idx, j_idx = np.triu_indices(n, k=1)
    dx = x[j_idx] - x[k_idx]

    # calculate S
    s = np.sign(dx).sum()
    q = dx / (time[j_idx] - time[k_idx])

    sort_q = np.sort(q)
    slope = np.median(sort_q)

    # calculate the var(s); tp holds the size of each group of tied values, and groups of size 1
    # contribute nothing, so the same formula covers the tie-free case.
    _, tp = np.unique(x, return_counts=True)
    var_s = (n * (n - 1) * (2 * n + 5) - np.sum(tp * (tp - 1) *
                                                 (2 * tp + 5))) / 18

    # Continuity-corrected normal approximation of S.
    if s > 0:
        z = (s - 1) / np.sqrt(var_s)
    elif s < 0:
        z = (s + 1) / np.sqrt(var_s)
    else:  # s == 0:
        z = 0

    # calculate the p_value
    p = 2 * (1 - norm.cdf(abs(z)))  # two tail test
    h = abs(z) > norm.ppf(1 - alpha / 2)

    if (s < 0) and h:
        trend = 'decreasing'
    elif (s > 0) and h:
        trend = 'increasing'
    else:
        trend = 'no trend'

    std_conf = 0
    if confidence_interval:
        # Only the fifth of the six returned values is used.
        _, _, _, _, std_conf, _ = bootstr_confidence(sort_q, 100)

    return trend, h, p, z, slope, std_conf
def fit_metric_model(self):
    """Fit the model that predicts the (transformed) target metric from job parameters.

    Loads all aggregated results and parameters through ``self.result_reader``, builds the target
    from the ``test_<scorer>`` columns (optionally transformed according to
    ``self.metric_transformation`` and averaged over scorers when ``self.avg_metrics`` is set),
    then fits an encoder/imputer pipeline, a ``RandomForestRegressor`` and a
    ``RandomForestVariance`` on it.

    The fit is skipped when there are no more than ``self.min_nb_of_models`` results, or when a
    model has already been fitted and the number of results has not changed since the last call.

    Returns:
        self

    Raises:
        ValueError: if ``self.metric_transformation`` is not one of ``None``, ``"rank"``,
            ``"normal"`` or ``"default"``.
    """
    logger.info("start computing metric model...")

    ### Load the results
    df_results = self.result_reader.load_all_results(aggregate=True)

    # Keep the count seen by the previous call *before* overwriting it: comparing the counter
    # with itself after the update made the "nothing new" test always true, so the model was
    # never refitted once it had been fitted a first time.
    previous_nb_models = getattr(self, "_nb_models_done", None)
    self._nb_models_done = len(df_results)
    if self._nb_models_done <= self.min_nb_of_models:
        return self

    if (previous_nb_models is not None
            and self._nb_models_done == previous_nb_models
            and self.params_training_columns is not None):
        # Already fitted on exactly this many results
        return self

    ### Load the params
    df_params = self.result_reader.load_all_params()

    df_merged_result = pd.merge(df_params,
                                df_results,
                                how="inner",
                                on="job_id")

    training_cols = diff(list(df_params.columns), ["job_id"])

    # X dataframe for parameters
    dfX_params = df_merged_result.loc[:, training_cols]

    ### Retrieve the target metric
    if self.avg_metrics:
        scorers = self.job_config.scoring
    else:
        scorers = [self.job_config.main_scorer]  # only the main scorer is used

    N = dfX_params.shape[0]
    all_y_params = []
    for scorer in scorers:
        y_params = df_merged_result["test_%s" % scorer]  # raw metric

        # Replace NaN by the scorer's observed minimum score; this cannot work if the column
        # contains only NaN.
        y_params = y_params.fillna(y_params.min()).values

        if self.metric_transformation is None:
            pass

        elif self.metric_transformation == "rank":
            ### Non-parametric rank transformation: behaves like a uniform law
            y_params = kde_transfo_quantile(y_params)

        elif self.metric_transformation == "normal":
            ### Non-parametric normal transformation: behaves like a normal law
            y_params = norm.ppf(kde_transfo_quantile(y_params))

        elif self.metric_transformation == "default":
            ### Default per-scorer transformation (log-like function)
            f = get_metric_default_transformation(scorer)
            y_params = f(y_params)

            if self.avg_metrics:
                # When averaging several metrics, centre and scale each one first
                y_params = (y_params - np.mean(y_params)) / np.std(y_params)

        else:
            raise ValueError("I don't know this metric_transformation %s" %
                             self.metric_transformation)

        all_y_params.append(y_params.reshape((N, 1)))

    if len(all_y_params) > 1:
        y_params = np.concatenate(all_y_params, axis=1).mean(axis=1)
    else:
        y_params = all_y_params[0].reshape((N, ))

    # create model
    transformer_model = GraphPipeline(models={
        "encoder": NumericalEncoder(),
        "imputer": NumImputer()
    },
                                      edges=[("encoder", "imputer")])

    xx_params = transformer_model.fit_transform(dfX_params)

    random_forest = RandomForestRegressor(n_estimators=100,
                                          min_samples_leaf=5)

    random_forest.fit(xx_params, y_params)

    random_forest_variance = RandomForestVariance(random_forest)
    random_forest_variance.fit(xx_params, y_params)

    # Publish the fitted objects only once everything succeeded
    self.params_training_columns = training_cols
    self.transformer_model = transformer_model
    self.random_forest = random_forest
    self.random_forest_variance = random_forest_variance

    self._nb_models_done = len(df_results)

    logger.info("metric model fitted")

    return self
# X ~ N(mean, variance)
#############################
from scipy.stats import norm

# scipy parameterises the normal law by ``loc`` (mean) and ``scale`` (standard deviation)
loc = 3
mean = loc
scale = 2
std_variance = scale
x = 2.5

# Freeze the distribution once; every call below uses the same loc/scale.
frozen_normal = norm(loc, scale)

# Density and cumulative probability at x
pdf_value = frozen_normal.pdf(x)
print(f"When X ~ N({loc}, {scale}^2),\t pdf(X = {x}) = {pdf_value}")
cdf_value = frozen_normal.cdf(x)
print(f"When X ~ N({loc}, {scale}^2),\t cdf(X <= {x}) = {cdf_value}")

# ppf: percent point function (inverse of the cdf)
p = 0.25
ppf_value = frozen_normal.ppf(p)
print(f"When X ~ N({loc}, {scale}^2),\t ppf(p = {p}) = {ppf_value}")
print(f"When X ~ N({loc}, {scale}^2),\t IQR = [{norm.ppf(0.25, loc, scale)}, {norm.ppf(0.75, loc, scale)}]")

# rvs: random variates
sample_size = 10
random_variates = frozen_normal.rvs(size=sample_size)
print(f"Random Variates (size :{sample_size}) from X ~ N({loc}, {scale}^2)\n", random_variates)
print()

#%%
#############################
# Gamma Distribution
# X ~ Gamma(k, theta) but in scipy, theta = 1
# f(x;k, theta) = x**(k-1) * exp(-x / theta) / theta ** k / gamma_function(k)
#############################
from scipy.stats import gamma

k = 1
def absPortf_HistVaR(listofInv=dfTime,
                     numDays=2,
                     MoneyVol=1000,
                     startDt='22/11/2016',
                     endDt='01/12/2016'):
    """Print the 95% and 99% Value-at-Risk of the summed portfolio over a date window.

    For every pair of adjacent value columns the difference (second minus first) is taken, the
    differences are summed across pairs, and the percentage change of that sum between
    consecutive rows is used as the return series. A normal distribution with the mean and
    standard deviation of those returns supplies the 5% and 1% quantiles.

    Args:
        listofInv (pd.DataFrame): frame with a ``Date`` column followed by the value columns.
        numDays (int): not used by this function.
        MoneyVol (float): portfolio value the VaR is expressed on.
        startDt (str): first day of the window, formatted ``dd/mm/YYYY``.
        endDt (str): last day of the window (inclusive), formatted ``dd/mm/YYYY``.

    Returns:
        None. The mean, the standard deviation and both VaR figures are printed.
    """
    startDate = np.datetime64(datetime.datetime.strptime(startDt, '%d/%m/%Y'))
    endDate = np.datetime64(datetime.datetime.strptime(endDt, '%d/%m/%Y'))

    ##remove Timestamp, Trade Open, Trade Close
    # NOTE(review): ``.ix`` was removed in pandas 1.0. It is kept because its label/position
    # fallback depends on the index type of the incoming frame -- confirm before porting.
    listofInv = pd.DataFrame(listofInv.ix[1:, 0:])
    listofInv['Date'] = pd.to_datetime(listofInv['Date'])
    listofInv = pd.DataFrame(listofInv)

    # Restrict to the requested window. The selection used to be evaluated and thrown away, so
    # startDt/endDt had no effect on the result.
    listofInv = listofInv[(listofInv.Date >= startDate)
                          & (listofInv.Date <= endDate)]

    set1 = pd.DataFrame()
    # NOTE(review): the column labels come from the module-level ``dfTime`` and the loop bound
    # from the module-level ``colListFull``, not from ``listofInv`` -- confirm this is intended
    # when a different frame is passed in.
    colListPrime = list(dfTime.columns.values)
    del colListPrime[0]
    colListPrime = colListPrime[::2]

    # Column 0 is skipped; columns (1, 2), (3, 4), ... are treated as pairs.
    for i in range(1, len(colListFull), 2):
        set2 = (listofInv[listofInv.columns[i + 1]].astype(float)) - (
            listofInv[listofInv.columns[i]].astype(float))
        set1 = pd.concat([set1, set2], axis=1)
        del set2

    set1.columns = [colListPrime]

    # Every column gets the same group key (1), so this sums across all columns into ``sum1``.
    SumSet = set1.groupby(
        (np.arange(len(set1.columns)) // len(set1.columns) * 10) + 1,
        axis=1).sum().add_prefix('sum')
    SumSet["rets"] = SumSet["sum1"].pct_change()

    # Neutralise the undefined first return and any division by zero
    SumSet.replace(np.nan, 0, inplace=True)
    SumSet = SumSet.replace([np.inf, -np.inf], 0)
    SumSet = SumSet.astype(float)

    mu = np.mean(SumSet["rets"])
    sigma = np.std(SumSet["rets"])
    print(mu)
    print(sigma)

    # Loss relative to MoneyVol at the 5% / 1% quantile of the fitted normal return
    valueAtRisk_95 = MoneyVol - MoneyVol * (norm.ppf(0.05, mu, sigma) + 1)
    valueAtRisk_99 = MoneyVol - MoneyVol * (norm.ppf(0.01, mu, sigma) + 1)

    print("Portfolio Historical Value at Risk with 95% confidence is: " +
          str(valueAtRisk_95))
    print("Portfolio Historical Value at Risk with 99% confidence is: " +
          str(valueAtRisk_99))
def calculate_z_score(self):
    """Return a random z-score: a uniform draw mapped through the standard normal inverse CDF."""
    uniform_draw = random.random()
    z_score = norm.ppf(uniform_draw)
    return z_score
# Scatter the test set in the two latent dimensions, coloured by y_test_
x_test_encoded = encoder.predict(x_test, batch_size=batch_size)
latent_dim_0 = x_test_encoded[:, 0]
latent_dim_1 = x_test_encoded[:, 1]
plt.figure(figsize=(6, 6))
plt.scatter(latent_dim_0, latent_dim_1, c=y_test_)
plt.colorbar()
# plt.show()
plt.savefig('x_test_encoded.jpg')

# Build the generator: latent input -> decoder_h -> decoder_mean
decoder_input = Input(shape=(latent_dim, ))
_h_decoded = decoder_h(decoder_input)
_x_decoded_mean = decoder_mean(_h_decoded)
generator = Model(decoder_input, _x_decoded_mean)

# Observe how moving along the two latent dimensions changes the generated output
n = 15  # figure with 15x15 digits
digit_size = 28
canvas_side = digit_size * n
figure = np.zeros((canvas_side, canvas_side))

# Build the latent pairs from quantiles of the normal distribution
grid_quantiles = np.linspace(0.05, 0.95, n)
grid_x = norm.ppf(grid_quantiles)
grid_y = norm.ppf(grid_quantiles)

for i, yi in enumerate(grid_x):
    tile_rows = slice(i * digit_size, (i + 1) * digit_size)
    for j, xi in enumerate(grid_y):
        tile_cols = slice(j * digit_size, (j + 1) * digit_size)
        z_sample = np.array([[xi, yi]])
        x_decoded = generator.predict(z_sample)
        digit = x_decoded[0].reshape(digit_size, digit_size)
        figure[tile_rows, tile_cols] = digit

plt.figure(figsize=(10, 10))
plt.imshow(figure, cmap='Greys_r')
# plt.show()
plt.savefig('Greys_r.jpg')
def absPortf_VC_spec_VaR(listofInv=dfTMP,
                         numDays=2,
                         MoneyVol=1000,
                         startDt='2016-11-25',
                         endDt='2016-11-30'):
    """
    Variance-Covariance Method of computing VaR of specific Stocks (separately, not as a
    Portfolio) over a time frame.

    For each selected column the percentage change between consecutive rows is fitted with a
    normal distribution (mean and standard deviation of the returns) and the 5% / 1% quantiles
    are turned into a loss on ``MoneyVol``. The per-column figures are then squared, summed and
    square-rooted into ``FINAL_95`` / ``FINAL_99``.

    Args:
        listofInv (pd.DataFrame): frame with a ``Date`` column followed by the value columns.
        numDays (int): not used by this function.
        MoneyVol (float): value the VaR is expressed on.
        startDt (str): first day of the window, formatted ``YYYY-MM-DD``.
        endDt (str): last day of the window (inclusive), formatted ``YYYY-MM-DD``.

    Returns:
        None. The two result frames are printed.
    """
    colListFull = list(listofInv.columns.values)
    colListPrime = list(listofInv.columns.values)
    listRisk_95 = []
    listRisk_99 = []

    startDate = np.datetime64(startDt)
    endDate = np.datetime64(endDt)

    ##remove Timestamp, Trade Open, Trade Close
    # NOTE(review): ``.ix`` was removed in pandas 1.0; whether ``1:`` is a label or a position
    # here depends on the index type -- confirm before porting to ``.loc``/``.iloc``.
    listofInv = pd.DataFrame(listofInv.ix[1:, 0:])
    listofInv = pd.DataFrame(listofInv)
    listofInv['Date'] = pd.to_datetime(listofInv['Date'], format='%Y-%m-%d')
    listofInv['Date'] = pd.to_datetime(listofInv['Date'])

    # Keep only the rows inside [startDate, endDate]
    mask = (listofInv['Date'] >= startDate) & (listofInv['Date'] <= endDate)
    listofInv = listofInv.loc[mask]

    # Remove the first entry from both column lists
    del colListFull[0]
    del colListPrime[0]

    # Drop the Date column from the data frame
    # NOTE(review): the axis is passed positionally; newer pandas requires ``axis=1``.
    listofInv = listofInv.drop('Date', 1)
    # Every second remaining column name; used below as labels of the result frames
    colListPrime = colListPrime[::2]

    # Visits the columns at odd positions 1, 3, 5, ... (changed to start at 1 instead of 2)
    for i in range(1, len(colListFull), 2):
        # Single column at position i
        tmp = pd.DataFrame(listofInv.ix[1:, (i):(i + 1)])
        # Drop the rows where that column is null
        tmp = tmp[pd.notnull(tmp[tmp.columns[(0)]])]

        # Returns between consecutive remaining rows, stored as "rets<column name>"
        tmp["rets" + str(tmp.columns[0])] = tmp[tmp.columns[0]].pct_change()
        mu = np.mean(tmp["rets" + str(tmp.columns[0])])
        sigma = np.std(tmp["rets" + str(tmp.columns[0])])

        # Loss relative to MoneyVol at the 5% / 1% quantile of the fitted normal return
        valueAtRisk_95 = MoneyVol - MoneyVol * (norm.ppf(0.05, mu, sigma) + 1)
        valueAtRisk_99 = MoneyVol - MoneyVol * (norm.ppf(0.01, mu, sigma) + 1)
        listRisk_95.append(valueAtRisk_95)
        listRisk_99.append(valueAtRisk_99)

        del tmp

    # One row per confidence level, one column per visited investment
    setRisk_95 = pd.DataFrame(listRisk_95).T
    setRisk_99 = pd.DataFrame(listRisk_99).T
    setRisk_95.columns = [colListPrime]
    setRisk_99.columns = [colListPrime]

    # Square, sum across columns (every column gets group key 1), then take the square root
    setRisk_95 = setRisk_95**2
    setRisk_99 = setRisk_99**2
    setRisk_95['FINAL_95'] = setRisk_95.groupby(
        (np.arange(len(setRisk_95.columns)) // len(setRisk_95.columns) * 10) +
        1,
        axis=1).sum().add_prefix('sum')
    setRisk_95['FINAL_95'] = setRisk_95['FINAL_95']**0.5
    setRisk_99['FINAL_99'] = setRisk_99.groupby(
        (np.arange(len(setRisk_99.columns)) // len(setRisk_99.columns) * 10) +
        1,
        axis=1).sum().add_prefix('sum')
    setRisk_99['FINAL_99'] = setRisk_99['FINAL_99']**0.5

    # Created but not used afterwards
    setPortfVaR = pd.DataFrame()
    print(setRisk_95)
    print(setRisk_99)
def interval_arcsin(x, alpha):
    """Return the half-width of the two-sided normal interval at level ``alpha``.

    The half-width is the absolute ``alpha / 2`` normal quantile times the square root of
    ``variance_of_arcsin(x)``.
    """
    critical_value = np.abs(norm.ppf(alpha / 2))
    standard_error = np.sqrt(variance_of_arcsin(x))
    return critical_value * standard_error
def estimate_ate(self,
                 X,
                 p,
                 treatment,
                 y,
                 bootstrap_ci=False,
                 n_bootstraps=1000,
                 bootstrap_size=10000):
    """Estimate the Average Treatment Effect (ATE).

    Args:
        X (np.matrix): a feature matrix
        p (np.ndarray or dict): an array of propensity scores of float (0,1) in the
            single-treatment case or, a dictionary of treatment groups that map to propensity
            vectors of float (0,1)
        treatment (np.array): a treatment vector
        y (np.array): an outcome vector
        bootstrap_ci (bool): whether run bootstrap for confidence intervals
        n_bootstraps (int): number of bootstrap iterations
        bootstrap_size (int): number of samples per bootstrap

    Returns:
        The mean and confidence interval (LB, UB) of the ATE estimate, one entry per treatment
        group. With ``bootstrap_ci`` the bounds are bootstrap percentiles, otherwise they come
        from the normal approximation.
    """
    te = self.fit_predict(X, p, treatment, y)

    check_p_conditions(p, self.t_groups)
    if isinstance(p, np.ndarray):
        treatment_name = self.t_groups[0]
        p = {treatment_name: p}

    n_groups = self.t_groups.shape[0]
    n_samples = X.shape[0]
    # Two-sided normal critical value, identical for every group
    z_crit = norm.ppf(1 - self.ate_alpha / 2)

    ate = np.zeros(n_groups)
    ate_lb = np.zeros(n_groups)
    ate_ub = np.zeros(n_groups)

    for i, group in enumerate(self.t_groups):
        w = (treatment == group).astype(int)
        prob_treatment = float(sum(w)) / n_samples
        _ate = te[:, i].mean()

        # Standard error = sqrt(variance / n). The division by n belongs inside the square
        # root; dividing the root itself by n understated the interval width.
        se = np.sqrt((self.vars_t[group] / prob_treatment +
                      self.vars_c[group] / (1 - prob_treatment) +
                      te[:, i].var()) / n_samples)

        ate[i] = _ate
        ate_lb[i] = _ate - se * z_crit
        ate_ub[i] = _ate + se * z_crit

    if not bootstrap_ci:
        return ate, ate_lb, ate_ub

    # Bootstrapping refits the learner, so keep the fitted state to restore it afterwards.
    t_groups_global = self.t_groups
    _classes_global = self._classes
    model_mu_global = deepcopy(self.model_mu)
    models_tau_global = deepcopy(self.models_tau)

    logger.info('Bootstrap Confidence Intervals for ATE')
    ate_bootstraps = np.zeros(shape=(n_groups, n_bootstraps))

    for n in tqdm(range(n_bootstraps)):
        cate_b = self.bootstrap(X, p, treatment, y, size=bootstrap_size)
        ate_bootstraps[:, n] = cate_b.mean()

    ate_lower = np.percentile(ate_bootstraps, (self.ate_alpha / 2) * 100,
                              axis=1)
    ate_upper = np.percentile(ate_bootstraps,
                              (1 - self.ate_alpha / 2) * 100,
                              axis=1)

    # set member variables back to global (currently last bootstrapped outcome)
    self.t_groups = t_groups_global
    self._classes = _classes_global
    self.model_mu = deepcopy(model_mu_global)
    self.models_tau = deepcopy(models_tau_global)

    return ate, ate_lower, ate_upper
def main(): data = bmnist()[:2] # ignore test split model = VAE(z_dim=ARGS.zdim) print('VAE parameter count:', sum(p.numel() for p in model.parameters())) model = model.to(device) optimizer = torch.optim.Adam(model.parameters(), lr=5e-4) writer = SummaryWriter('logs/log1') train_curve, val_curve = [], [] for epoch in range(ARGS.epochs): #""" elbos = run_epoch(model, data, optimizer, writer) train_elbo, val_elbo = elbos writer.add_scalars('data/elbos', { 'train elbo': train_elbo.item(), 'val elbo': val_elbo.item() }, epoch) train_curve.append(train_elbo) val_curve.append(val_elbo) print(f"[Epoch {epoch}] train elbo: {train_elbo} val_elbo: {val_elbo}") # """ # -------------------------------------------------------------------- # Add functionality to plot samples from model during training. # You can use the make_grid functioanlity that is already imported. # -------------------------------------------------------------------- if epoch == 36: torch.save(model.state_dict(), 'manifoldstate' + str(ARGS.zdim) + '.pt') #model.load_state_dict(torch.load('modelstate/modelstate30.pt')) #model.eval() model_im = model.sample(9)[0] im_grid = make_grid(model_im, nrow=3) writer.add_image('data/DecoIm', im_grid, epoch) #plt.imshow(im_grid.permute(1, 2, 0)) #plt.axis('off') #plt.savefig('VAEsample' + str(epoch) + '.png') #plt.close() # -------------------------------------------------------------------- # Add functionality to plot the learned data manifold after # if required (i.e., if zdim == 2). You can use the make_grid # functionality that is already imported. 
# -------------------------------------------------------------------- if ARGS.zdim == 2: x = torch.linspace(norm.ppf(0.1), norm.ppf(0.9), 10) xx, xy = torch.meshgrid(x, x) z_mesh = torch.stack([xx, xy], 0) z_mesh = z_mesh.view(2, -1).t() model_bern = model.sample(1, z_mesh)[1] im_grid = make_grid(model_bern, nrow=10) writer.add_image('data/ManifoldIm', im_grid, epoch) #plt.imshow(im_grid.permute(1, 2, 0)) #plt.axis('off') #plt.savefig('VAEmanifold.png') save_elbo_plot(train_curve, val_curve, 'elbo.pdf') writer.export_scalars_to_json("./all_scalars.json") writer.close()
# Use the device location as default for both variables and sparse connectivity
device_location = genn_wrapper.VarLocation_DEVICE
model.default_var_location = device_location
model.default_sparse_connectivity_location = device_location

# Initial state of the LIF neurons: normally distributed "V", zero "RefracTime"
initial_voltage = genn_model.init_var("Normal", {"mean": -58.0, "sd": 5.0})
lif_init = {"V": initial_voltage, "RefracTime": 0.0}
poisson_init = {"current": 0.0}

exp_curr_params = {"tau": 0.5}

# Cap each population's delay at the given quantile of its normal delay distribution
quantile = 0.9999
normal_quantile_cdf = norm.ppf(quantile)
max_delay = dict(
    (pop, MEAN_DELAY[pop] + (DELAY_SD[pop] * normal_quantile_cdf))
    for pop in POPULATION_NAMES)
print("Max excitatory delay:%fms , max inhibitory delay:%fms" %
      (max_delay["E"], max_delay["I"]))

# Calculate maximum dendritic delay slots
# **NOTE** it seems inefficient using maximum for all but this allows more aggressive merging of postsynaptic models
longest_delay = max(itervalues(max_delay))
max_dendritic_delay_slots = int(round(longest_delay / DT_MS))
print("Max dendritic delay slots:%d" % max_dendritic_delay_slots)

print("Creating neuron populations:")
total_neurons = 0
neuron_populations = {}
def compute_association(target,
                        features,
                        function=information_coefficient,
                        dropna='all',
                        target_ascending=False,
                        features_ascending=False,
                        n_jobs=1,
                        min_n_per_job=100,
                        n_features=0.95,
                        n_samplings=30,
                        confidence=0.95,
                        n_permutations=30,
                        random_seed=RANDOM_SEED,
                        filepath=None):
    """
    Compute: score_i = function(target, feature_i) for all features.
    Compute confidence interval (CI) for n_features features.
    Compute P-value and FDR (BH) for all features.
    :param target: Series; (n_samples); must have name and indices, matching features's column index
    :param features: DataFrame; (n_features, n_samples); must have row and column indices
    :param function: function; scoring function
    :param dropna: str; 'any' or 'all'
    :param target_ascending: bool; target is ascending or not
    :param n_jobs: int; number of jobs to parallelize
    :param min_n_per_job: int; minimum number of n per job; with fewer than
        n_jobs * min_n_per_job features a single job is used
    :param features_ascending: bool; True if features scores increase from top to bottom, and False otherwise
    :param n_features: int or float; number of features to compute confidence interval and plot;
        number threshold if >= 1, percentile threshold if < 1, and don't compute if None
    :param n_samplings: int; number of bootstrap samplings to build distribution to get CI; must be >= 2 to compute CI
    :param confidence: float; fraction compute confidence interval
    :param n_permutations: int; number of permutations for permutation test to compute P-val and FDR
    :param random_seed: int;
    :param filepath: str; if given, the results table is written there as tab-separated text
    :return: Series, DataFrame, DataFrame; the preprocessed target, the preprocessed features and
        the results (n_features, 8 ('score', '<confidence> moe',
        'p-value (forward)', 'p-value (reverse)', 'p-value',
        'fdr (forward)', 'fdr (reverse)', 'fdr'))
    """

    # TODO: make empty DataFrame to absorb the results instead of concatenation

    # Make sure target is a Series and features a DataFrame
    # Keep samples found in both target and features
    # Drop features with less than 2 unique values
    target, features = _preprocess_target_and_features(
        target, features, dropna=dropna, target_ascending=target_ascending)

    results = DataFrame(index=features.index,
                        columns=[
                            'score', '{} moe'.format(confidence),
                            'p-value (forward)', 'p-value (reverse)',
                            'p-value', 'fdr (forward)', 'fdr (reverse)', 'fdr'
                        ])

    #
    # Compute: score_i = function(target, feature_i)
    #
    print_log('Scoring (n_jobs={}) ...'.format(n_jobs))

    # Split features for parallel computing
    if features.shape[0] < n_jobs * min_n_per_job:
        n_jobs = 1
    split_features = split_dataframe(features, n_jobs)

    # Score
    # scores = _score((target,features,function))
    #
    scores = concat(parallelize(_score, [(target, f, function)
                                         for f in split_features], n_jobs),
                    verify_integrity=True)

    # Load scores and sort results by scores
    # NOTE(review): ``.ix`` (used throughout this function) was removed in pandas 1.0.
    results.ix[scores.index, 'score'] = scores
    results.sort_values('score', ascending=features_ascending, inplace=True)

    #
    #  Compute CI using bootstrapped distribution
    #
    if n_samplings < 2:
        print_log('Not computing CI because n_samplings < 2.')
    elif ceil(0.632 * features.shape[1]) < 3:
        print_log('Not computing CI because 0.632 * n_samples < 3.')
    else:
        print_log(
            'Computing {} CI for using distributions built by {} bootstraps ...'
            .format(confidence, n_samplings))
        indices_to_bootstrap = get_top_and_bottom_indices(
            results, 'score', n_features)

        # Bootstrap: for n_sampling times, randomly choose 63.2% of the samples, score, and build score distribution
        sampled_scores = DataFrame(index=indices_to_bootstrap,
                                   columns=range(n_samplings))
        seed(random_seed)
        for c_i in sampled_scores:
            # Random sample
            ramdom_samples = choice(features.columns.tolist(),
                                    int(ceil(0.632 *
                                             features.shape[1]))).tolist()
            sampled_target = target.ix[ramdom_samples]
            sampled_features = features.ix[indices_to_bootstrap,
                                           ramdom_samples]
            # Save the RNG state before scoring and restore it afterwards, so whatever random
            # numbers ``function`` consumes do not change which samples the next iteration draws
            rs = get_state()

            # Score
            sampled_scores.ix[:, c_i] = sampled_features.apply(
                lambda f: function(sampled_target, f), axis=1)

            set_state(rs)

        # Compute scores' confidence intervals using bootstrapped score distributions
        # TODO: improve confidence interval calculation
        # NOTE(review): this is the one-sided quantile at ``confidence``; a two-sided interval
        # would use (1 + confidence) / 2 -- confirm which is intended.
        z_critical = norm.ppf(q=confidence)

        # Load confidence interval: margin of error = z * std / sqrt(n_samplings)
        results.ix[sampled_scores.index,
                   '{} moe'.format(confidence)] = sampled_scores.apply(
                       lambda f: z_critical * (f.std() / sqrt(n_samplings)),
                       axis=1)

    #
    # Compute P-values and FDRs by scores against permuted target
    #
    if n_permutations < 1:
        print_log('Not computing P-value and FDR because n_perm < 1.')
    else:
        print_log(
            'Computing P-value & FDR by scoring against {} permuted targets (n_jobs={}) ...'
            .format(n_permutations, n_jobs))

        # Permute and score
        permutation_scores = concat(parallelize(
            _permute_and_score,
            [(target, f, function, n_permutations, random_seed)
             for f in split_features], n_jobs),
                                    verify_integrity=True)

        print_log('\tComputing P-value and FDR ...')
        # All scores, pooled over features and permutations into one null distribution
        all_permutation_scores = permutation_scores.values.flatten()
        for i, (r_i, r) in enumerate(results.iterrows()):
            # This feature's score
            s = r.ix['score']

            # Compute forward P-value: fraction of null scores >= s, floored at 1 / n_null
            p_value_forward = (all_permutation_scores >=
                               s).sum() / len(all_permutation_scores)
            if not p_value_forward:
                p_value_forward = float(1 / len(all_permutation_scores))
            results.ix[r_i, 'p-value (forward)'] = p_value_forward

            # Compute reverse P-value: fraction of null scores <= s, floored at 1 / n_null
            p_value_reverse = (all_permutation_scores <=
                               s).sum() / len(all_permutation_scores)
            if not p_value_reverse:
                p_value_reverse = float(1 / len(all_permutation_scores))
            results.ix[r_i, 'p-value (reverse)'] = p_value_reverse

        # Compute forward FDR
        results.ix[:, 'fdr (forward)'] = multipletests(
            results.ix[:, 'p-value (forward)'], method='fdr_bh')[1]

        # Compute reverse FDR
        results.ix[:, 'fdr (reverse)'] = multipletests(
            results.ix[:, 'p-value (reverse)'], method='fdr_bh')[1]

        # Creating the summary P-value and FDR: forward values for scores >= 0, reverse otherwise
        forward = results.ix[:, 'score'] >= 0
        results.ix[:, 'p-value'] = concat([
            results.ix[forward, 'p-value (forward)'],
            results.ix[~forward, 'p-value (reverse)']
        ])
        results.ix[:, 'fdr'] = concat([
            results.ix[forward, 'fdr (forward)'],
            results.ix[~forward, 'fdr (reverse)']
        ])

    # Save
    if filepath:
        establish_filepath(filepath)
        results.to_csv(filepath, sep='\t')

    return target, features, results
def nn_trainer(train_mark, model, train_data, test_conv_X, test_data_X,
               test_data_Y, trainer_params_list, ctx):
    """Train ``model`` with the Gaussian loss and print validation metrics after every epoch.

    Args:
        train_mark: not used by this function.
        model: network called as ``model(data, conv_data)`` and returning ``(mu, sigma)``.
        train_data: dataset yielding ``(conv_data, data, label)`` triples.
        test_conv_X, test_data_X, test_data_Y: validation inputs and targets.
        trainer_params_list: mapping read at the keys ``'batch_size'``, ``'epoch_num'``,
            ``'initializer'``, ``'optimizer'`` and ``'optimizer_params'``.
        ctx: context used to initialise the parameters and to hold the labels.

    Returns:
        None. Progress is reported with ``print``.
    """
    ### The data
    batch_size = trainer_params_list['batch_size']
    epochs = trainer_params_list['epoch_num']
    loss_func = Gaussian_loss
    initializer = trainer_params_list['initializer']
    optimizer = trainer_params_list['optimizer']
    optimizer_params = trainer_params_list['optimizer_params']
    #train_iter = gluon.data.DataLoader(train_data, batch_size, shuffle=True)
    ### The model
    # Fixed seed, set before the parameters are initialised
    mx.random.seed(123456)
    model.collect_params().initialize(initializer, ctx=ctx)
    trainer = gluon.Trainer(model.collect_params(),
                            optimizer=optimizer,
                            optimizer_params=optimizer_params)
    n_train = len(train_data)
    n_test = len(test_data_Y)  # not used below
    ### The training process
    for e in range(epochs):
        start = time.time()
        train_loss = 0  # accumulated but never reported
        k = 0  # number of batches processed in this epoch
        # A fresh shuffled loader is built for every epoch
        train_iter = gluon.data.DataLoader(train_data,
                                           batch_size,
                                           shuffle=True)
        for conv_data, data, label in train_iter:
            label = label.as_in_context(ctx)
            with autograd.record():
                output_mu, output_sigma = model(data, conv_data)
                loss = loss_func(output_mu, output_sigma, label)
            loss.backward()
            trainer.step(1, ignore_stale_grad=True)
            train_loss += nd.sum(loss).asscalar()
            k += 1
            # Stop the epoch once more than 30% of the training set has been seen
            if k * batch_size > n_train * 0.3:
                print('training_data_nb:', k * batch_size)
                break
        ### The test loss
        valid_mu, valid_sigma = DLPred2(model, test_data_X, test_conv_X)
        valid_loss = loss_func(valid_mu, (valid_sigma),
                               (test_data_Y)).asscalar()
        #rho50 = rho_risk(0,7, valid_mu.asnumpy(), test_data_Y.asnumpy(), 0.5)
        # Risk of the predicted mean, evaluated at quantile 0.5
        avg_rho50 = avg_rho_risk(valid_mu.asnumpy(), test_data_Y.asnumpy(),
                                 0.5, 7)
        # 90% quantile of the predicted normal distribution N(valid_mu, valid_sigma)
        valid_pred90 = norm.ppf(0.9, valid_mu.asnumpy(), valid_sigma.asnumpy())
        # rho90 = rho_risk(0,7, valid_pred90, test_data_Y.asnumpy(), 0.9)
        avg_rho90 = avg_rho_risk(valid_pred90, test_data_Y.asnumpy(), 0.9, 7)
        #print("Epoch %d, valid loss: %f rho50: %f, rho90 %f" % (e, valid_loss, rho50,rho90))
        print("Epoch %d, valid loss: %f avg_rho50: %f, avg_rho90 %f" %
              (e, valid_loss, avg_rho50, avg_rho90))
        end = time.time()
        print('total_time:', end - start)
from scipy.stats import norm


class Generator(object):
    """Toy generator applying the affine map ``z -> z * 2 + 3``."""

    def __init__(self, input_dim, output_dim):
        # The dimensions are only stored; __call__ does not read them
        self.input_dim, self.output_dim = input_dim, output_dim

    def __call__(self, z):
        scaled = z * 2
        return scaled + 3


generator = Generator(1, 1)
prior = NormalPrior()
kernel = ParsenDensityEstimator()
model = ais.Model(generator, prior, kernel, 0.25, 10000)

p = norm()

# Evaluation grid: 100 points between the 1% and 99% quantiles of N(3, 2^2)
lower_bound = norm.ppf(0.01, loc=3, scale=2)
upper_bound = norm.ppf(0.99, loc=3, scale=2)
x = np.linspace(lower_bound, upper_bound, 100)

# Reference density of N(3, 2^2) on the grid
p1 = norm.pdf(x, loc=3, scale=2)

# AIS result on the same grid, exponentiated for plotting against p1
xx = np.reshape(x, [100, 1])
schedule = ais.get_schedule(100, rad=4)
print(schedule)
ais_output = model.ais(xx, schedule)
p2 = np.exp(ais_output)

for density in (p1, p2):
    plt.plot(x, density)
plt.show()
""" Name : c12_16_VaR_baesd_on_normality.py Book : Python for Finance (2nd ed.) Publisher: Packt Publishing Ltd. Author : Yuxing Yan Date : 6/6/2017 email : [email protected] [email protected] """ import numpy as np import pandas as pd from scipy.stats import norm from matplotlib.finance import quotes_historical_yahoo_ochl as getData # ticker = 'WMT' # input 1 n_shares = 500 # input 2 confidence_level = 0.99 # input 3 begdate = (2012, 1, 1) # input 4 enddate = (2016, 12, 31) # input 5 # z = norm.ppf(1 - confidence_level) x = getData(ticker, begdate, enddate, asobject=True, adjusted=True) ret = x.aclose[1:] / x.aclose[:-1] - 1 # position = n_shares * x.close[0] mean = np.mean(ret) std = np.std(ret) # VaR = position * (mean + z * std) print("Holding=", position, "VaR=", round(VaR, 4), "tomorrow")
from scipy.stats import norm
import numpy as np
import matplotlib.pyplot as plt

# Parameters of the normal distribution being illustrated
mu = 10
sigma = 2

# Grid from the 1% to the 99% quantile of N(mu, sigma^2), in steps of 0.1
x = np.arange(norm.ppf(0.01, loc=mu, scale=sigma),
              norm.ppf(0.99, loc=mu, scale=sigma), 0.1)
print(x)

fig, [axpdf, axcdf, axhisto] = plt.subplots(1, 3)
randVect = norm.rvs(loc=mu, scale=sigma, size=1000)

# The scale must be passed explicitly: norm.pdf(x, mu) / norm.cdf(x, mu) use scale=1, which
# does not match the grid and the samples drawn with scale=sigma.
axpdf.plot(x, norm.pdf(x, loc=mu, scale=sigma), 'r-', label='PDF')
axpdf.legend(loc='best')

axcdf.plot(x, norm.cdf(x, loc=mu, scale=sigma), 'r-', label='CDF')
axcdf.legend(loc='best')

axhisto.hist(randVect, color='0.75', label='Normally distributed values')
axhisto.legend(loc='best')

fig.tight_layout()
plt.show()
def __call__(self, sampled_parameters, loglikelihood, start_param_vec,
             ns_boundary, **kwargs):
    """Run the sampler.

    Args:
        sampled_parameters (:obj:`list` of :obj:`gleipnir.sampled_parameter.SampledParameter`):
            The parameters that are being sampled.
        loglikelihood (function): The log likelihood function.
        start_param_vec (obj:`numpy.ndarray`): The starting position of
            parameter vector for the parameters being sampled.
        ns_boundary (float): The current lower likelihood bound from the
            Nested Sampling routine.
        kwargs (dict): Pass in any other method specific keyword arguments.

    Returns:
        tuple: ``(cur_point, cur_likelihood)`` -- the final position of the
        sampling chain and the log likelihood evaluated at that position.
    """
    if self._first:
        # First call: derive an initial step width per parameter as half
        # the spread of 100 draws from that parameter.
        self._ndim = len(sampled_parameters)
        for sampled_parameter in sampled_parameters:
            rs = sampled_parameter.rvs(100)
            width = np.max(rs) - np.min(rs)
            self._widths.append(0.5 * width)
        self._widths = np.array(self._widths)
        self._first = False
    start_likelihood = loglikelihood(start_param_vec)

    # Tuning cycles
    steps = self._widths.copy()
    acceptance = np.zeros(self._ndim)
    cur_point = start_param_vec.copy()
    cur_likelihood = start_likelihood
    for _ in range(self.tuning_cycles):
        for _ in range(20):
            rsteps = np.random.random(self._ndim)
            u = np.random.random(self._ndim)
            for j in range(self._ndim):
                new_point = cur_point.copy()
                cur_pointj = cur_point[j]
                # NOTE(review): proposals here use self._widths, not the
                # `steps` array being tuned, so the adjustments below only
                # take effect once tuning has finished -- confirm intended.
                widthj = self._widths[j]
                # Generate the appropriate proposal distribution
                if self.proposal == 'normal':
                    new_pointj = norm.ppf(rsteps[j], loc=cur_pointj,
                                          scale=widthj)
                else:
                    new_pointj = uniform.ppf(rsteps[j],
                                             loc=cur_pointj - (widthj / 2.0),
                                             scale=widthj)
                new_point[j] = new_pointj
                cur_priorj = sampled_parameters[j].prior(cur_pointj)
                new_priorj = sampled_parameters[j].prior(new_point[j])
                ratio = new_priorj / cur_priorj
                new_likelihood = loglikelihood(new_point)
                # Metropolis criterion with NS boundary
                if (u[j] < ratio) and (new_likelihood > ns_boundary):
                    # accept the new point and update
                    cur_point[j] = new_pointj
                    cur_likelihood = new_likelihood
                    acceptance[j] += 1.0
        # Adjust the step sizes from the acceptance rate over the 20 sweeps.
        acceptance_ratio = acceptance / 20.0
        less_than_mask = acceptance_ratio < 0.2
        gt_mask = acceptance_ratio > 0.6
        steps[less_than_mask] *= 0.66
        steps[gt_mask] *= 1.33
        acceptance[:] = 0.0

    # Start the sampling chain
    self._widths = steps.copy()
    cur_point = start_param_vec.copy()
    # The chain restarts from the starting point, so the tracked likelihood
    # must restart with it; otherwise a chain with no accepted move would
    # return the start point paired with a likelihood left over from tuning.
    cur_likelihood = start_likelihood
    for _ in range(self.iterations + self.burn_in):
        rsteps = np.random.random(self._ndim)
        u = np.random.random(self._ndim)
        for j in range(self._ndim):
            new_point = cur_point.copy()
            cur_pointj = cur_point[j]
            widthj = self._widths[j]
            # Generate the appropriate proposal distribution
            if self.proposal == 'normal':
                new_pointj = norm.ppf(rsteps[j], loc=cur_pointj, scale=widthj)
            else:
                new_pointj = uniform.ppf(rsteps[j],
                                         loc=cur_pointj - (widthj / 2.0),
                                         scale=widthj)
            # Write the proposal into the trial vector so that both the prior
            # ratio and the likelihood are evaluated at the proposed point
            # (this mirrors the tuning loop above).
            new_point[j] = new_pointj
            cur_priorj = sampled_parameters[j].prior(cur_pointj)
            new_priorj = sampled_parameters[j].prior(new_point[j])
            ratio = new_priorj / cur_priorj
            new_likelihood = loglikelihood(new_point)
            # Metropolis criterion with NS boundary
            if (u[j] < ratio) and (new_likelihood > ns_boundary):
                # accept the new point and update
                cur_point[j] = new_pointj
                cur_likelihood = new_likelihood

    return cur_point, cur_likelihood
np.arange(0.2, 2.0, 0.01), np.arange(2.0, 10.1, 0.1))) #list of viewing angles thetas = np.linspace(0, 180, 100) #SNR of 1, 2, 3, 4, 5 confs = [norm.cdf(sn) for sn in limSNs] print "Confidence levels:", confs print "Sigma levels:", limSNs print "Trial a13s:", a13s print "Trial thetas:", thetas #for each confidence interval for n, conf in enumerate(confs): #array to hold percent of viewing angles ruled out for each a13 at this conf outangles = np.zeros(len(a13s)) #sigma needed to establish confidence below LC sig = norm.ppf(conf) #for each a13 model for j, a13 in enumerate(a13s): print "Testing model at a13:", a13 #boolean mask for whether angle is ruled out to given confidence mask = np.array([True] * len(thetas)) #for each band for i in range(len(t)): #generate theoretical light curve Lk = np.array([ KasenFit(ti, a13, 1.0, wave_0[bands[band[i]]], m_c, e_51, z, 0) for ti in t[i] ]) #compare to observed for each viewing angle for k, theta in enumerate(thetas):
# Model mapping the inputs x to z_mu.
encoder = Model(x, z_mu)

# --- 2D scatter of the digit classes in the latent space ---
z_test = encoder.predict(x_test, batch_size=batch_size)
latent_first, latent_second = z_test[:, 0], z_test[:, 1]
plt.figure(figsize=(6, 6))
plt.scatter(latent_first, latent_second, c=y_test,
            alpha=.4, s=3**2, cmap='viridis')
plt.colorbar()
plt.show()

# --- 2D manifold of the digits ---
n = 15  # figure with 15x15 digits
digit_size = 28

# Linearly spaced coordinates on the unit square are pushed through the
# inverse CDF (ppf) of the Gaussian to obtain latent values z, because the
# prior of the latent space is Gaussian.
u_grid = np.dstack(np.meshgrid(np.linspace(0.05, 0.95, n),
                               np.linspace(0.05, 0.95, n)))
z_grid = norm.ppf(u_grid)

# Decode every grid point, then arrange the images as an n x n mosaic.
x_decoded = decoder.predict(z_grid.reshape(n * n, 2))
x_decoded = x_decoded.reshape(n, n, digit_size, digit_size)

plt.figure(figsize=(10, 10))
plt.imshow(np.block([list(row) for row in x_decoded]), cmap='gray')
plt.show()
def chamberlain(n, q, alpha=.05):
    """Return ``norm.ppf(1 - alpha / 2) * sqrt(q * (1 - q) / n)``.

    Parameters
    ----------
    n : int
        Divisor under the square root (the caller passes the number of
        observations).
    q : float
        Quantile level entering the ``q * (1 - q)`` term.
    alpha : float
        Two-sided tail probability used for the normal quantile.
    """
    normal_quantile = norm.ppf(1 - alpha / 2)
    spread = np.sqrt(q*(1 - q) / n)
    return normal_quantile * spread
# Scale the single-period sigma to a 220-period horizon.
sigma220 = (220**0.5) * sigma
print('The probability of dropping over 40% in 220 days is ',
      norm.cdf(-0.4, loc=mu220, scale=sigma220))


# In[7]:

# Probability of a drop of more than 20% over the 220-period horizon.
mu220 = 220 * mu
sigma220 = (220**0.5) * sigma
drop20 = norm.cdf(-0.2, loc=mu220, scale=sigma220)
print('The probability of dropping over 20% in 220 days is ', drop20)


# In[8]:

# Value at Risk: the 5% quantile of N(mu, sigma).
VaR = norm.ppf(0.05, loc=mu, scale=sigma)
print('Single day Value at Risk ', VaR)


# In[9]:

# Value at Risk: 5% and 95% quantiles.
print('5% quantile ', norm.ppf(0.05, loc=mu, scale=sigma))
print('95%quantile', norm.ppf(0.95, loc=mu, scale=sigma))


# In[10]:

# Value at Risk: quartiles.
q25 = norm.ppf(0.25, loc=mu, scale=sigma)
print('25% quantile', q25)
q75 = norm.ppf(0.75, loc=mu, scale=sigma)
print('75% quantile', q75)
def fit(self, q=.5, vcov='robust', kernel='epa', bandwidth='hsheather',
        max_iter=1000, p_tol=1e-6, **kwargs):
    '''Solve by Iterative Weighted Least Squares

    Parameters
    ----------
    q : float
        Quantile must be strictly between 0 and 1
    vcov : string, method used to calculate the variance-covariance matrix
        of the parameters. Default is ``robust``:

        - robust : heteroskedasticity robust standard errors (as suggested
          in Greene 6th edition)
        - iid : iid errors (as in Stata 12)

    kernel : string, kernel to use in the kernel density estimation for the
        asymptotic covariance matrix:

        - biw: Biweight
        - cos: Cosine
        - epa: Epanechnikov
        - gau: Gaussian
        - par: Parzen

    bandwidth: string, Bandwidth selection method in kernel density
        estimation for asymptotic covariance estimate (full references in
        QuantReg docstring):

        - hsheather: Hall-Sheather (1988)
        - bofinger: Bofinger (1975)
        - chamberlain: Chamberlain (1994)

    max_iter : int
        Maximum number of reweighting iterations.
    p_tol : float
        The iteration stops once the largest absolute change in any
        parameter between two iterations is no greater than ``p_tol``.
    **kwargs
        Accepted but not used by this method.

    Returns
    -------
    RegressionResultsWrapper
        Wrapper around a ``QuantRegResults`` instance carrying ``q``,
        ``iterations``, ``sparsity``, ``bandwidth`` and ``history``.

    Raises
    ------
    Exception
        If ``q`` is not strictly between 0 and 1, or if ``kernel``,
        ``bandwidth`` or ``vcov`` is not one of the supported names.
    '''

    # q == 0 or q == 1 zeroes one side of the weights below and leads to a
    # division by zero, so the end points are rejected as well.
    if q <= 0 or q >= 1:
        raise Exception('q must be strictly between 0 and 1')

    kern_names = ['biw', 'cos', 'epa', 'gau', 'par']
    if kernel not in kern_names:
        raise Exception("kernel must be one of " + ', '.join(kern_names))
    else:
        kernel = kernels[kernel]

    if bandwidth == 'hsheather':
        bandwidth = hall_sheather
    elif bandwidth == 'bofinger':
        bandwidth = bofinger
    elif bandwidth == 'chamberlain':
        bandwidth = chamberlain
    else:
        raise Exception("bandwidth must be in 'hsheather', 'bofinger', 'chamberlain'")

    endog = self.endog
    exog = self.exog
    nobs = self.nobs
    exog_rank = np_matrix_rank(self.exog)
    self.rank = exog_rank
    self.df_model = float(self.rank - self.k_constant)
    self.df_resid = self.nobs - self.rank
    n_iter = 0
    xstar = exog

    beta = np.ones(exog_rank)
    # TODO: better start, initial beta is used only for convergence check

    # Note the following doesn't work yet,
    # the iteration loop always starts with OLS as initial beta
    # if start_params is not None:
    #    if len(start_params) != rank:
    #       raise ValueError('start_params has wrong length')
    #       beta = start_params
    #    else:
    #       # start with OLS
    #       beta = np.dot(np.linalg.pinv(exog), endog)

    diff = 10
    cycle = False

    history = dict(params=[], mse=[])
    while n_iter < max_iter and diff > p_tol and not cycle:
        n_iter += 1
        beta0 = beta
        # Weighted least-squares step; xstar is exog divided row-wise by
        # the weights computed at the end of the previous iteration.
        xtx = np.dot(xstar.T, exog)
        xty = np.dot(xstar.T, endog)
        beta = np.dot(pinv(xtx), xty)
        resid = endog - np.dot(exog, beta)

        # Keep near-zero residuals away from zero before dividing by them.
        mask = np.abs(resid) < .000001
        resid[mask] = np.sign(resid[mask]) * .000001
        # Asymmetric weights: q on negative residuals, 1 - q otherwise.
        resid = np.where(resid < 0, q * resid, (1-q) * resid)
        resid = np.abs(resid)
        xstar = exog / resid[:, np.newaxis]
        diff = np.max(np.abs(beta - beta0))
        history['params'].append(beta)
        history['mse'].append(np.mean(resid*resid))

        if (n_iter >= 300) and (n_iter % 100 == 0):
            # check for convergence circle, shouldn't happen
            for ii in range(2, 10):
                if np.all(beta == history['params'][-ii]):
                    cycle = True
                    # Warn only when a cycle has actually been found.
                    warnings.warn("Convergence cycle detected",
                                  ConvergenceWarning)
                    break

    if n_iter == max_iter:
        warnings.warn("Maximum number of iterations ({}) reached."
                      .format(max_iter), IterationLimitWarning)

    e = endog - np.dot(exog, beta)
    # Greene (2008, p.407) writes that Stata 6 uses this bandwidth:
    # h = 0.9 * np.std(e) / (nobs**0.2)
    # Instead, we calculate bandwidth as in Stata 12
    iqre = stats.scoreatpercentile(e, 75) - stats.scoreatpercentile(e, 25)
    h = bandwidth(nobs, q)
    h = min(np.std(endog),
            iqre / 1.34) * (norm.ppf(q + h) - norm.ppf(q - h))

    # Kernel density estimate of the residuals at zero.
    fhat0 = 1. / (nobs * h) * np.sum(kernel(e / h))

    if vcov == 'robust':
        d = np.where(e > 0, (q/fhat0)**2, ((1-q)/fhat0)**2)
        xtxi = pinv(np.dot(exog.T, exog))
        xtdx = np.dot(exog.T * d[np.newaxis, :], exog)
        vcov = chain_dot(xtxi, xtdx, xtxi)
    elif vcov == 'iid':
        vcov = (1. / fhat0)**2 * q * (1 - q) * pinv(np.dot(exog.T, exog))
    else:
        raise Exception("vcov must be 'robust' or 'iid'")

    lfit = QuantRegResults(self, beta, normalized_cov_params=vcov)

    lfit.q = q
    lfit.iterations = n_iter
    lfit.sparsity = 1. / fhat0
    lfit.bandwidth = h
    lfit.history = history

    return RegressionResultsWrapper(lfit)
def analyze(problem, Y, calc_second_order=True, num_resamples=100,
            conf_level=0.95, print_to_console=False, parallel=False,
            n_processors=None, seed=None):
    """Perform Sobol Analysis on model outputs.

    Returns a dictionary with keys 'S1', 'S1_conf', 'ST', and 'ST_conf', where
    each entry is a list of size D (the number of parameters) containing the
    indices in the same order as the parameter file.  If calc_second_order is
    True, the dictionary also contains keys 'S2' and 'S2_conf'.

    Parameters
    ----------
    problem : dict
        The problem definition
    Y : numpy.array
        A NumPy array containing the model outputs
    calc_second_order : bool
        Calculate second-order sensitivities (default True)
    num_resamples : int
        The number of resamples (default 100)
    conf_level : float
        The confidence interval level (default 0.95)
    print_to_console : bool
        Print results directly to console (default False)
    parallel : bool
        Evaluate the indices through a multiprocessing pool instead of the
        serial loops (default False)
    n_processors : int
        Number of pool processes; only used when ``parallel`` is True
        (default None)
    seed : int
        If not None, seeds NumPy's global random generator before the
        resampling indices are drawn (default None).  A seed of 0 is
        honoured.

    Raises
    ------
    RuntimeError
        If the size of ``Y`` does not match the sampling layout implied by
        ``calc_second_order``, or if ``conf_level`` is outside [0, 1].

    References
    ----------
    .. [1] Sobol, I. M. (2001).  "Global sensitivity indices for nonlinear
           mathematical models and their Monte Carlo estimates."  Mathematics
           and Computers in Simulation, 55(1-3):271-280,
           doi:10.1016/S0378-4754(00)00270-6.
    .. [2] Saltelli, A. (2002).  "Making best use of model evaluations to
           compute sensitivity indices."  Computer Physics Communications,
           145(2):280-297, doi:10.1016/S0010-4655(02)00280-1.
    .. [3] Saltelli, A., P. Annoni, I. Azzini, F. Campolongo, M. Ratto, and
           S. Tarantola (2010).  "Variance based sensitivity analysis of model
           output.  Design and estimator for the total sensitivity index."
           Computer Physics Communications, 181(2):259-270,
           doi:10.1016/j.cpc.2009.09.018.

    Examples
    --------
    >>> X = saltelli.sample(problem, 1000)
    >>> Y = Ishigami.evaluate(X)
    >>> Si = sobol.analyze(problem, Y, print_to_console=True)
    """
    # Compare against None so that seed=0 is not silently ignored.
    if seed is not None:
        np.random.seed(seed)

    # determining if groups are defined and adjusting the number
    # of rows in the cross-sampled matrix accordingly
    if not problem.get('groups'):
        D = problem['num_vars']
    else:
        D = len(set(problem['groups']))

    if calc_second_order and Y.size % (2 * D + 2) == 0:
        N = Y.size // (2 * D + 2)
    elif not calc_second_order and Y.size % (D + 2) == 0:
        N = Y.size // (D + 2)
    else:
        raise RuntimeError("""
        Incorrect number of samples in model output file.
        Confirm that calc_second_order matches option used during sampling.""")

    if conf_level < 0 or conf_level > 1:
        raise RuntimeError("Confidence level must be between 0-1.")

    # normalize the model output
    Y = (Y - Y.mean()) / Y.std()

    A, B, AB, BA = separate_output_values(Y, D, N, calc_second_order)
    # Resampling indices: one column of N row indices per resample.
    r = np.random.randint(N, size=(N, num_resamples))
    # Two-sided normal quantile for the requested confidence level.
    Z = norm.ppf(0.5 + conf_level / 2)

    if not parallel:
        S = create_Si_dict(D, calc_second_order)

        for j in range(D):
            S['S1'][j] = first_order(A, AB[:, j], B)
            S['S1_conf'][j] = Z * first_order(A[r], AB[r, j], B[r]).std(ddof=1)
            S['ST'][j] = total_order(A, AB[:, j], B)
            S['ST_conf'][j] = Z * total_order(A[r], AB[r, j], B[r]).std(ddof=1)

        # Second order (+conf.)
        if calc_second_order:
            for j in range(D):
                for k in range(j + 1, D):
                    S['S2'][j, k] = second_order(
                        A, AB[:, j], AB[:, k], BA[:, j], B)
                    S['S2_conf'][j, k] = Z * second_order(
                        A[r], AB[r, j], AB[r, k], BA[r, j], B[r]).std(ddof=1)

    else:
        tasks, n_processors = create_task_list(
            D, calc_second_order, n_processors)

        func = partial(sobol_parallel, Z, A, AB, BA, B, r)
        pool = Pool(n_processors)
        S_list = pool.map_async(func, tasks)
        pool.close()
        pool.join()

        S = Si_list_to_dict(S_list.get(), D, calc_second_order)

    # Print results to console
    if print_to_console:
        print_indices(S, problem, calc_second_order)

    # Add problem context and override conversion method for special case
    S.problem = problem
    S.to_df = MethodType(to_df, S)
    return S