def get_profit_probability(self, x_vector, y_vector, iv, s, r, t):
    '''
    Returns the probability of obtaining a profit with the strategy in the
    current scenario under study.

    The underlying price is modelled with scipy's ``lognorm`` using shape
    ``iv`` and scale ``s * exp(r * t)``.

    inputs:
        x_vector -> vector of underlying prices
        y_vector -> vector with Black-Scholes results (profit/loss per price)
        iv -> underlying implied volatility
        s -> current underlying price
        r -> growth rate applied to s (presumably the risk-free rate --
             TODO confirm)
        t -> time horizon (presumably in years -- TODO confirm)
    returns:
        0.9999 / 0.0001 when y_vector never changes sign (decided by the
        sign of its middle element), the lognormal probability of the
        profitable region for one or two break-evens, and 0 when more than
        two sign changes are found.
    '''
    # Indices i where the sign of y_vector changes between i and i + 1
    zero_crossings = np.where(np.diff(np.sign(y_vector)))[0]
    breakevens = [x_vector[i] for i in zero_crossings]

    if len(breakevens) > 2:
        print('ERROR: more than 2 zeroes detected')
        return 0

    if not breakevens:
        # No break-even in range: all profit or all loss. Floor division
        # keeps the index an integer on both Python 2 and Python 3.
        return 0.9999 if y_vector[len(y_vector) // 2] > 0 else 0.0001

    # NOTE(review): shape=iv is not scaled by sqrt(t) and the scale has no
    # -iv**2/2 drift correction -- confirm this is the intended model.
    # REVIEW CDF can't return zero!
    scale = s * np.exp(r * t)

    if len(breakevens) == 1:
        # A single crossing splits the price axis in two; the profitable
        # side is the one where y_vector is positive.
        if y_vector[-1] > 0:
            return lognorm.sf(breakevens[0], iv, scale=scale)
        return lognorm.cdf(breakevens[0], iv, scale=scale)

    # Get probability of being below the min breakeven at expiration
    p_below = lognorm.cdf(breakevens[0], iv, scale=scale)
    # Get probability of being above the max breakeven at expiration
    p_above = lognorm.sf(breakevens[1], iv, scale=scale)
    # Get the probability of profit for the calendar (profit between the
    # two break-evens)
    p_profit = 1 - p_above - p_below

    # TODO debugging purposes
    print('Profit prob. with s=' + str(s) + ', iv=' + str(iv) + ', b/e=' +
          str(breakevens))
    print('1 - ' + str(p_below) + ' - ' + str(p_above) + ' = ' +
          str(p_profit))
    return p_profit
def multivariate_normality(X, alpha=.05):
    """Henze-Zirkler multivariate normality test.

    Parameters
    ----------
    X : np.array
        Data matrix of shape (n_samples, n_features).
    alpha : float
        Significance level.

    Returns
    -------
    hz : float
        The Henze-Zirkler test statistic.
    pval : float
        P-value.
    normal : boolean
        True if X comes from a multivariate normal distribution.

    See Also
    --------
    normality : Test the univariate normality of one or more variables.
    homoscedasticity : Test equality of variance.
    sphericity : Mauchly's test for sphericity.

    Notes
    -----
    The Henze-Zirkler test [1]_ has a good overall power against alternatives
    to normality and works for any dimension and sample size.

    Adapted to Python from a Matlab code [2]_ by Antonio Trujillo-Ortiz and
    tested against the
    `MVN <https://cran.r-project.org/web/packages/MVN/MVN.pdf>`_ R package.

    Rows with missing values are automatically removed.

    Non floating-point input (e.g. integers) is converted to float before
    any computation; floating-point input keeps its original dtype.

    References
    ----------
    .. [1] Henze, N., & Zirkler, B. (1990). A class of invariant consistent
       tests for multivariate normality. Communications in Statistics-Theory
       and Methods, 19(10), 3595-3617.

    .. [2] Trujillo-Ortiz, A., R. Hernandez-Walls, K. Barba-Rojo and L.
       Cupul-Magana. (2007). HZmvntest: Henze-Zirkler's Multivariate
       Normality Test. A MATLAB file.

    Examples
    --------
    >>> import pingouin as pg
    >>> data = pg.read_dataset('multivariate')
    >>> X = data[['Fever', 'Pressure', 'Aches']]
    >>> pg.multivariate_normality(X, alpha=.05)
    HZResults(hz=0.5400861018514641, pval=0.7173686509624891, normal=True)
    """
    from scipy.stats import lognorm

    # Check input and remove missing values
    X = np.asarray(X)
    assert X.ndim == 2, 'X must be of shape (n_samples, n_features).'
    if not np.issubdtype(X.dtype, np.floating):
        # The inverse covariance is cast back to X.dtype below; with an
        # integer dtype that cast would truncate it and corrupt every
        # distance, so promote non-float data first.
        X = X.astype(float)
    X = X[~np.isnan(X).any(axis=1)]
    n, p = X.shape
    assert n >= 3, 'X must have at least 3 rows.'
    assert p >= 2, 'X must have at least two columns.'

    # Covariance matrix
    S = np.cov(X, rowvar=False, bias=True)
    S_inv = np.linalg.pinv(S).astype(X.dtype)  # Preserving original dtype
    difT = X - X.mean(0)

    # Squared-Mahalanobis distances
    Dj = np.diag(np.linalg.multi_dot([difT, S_inv, difT.T]))
    Y = np.linalg.multi_dot([X, S_inv, X.T])
    Djk = -2 * Y.T + np.repeat(np.diag(Y.T), n).reshape(n, -1) + \
        np.tile(np.diag(Y.T), (n, 1))

    # Smoothing parameter
    b = 1 / (np.sqrt(2)) * ((2 * p + 1) / 4)**(1 / (p + 4)) * \
        (n**(1 / (p + 4)))

    # Is matrix full-rank (columns are linearly independent)?
    if np.linalg.matrix_rank(S) == p:
        hz = n * (1 / (n**2) * np.sum(np.sum(np.exp(-(b**2) / 2 * Djk))) - 2
                  * ((1 + (b**2))**(-p / 2)) * (1 / n)
                  * (np.sum(np.exp(-((b**2) / (2 * (1 + (b**2)))) * Dj)))
                  + ((1 + (2 * (b**2)))**(-p / 2)))
    else:
        # Rank-deficient covariance: fixed statistic used by the original
        # implementation
        hz = n * 4

    wb = (1 + b**2) * (1 + 3 * b**2)
    a = 1 + 2 * b**2

    # Mean and variance
    mu = 1 - a**(-p / 2) * (1 + p * b**2 / a + (p * (p + 2)
                                                * (b**4)) / (2 * a**2))
    si2 = 2 * (1 + 4 * b**2)**(-p / 2) + 2 * a**(-p) * \
        (1 + (2 * p * b**4) / a**2 + (3 * p * (p + 2) * b**8) / (4 * a**4)) \
        - 4 * wb**(-p / 2) * (1 + (3 * p * b**4) / (2 * wb)
                              + (p * (p + 2) * b**8) / (2 * wb**2))

    # Lognormal mean and variance
    pmu = np.log(np.sqrt(mu**4 / (si2 + mu**2)))
    psi = np.sqrt(np.log((si2 + mu**2) / mu**2))

    # P-value: upper tail of the lognormal with shape psi and scale exp(pmu)
    pval = lognorm.sf(hz, psi, scale=np.exp(pmu))
    normal = True if pval > alpha else False

    HZResults = namedtuple('HZResults', ['hz', 'pval', 'normal'])
    return HZResults(hz=hz, pval=pval, normal=normal)
label='2: $\mu$ and $\sigma$ estimated by MLE') plt.legend(loc='upper right') # In[74]: #1D log_lik_h0 = log_lik_lognorm(data, mu, sigma) log_lik_mle = log_lik_lognorm(data, mu_MLE, sig_MLE) LR_val = 2 * (log_lik_mle - log_lik_h0) pval_h0 = 1.0 - sts.chi2.cdf(LR_val, 2) print(pval_h0) # In[75]: # 1E print(lognorm.sf(100000, sig_MLE, loc=mu_MLE, scale=np.exp(mu_MLE))) print(lognorm.cdf(75000, sig_MLE, loc=mu_MLE, scale=np.exp(mu_MLE))) # In[85]: # 2A import pandas as pd data = pd.read_csv('sick.txt', header=0) def norm_pdf(xvals, mu, sigma): pdf_vals = (1 / (sigma * np.sqrt(2 * np.pi)) * np.exp(-(xvals - mu)**2 / (2 * sigma**2))) return pdf_vals
def multivariate_normality(X, alpha=.05):
    """Henze-Zirkler multivariate normality test.

    Parameters
    ----------
    X : np.array
        Data matrix of shape (n, p) where n are the observations and p the
        variables.
    alpha : float
        Significance level.

    Returns
    -------
    normal : boolean
        True if X comes from a multivariate normal distribution.
    p : float
        P-value.

    See Also
    --------
    normality : Test the univariate normality of one or more variables.
    homoscedasticity : Test equality of variance.
    sphericity : Mauchly's test for sphericity.

    Notes
    -----
    The Henze-Zirkler test has a good overall power against alternatives
    to normality and is feasible for any dimension and any sample size.

    Adapted to Python from a Matlab code by Antonio Trujillo-Ortiz.

    Tested against the R package MVN.

    The covariance matrix is inverted with the Moore-Penrose pseudo-inverse,
    so a singular covariance (linearly dependent columns) does not raise; in
    that case the statistic is set to ``4 * n``.

    References
    ----------
    .. [1] Henze, N., & Zirkler, B. (1990). A class of invariant consistent
       tests for multivariate normality. Communications in Statistics-Theory
       and Methods, 19(10), 3595-3617.

    .. [2] Trujillo-Ortiz, A., R. Hernandez-Walls, K. Barba-Rojo and L.
       Cupul-Magana. (2007). HZmvntest: Henze-Zirkler's Multivariate
       Normality Test. A MATLAB file.

    Examples
    --------
    1. Test for multivariate normality of 2 variables

    >>> import numpy as np
    >>> from pingouin import multivariate_normality
    >>> np.random.seed(123)
    >>> mean, cov, n = [4, 6], [[1, .5], [.5, 1]], 30
    >>> X = np.random.multivariate_normal(mean, cov, n)
    >>> normal, p = multivariate_normality(X, alpha=.05)
    >>> print(normal, round(p, 3))
    True 0.752

    2. Test for multivariate normality of 3 variables

    >>> import numpy as np
    >>> from pingouin import multivariate_normality
    >>> np.random.seed(123)
    >>> mean, cov = [4, 6, 5], [[1, .5, .2], [.5, 1, .1], [.2, .1, 1]]
    >>> X = np.random.multivariate_normal(mean, cov, 50)
    >>> normal, p = multivariate_normality(X, alpha=.05)
    >>> print(normal, round(p, 3))
    True 0.461
    """
    from scipy.stats import lognorm

    # Check input
    X = np.asarray(X)
    assert X.ndim == 2, 'X must be of shape (n, p).'
    n, p = X.shape
    assert p >= 2, 'X must have at least two columns.'

    # Covariance matrix. pinv instead of inv: inv raises LinAlgError on a
    # singular matrix, which made the rank-deficient branch below
    # unreachable.
    S = np.cov(X, rowvar=False, bias=True)
    S_inv = np.linalg.pinv(S)
    difT = X - X.mean(0)

    # Squared-Mahalanobis distances
    Dj = np.diag(np.linalg.multi_dot([difT, S_inv, difT.T]))
    Y = np.linalg.multi_dot([X, S_inv, X.T])
    Djk = -2 * Y.T + np.repeat(np.diag(Y.T), n).reshape(n, -1) + \
        np.tile(np.diag(Y.T), (n, 1))

    # Smoothing parameter
    b = 1 / (np.sqrt(2)) * ((2 * p + 1) / 4)**(1 / (p + 4)) * \
        (n**(1 / (p + 4)))

    # Is the covariance full-rank (columns linearly independent)?
    if np.linalg.matrix_rank(S) == p:
        hz = n * (1 / (n**2) * np.sum(np.sum(np.exp(-(b**2) / 2 * Djk))) - 2
                  * ((1 + (b**2))**(-p / 2)) * (1 / n)
                  * (np.sum(np.exp(-((b**2) / (2 * (1 + (b**2)))) * Dj)))
                  + ((1 + (2 * (b**2)))**(-p / 2)))
    else:
        hz = n * 4

    wb = (1 + b**2) * (1 + 3 * b**2)
    a = 1 + 2 * b**2

    # Mean and variance
    mu = 1 - a**(-p / 2) * (1 + p * b**2 / a + (p * (p + 2)
                                                * (b**4)) / (2 * a**2))
    si2 = 2 * (1 + 4 * b**2)**(-p / 2) + 2 * a**(-p) * \
        (1 + (2 * p * b**4) / a**2 + (3 * p * (p + 2) * b**8) / (4 * a**4)) \
        - 4 * wb**(-p / 2) * (1 + (3 * p * b**4) / (2 * wb)
                              + (p * (p + 2) * b**8) / (2 * wb**2))

    # Lognormal mean and variance
    pmu = np.log(np.sqrt(mu**4 / (si2 + mu**2)))
    psi = np.sqrt(np.log((si2 + mu**2) / mu**2))

    # P-value: upper tail of the lognormal with shape psi and scale exp(pmu)
    pval = lognorm.sf(hz, psi, scale=np.exp(pmu))
    normal = True if pval > alpha else False
    return normal, pval
def hazard_pdf(self, t):
    """Return the lognormal hazard at t: the density divided by the
    survival function, for shape ``self.sigma``, location 0 and scale
    ``exp(self.mu)``."""
    shape = self.sigma
    scale = np.exp(self.mu)
    density = lognorm.pdf(t, shape, 0, scale)
    survival = lognorm.sf(t, shape, 0, scale)
    return density / survival
def estimate(self, sampler):
    """Update the traffic moments and the overload risk from fresh counters.

    Requests 'rate_data' from the monitor through ``self.request_queue``,
    reads the answer from ``self.sample_queue`` and, from the difference to
    the previously stored counters, updates ``mean_*``, ``var_*``, the
    lognormal parameters ``mu_*`` / ``sigma2_*`` and ``overload_risk_*``
    (percent probability of exceeding ``self.linerate * self.cutoff``) for
    both tx and rx. When ``self.display_data`` is set the values are also
    printed using ANSI cursor-control sequences.

    If no new samples have arrived since the previous call the counters are
    re-baselined and the previous estimates are kept.

    Parameters
    ----------
    sampler
        Object providing ``get_interface()`` and ``get_sample_rate()``.

    Raises
    ------
    SystemExit
        With status 1 when the estimation or the display raises ValueError.
    """
    # A wireless interface can increase or decrease its line rate
    # so the line rate is checked regularly for WiFi.
    if self.interface_type == InterfaceType.Wireless:
        _, self.linerate = self.get_linerate_wireless(sampler.get_interface())

    now = time.time()
    est_timer = now - self.time_of_last_calc
    self.time_of_last_calc = now

    # Ask the monitor for the current counters and wait for the answer.
    self.request_queue.put('rate_data')
    rate_data = self.sample_queue.get()
    self.sample_queue.task_done()

    tx_agg = rate_data['tx_agg']
    rx_agg = rate_data['rx_agg']
    samples = rate_data['samples']
    txb2 = rate_data['txb2']
    rxb2 = rate_data['rxb2']

    # Differences against the previous call.
    n = samples - self.last_samples
    delta_tx = tx_agg - self.last_tx_agg
    delta_rx = rx_agg - self.last_rx_agg
    delta_txb2 = txb2 - self.last_txb2
    delta_rxb2 = rxb2 - self.last_rxb2

    self.last_samples = samples
    self.last_tx_agg = tx_agg
    self.last_rx_agg = rx_agg
    self.last_txb2 = txb2
    self.last_rxb2 = rxb2

    if n <= 0:
        # No new samples: the per-sample averages below would divide by
        # zero (or by a negative count). Keep the previous estimates.
        self._drain_sample_queue()
        return

    # Approximately kbytes/sec, but not really since we have a
    # measurement jitter of the number of samples recorded in each
    # sampling period. (Usually, by default ms). (The sampling
    # often cannot keep up).
    self.mean_tx = delta_tx / n
    self.mean_rx = delta_rx / n
    mean_square_tx = self.mean_tx * self.mean_tx
    mean_square_rx = self.mean_rx * self.mean_rx
    sum_square_tx = delta_txb2 / n
    sum_square_rx = delta_rxb2 / n

    # NOTE: The measurement jitter can make E[x^2] - E[x]^2 come out
    # negative. It is not clear why we get the jitter, so why this
    # correction is necessary is somewhat of a mystery.
    self.var_tx = self._sanitize_variance(
        sum_square_tx - mean_square_tx, "tx", 9)
    self.var_rx = self._sanitize_variance(
        sum_square_rx - mean_square_rx, "rx", 10)

    # Estimate the moments
    try:
        if self.mean_tx != 0.0:
            self.sigma2_tx = math.log(1.0 + (self.var_tx / mean_square_tx))
            self.mu_tx = math.log(self.mean_tx) - (self.sigma2_tx / 2.0)
        else:
            self.sigma2_tx = 0.0
            self.mu_tx = 0.0

        if self.mean_rx != 0.0:
            self.sigma2_rx = math.log(1.0 + (self.var_rx / mean_square_rx))
            self.mu_rx = math.log(self.mean_rx) - (self.sigma2_rx / 2.0)
        else:
            self.sigma2_rx = 0.0
            self.mu_rx = 0.0

        # Calculate the overload risk with the survival function
        # (1 - cdf); see the scipy.stats.lognorm documentation for the
        # motivation. An alternative that was discussed is to compute a
        # cutoff rate with the PPF (quantile function) and compare it
        # with the nominal line rate.
        self.overload_risk_tx = self._overload_risk(
            self.mean_tx, self.mu_tx, self.sigma2_tx)
        self.overload_risk_rx = self._overload_risk(
            self.mean_rx, self.mu_rx, self.sigma2_rx)
    except ValueError as ve:
        if self.display_data:
            print("\33[2K")
        print("Error in estimation: ({}):".format(ve))
        traceback.print_exc()
        if self.display_data:
            print("\33[2K")
        print("mean_tx: %.2e, mean_rx: %.2e " % (self.mean_tx, self.mean_rx))
        if self.display_data:
            print("\33[2K")
        print("var_tx: %.2e, var_rx: %.2e " % (self.var_tx, self.var_rx))
        if self.display_data:
            print("\33[2K")
        print("mean_square_tx: %.2e, mean_square_rx: %.2e "
              % (mean_square_tx, mean_square_rx))
        if self.display_data:
            print("\33[2K")
        print("rate_data: %s" % (rate_data,))
        # Same effect as exit(1) without depending on the site module.
        raise SystemExit(1)

    if self.display_data:
        try:
            print("\33[H", end="")  # move cursor home
            # [PD] 2016-05-23, The calculation of "actual" seems to be
            # buggy, so the number of samples is not displayed.
            print("\33[2Ksample_rate (/s): {:d}, interface: {}, linerate (bytes/s): {:d}, link speed (Mbit/s): {:d}".format(
                sampler.get_sample_rate(), sampler.get_interface(),
                self.linerate, self.link_speed))
            print("\33[2KTX(mean: %.2e b/s std: %.2e mu: %.2e s2: %.2e, ol-risk: %.2e) "
                  % (self.mean_tx, math.sqrt(self.var_tx), self.mu_tx,
                     self.sigma2_tx, self.overload_risk_tx))
            print("\33[2KRX(mean: %.2e b/s std: %.2e mu: %.2e s2: %.2e, ol-risk: %.2e) "
                  % (self.mean_rx, math.sqrt(self.var_rx), self.mu_rx,
                     self.sigma2_rx, self.overload_risk_rx))
            print("\33[2Kestimation timer: {:.4f}".format(est_timer))
            print("\33[2Kestimation interval: {:.2f}".format(self.est_interval))
            print("\33[2Kmeter interval: %d" % (self.meter_interval))
            print("\33[2Kmode: %d" % (self.mode))
            if self.debug:
                print("\33[2Kdebug: %s" % str(self.debug))
                print("\33[2Ksample_queue size: %s"
                      % str(self.sample_queue.qsize()))
        except ValueError as ve:
            print("\33[2KError in display ({}):".format(ve))
            traceback.print_exc()
            print("\33[2Kvar_tx: %.2e, var_rx: %.2e "
                  % (self.var_tx, self.var_rx))
            print("\33[2Krate_data: %s" % (rate_data,))
            raise SystemExit(1)

    self._drain_sample_queue()


def _sanitize_variance(self, variance, label, row):
    """Return variance unchanged when non-negative; otherwise warn and
    return it rounded to 5 decimals and clamped at zero."""
    if variance >= 0:
        return variance
    if self.display_data:
        # Position the cursor on the warning row of the display.
        print("\33[%d;1H" % row)
    print("WARNING: self.var_" + label + " == " + str(variance))
    # Rounding alone only removes tiny negative values; clamping guarantees
    # that the later log() and sqrt() calls receive a valid argument.
    return max(0.0, round(variance, 5))


def _overload_risk(self, mean, mu, sigma2):
    """Return the probability, in percent, that a lognormal rate with
    parameters mu and sigma2 exceeds self.linerate * self.cutoff."""
    threshold = self.linerate * self.cutoff
    if sigma2 <= 0.0:
        # Zero variance (idle or perfectly constant traffic): scipy's
        # lognorm needs a strictly positive shape and would return nan,
        # so compare the constant rate with the threshold directly.
        return 100.0 if mean > threshold else 0.0
    return lognorm.sf(threshold, math.sqrt(sigma2), 0, math.exp(mu)) * 100


def _drain_sample_queue(self):
    """Discard anything left in self.sample_queue."""
    # FIXME: It should not be necessary to empty the queue here
    # anymore, since the monitor code only puts stuff in the Queue
    # on request. Verify this before removing this loop!
    while not self.sample_queue.empty():
        self.sample_queue.get()
        self.sample_queue.task_done()
current_price = None if not args.current_price: # TODO Get current underlying price from Yahoo or similar pass else: current_price = args.current_price current_iv = args.iv if len(breakevens) > 2: print 'ERROR: more than 2 zeroes detected' else: # Get probability of underlying being below first zero # REVIEW CDF can't return zero! scale = current_price * np.exp(r * t) p_below = lognorm.cdf(breakevens[0], current_iv, scale=scale) # Get probability of underlying being above second zero p_above = lognorm.sf(breakevens[1], current_iv, scale=scale) # Get the probability of profit for the calendar p_profit = 1 - p_above - p_below print('Probabilities (below, above, profit): ' + str(p_below) + ' - ' + str(p_above) + ' - ' + str(p_profit)) calls = df[df['m_right'] == 'C'] plot_calendars(calls, near_term, next_term, current_price) sys.exit() # TODO remove to go on ''' # Do all the possible combinations of expiries for calendar spreads expiry_combinations = list(combinations(expiries, r=2)) # Iterate calls for (near_term, next_term) in expiry_combinations: if near_term > next_term:
plt.savefig('fig_1c')
plt.close()

# 1d
# Likelihood-ratio test of the 1b parameters against the MLE; the statistic
# is compared with a chi-square with 2 degrees of freedom.
log_lik_h0 = log_lik_lognorm(incomes_array, mu_1b, sigma_1b)
log_lik_mle = log_lik_lognorm(incomes_array, mu_mle, sigma_mle)
LR_val = 2 * (log_lik_mle - log_lik_h0)
# sf is the upper tail (1 - cdf) without the cancellation error
pval_h0 = sts.chi2.sf(LR_val, 2)
print('1d: p-value from the chi-square test: {:.3f}\n'.format(pval_h0))

# 1e
# A lognormal with log-mean mu and log-sd sigma is
# lognorm(s=sigma, loc=0, scale=exp(mu)) in scipy; loc must stay 0,
# otherwise the whole distribution is shifted by mu.
print('1e: Probability of earning more than $100,000: {:.4f}'.format(
    lognorm.sf(100000, sigma_mle, scale=np.exp(mu_mle))))
print(' Probability of earning less than $75,000: {:.4f}\n'.format(
    lognorm.cdf(75000, sigma_mle, scale=np.exp(mu_mle))))

# Q2
sick_df = pd.read_csv('sick.txt')
# The first column name carries a UTF-8 byte-order mark; strip it.
sick_df.rename(columns={'\ufeffsick': 'sick'}, inplace=True)


def log_like_sick(sick_df, b0, b1, b2, b3, sigma):
    error = sick_df.sick - b0 - b1*sick_df.age - b2*sick_df.children - b3*\
        sick_df.avgtemp_winter
    pdf_vals = norm_pdf(error, 0, sigma)
    log_lik_val = np.log(pdf_vals).sum()