def getCI_t(self):
    '''The t-method confidence bounds, more accurate than the earlier methods.'''
    SE = np.std(self.btheta, ddof=1)
    self.CI_t = (self.theta - SE * t.ppf(0.975, self.n - 1),
                 self.theta - SE * t.ppf(0.025, self.n - 1))
    return self.CI_t
def ppf(self, arg):
    """Inverse cumulative distribution function (ICDF).

    Parameters
    ----------
    arg : array
        Grid of points to evaluate the ICDF at. Must belong to (0, 1).

    Returns
    -------
    array
        ICDF values. Same shape as the input.

    """
    arg = np.atleast_1d(arg)
    a = self.__const_a()
    b = self.__const_b()
    cond = arg < (1 - self.lam) / 2
    ppf1 = t.ppf(arg / (1 - self.lam), self.eta)
    ppf2 = t.ppf(.5 + (arg - (1 - self.lam) / 2) / (1 + self.lam), self.eta)
    ppf = -999.99 * np.ones_like(arg)
    ppf = np.nan_to_num(ppf1) * cond \
        + np.nan_to_num(ppf2) * np.logical_not(cond)
    ppf = (ppf * (1 + np.sign(arg - (1 - self.lam) / 2) * self.lam)
           * (1 - 2 / self.eta) ** .5 - a) / b
    if ppf.shape == (1, ):
        return float(ppf)
    else:
        return ppf
def GeneratePDF(Data, method='Robust_Student_t', lower_threshold=0.15, upper_threshold=0.85):
    '''Generate the pdf estimate of the data.

    Input:
        /Data/              data to estimate the pdf on
        /method/            method of estimation; available methods:
                            'Robust_Student_t', 'KDE', 'Normal'
        /lower_threshold/   lower quantile level, as a fraction
        /upper_threshold/   upper quantile level, as a fraction
    Output:
        /x/                 evaluation grid
        /pdf/               fitted pdf
        /cdf/               fitted cdf
        /lower/, /upper/    lower and upper quantile estimates
    '''
    x = np.linspace(min(Data), max(Data), 100)
    if method == 'Robust_Student_t':
        nu, mu, sigma = uvtfit(Data)
        pdf = t.pdf(x, nu, mu, sigma)
        cdf = t.cdf(x, nu, mu, sigma)
        lower = t.ppf(lower_threshold, nu, mu, sigma)
        upper = t.ppf(upper_threshold, nu, mu, sigma)
    elif method == 'Normal':
        mu, sigma = norm.fit(Data)
        pdf = norm.pdf(x, mu, sigma)
        cdf = norm.cdf(x, mu, sigma)
        lower = norm.ppf(lower_threshold, mu, sigma)
        upper = norm.ppf(upper_threshold, mu, sigma)
    elif method == 'KDE':
        kernel = gaussian_kde(Data)
        pdf = kernel.evaluate(x)
        cdf = np.array([kernel.integrate_box(x[0], x[i + 1])
                        for i in range(len(x) - 1)])
        lower = np.percentile(cdf, lower_threshold * 100)
        upper = np.percentile(cdf, upper_threshold * 100)
    return x, pdf, cdf, lower, upper
def solve(self):
    df = 51 - 1
    statistic = 1.1 / (4.9 / 51 ** 0.5)
    statistic = round(statistic, 2)
    a = t.ppf(0.025, df)
    b = t.ppf(0.975, df)
    test = False
    if statistic >= a and statistic <= b:
        test = True
    return [df, statistic, test]
def solve(self):
    df = 20 - 1
    statistic = (4.6 - 5) / (2.2 / 20 ** 0.5)
    statistic = round(statistic, 2)
    a = t.ppf(0.025, df)
    b = t.ppf(0.975, df)
    test = False
    if statistic >= a and statistic <= b:
        test = True
    return [df, statistic, test]
def evaluateLogLikelihoodHessian(self, par):
    print(par)
    df = par[0]
    self.par[1] = par[1]
    self.par[2] = par[2]
    self.par[3] = par[3]

    # Extract the degrees of freedom and the dimension
    p = (self.uhat).shape[1]
    n = (self.uhat).shape[0]
    self.constructCorrelationMatrix(p)

    # Compute the percentile function on the univariate t
    tppf_uhat = t.ppf(self.uhat, df)

    # Calculate the first part of the log-likelihood
    part1 = 0
    for ii in range(n):
        part1 += multiTLogPDF(tppf_uhat[ii, :], np.zeros(p), self.P, df, p)

    # Calculate the second part of the log-likelihood
    part2 = np.sum(t.logpdf(tppf_uhat, df))
    return part1 - part2
def solve(self):
    x = round((18.985 + 21.015) / 2, 2)
    n = 36
    df = n - 1
    s = -(21.015 - 18.985) / 2 * np.sqrt(n) / t.ppf(0.025, df)
    s = round(s, 2)
    return [x, s]
def solve(self):
    de = t.ppf(0.05, 50)
    result = (1.1 - 0) / (4.9 / (np.sqrt(51)))
    if de <= result:
        return [round(50, 2), round(result, 2), True]
    else:
        return [round(50, 2), round(result, 2), False]
def getConfidenceIntervals(variance_type, groups):
    """
    Expects a dictionary of endpoint groups and the endpoint variance-type.
    Appends results to the dictionary for each endpoint-group.
    Confidence interval calculated using a two-tailed t-test,
    assuming 95% confidence interval.
    """
    for grp in groups:
        lower_ci = grp.get('lower_ci')
        upper_ci = grp.get('upper_ci')
        n = grp.get('n')
        if (
            lower_ci is None and
            upper_ci is None and
            n is not None and
            grp['estimate'] is not None and
            grp['variance'] is not None
        ):
            est = grp['estimate']
            var = grp['variance']
            z = t.ppf(0.975, max(n - 1, 1))
            change = None
            if variance_type == 'SD':
                change = z * var / math.sqrt(n)
            elif variance_type in ('SE', 'SEM'):
                change = z * var
            if change is not None:
                lower_ci = round(est - change, 2)
                upper_ci = round(est + change, 2)
                grp.update(lower_ci=lower_ci, upper_ci=upper_ci)
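# A minimal usage sketch for getConfidenceIntervals, assuming the function above
# is in scope; the group dictionaries and their values are made up for illustration.
import math
from scipy.stats import t

groups = [
    {'n': 10, 'estimate': 5.2, 'variance': 1.1, 'lower_ci': None, 'upper_ci': None},
    {'n': 8, 'estimate': 4.7, 'variance': 0.9, 'lower_ci': None, 'upper_ci': None},
]
getConfidenceIntervals('SD', groups)  # 'variance' holds a standard deviation here
for grp in groups:
    print(grp['lower_ci'], grp['upper_ci'])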
def solve(self):
    de = t.ppf(0.05, 19)
    result = (4.6 - 5) / (2.2 / np.sqrt(20))
    if de <= result:
        return [round(19, 2), round(result, 2), True]
    else:
        return [round(19, 2), round(result, 2), False]
def calc_stats(amostra):
    # two-sided 95% confidence interval: use the 0.975 quantile of the t distribution
    tdist = t.ppf(0.975, len(amostra) - 1)
    mean = numpy.mean(amostra)
    std = numpy.std(amostra)
    error = tdist * (std / math.sqrt(len(amostra)))
    return mean, std, error
def mu_intervall(sample, var, gamma):
    """
    Calculates the confidence interval for the mean of a population.

    Parameters
    ==========
    sample: array
        sample data
    var: float
        variance of the sample. 0 if not known; it will then be
        estimated from the sample.
    gamma: float
        confidence level

    Returns
    =======
    value : tuple (a, b)
        confidence interval as a tuple
    """
    s_mean = np.array(sample).mean()
    if var == 0:
        std = _sample_std(sample)
        q = t.ppf((1 + gamma) / 2.0, len(sample) - 1)
    else:
        std = np.sqrt(var)
        q = norm.ppf((1 + gamma) / 2.0)
    c = q * std / np.sqrt(len(sample))
    c1 = s_mean - c
    c2 = s_mean + c
    return (c1, c2)
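# A minimal usage sketch for mu_intervall, assuming _sample_std is the usual
# unbiased sample standard deviation helper; the data below are illustrative.
import numpy as np
from scipy.stats import t, norm

def _sample_std(sample):
    # assumed helper: unbiased sample standard deviation
    return np.std(sample, ddof=1)

sample = [4.8, 5.1, 5.0, 4.7, 5.3, 4.9]
print(mu_intervall(sample, var=0, gamma=0.95))     # variance unknown: t-based interval
print(mu_intervall(sample, var=0.04, gamma=0.95))  # variance known: normal-based interval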
def different_stdev(self, alpha):
    t0 = (self.y1 - self.y2) / (np.sqrt(self.S1**2 / self.n1 + self.S2**2 / self.n2))

    # hypothesis testing
    n1, n2, y1, y2, S1, S2 = self.n1, self.n2, self.y1, self.y2, self.S1, self.S2
    df = int((S1**2/n1 + S2**2/n2)**2 /
             ((S1**2/n1)**2 / (n1 - 1) + (S2**2/n2)**2 / (n2 - 1)))
    H1a = t.ppf(1 - alpha/2., df) < np.abs(t0)
    H1b = t.ppf(1 - alpha, df) < t0
    H1c = t.ppf(alpha, df) > t0

    # p-value
    p1a = t.sf(np.abs(t0), df) * 2
    p1b = t.sf(t0, df)
    p1c = t.cdf(t0, df)

    c1 = y1 - y2 - t.ppf(1 - alpha/2., df) * np.sqrt(S1**2/n1 + S2**2/n2)
    c2 = y1 - y2 + t.ppf(1 - alpha/2., df) * np.sqrt(S1**2/n1 + S2**2/n2)
    return H1a, H1b, H1c, p1a, p1b, p1c, (c1, c2)
def solve(self):
    upper = 21.015
    lower = 18.985
    de = t.ppf(0.025, 35)
    mean = (upper + lower) / 2
    s = np.sqrt(36) * (upper - lower) / (2 * (-de))
    return [round(mean, 2), round(s, 2)]
def t_test(df, modelDir, start_date, confidence=0.99, model="neuralNet"):
    """Is given a dataframe of demand, temp, and dates (in that order)."""
    from scipy.stats import t
    df = df.copy()
    if model == "neuralNet":
        df["dates"] = pd.date_range(start_date, freq="H", periods=df.shape[0])
        df.columns = ["load", "tempc", "dates"]
        all_X = omf.loadForecast.makeUsefulDf(df)
        actual = df["load"].values
        pred, acc = omf.loadForecast.neural_net_predictions(all_X, actual)
    if model == "nextDayPeakKatrina":
        ppt, pred, act_time, actual = omf.loadForecast.nextDayPeakKatrinaForecast(
            df.values, start_date, modelDir, {}, returnActuals=True
        )
    diff = [p - a for p, a in zip(pred, actual[-8760:])]
    diff = np.asarray(diff)
    alpha = 1 - confidence
    twosigma = -1 * t.ppf(alpha / 2, len(diff)) * np.std(diff)
    diff = np.abs(diff)
    diff = diff > twosigma
    if model == "neuralNet":
        return diff, actual[-8760:], pred, pred - twosigma, pred + twosigma
    if model == "nextDayPeakKatrina":
        return diff, actual, act_time
def conf_calc(x, y_err, c_limit=0.975):
    '''
    Calculates confidence interval of regression between x and y

    Parameters
    ----------
    x: 1D numpy array
    y_err: 1D numpy array of residuals (y - fit)
    c_limit: (optional) float representing the area to the left of the
        critical value in the t-statistic table, e.g. for a two-tailed
        95% confidence interval (the default) c_limit = 0.975

    Returns
    -------
    p_x: 1D numpy array of new test x-values
    confs: 1D numpy array of confidence-interval half-widths for the
        predicted y values at the p_x inputs
    '''
    # Define the variables needed to calculate the confidence interval
    mean_x = np.mean(x)                  # mean of x
    n = len(x)                           # number of samples in original fit
    tstat = t.ppf(c_limit, n - 1)        # appropriate t value
    s_err = np.sum(np.power(y_err, 2))   # sum of the squares of the residuals

    # create series of new test x-values to predict for
    p_x = np.linspace(np.min(x), np.max(x), 50)

    confs = tstat * np.sqrt((s_err / (n - 2)) * (1.0 / n +
            (np.power(p_x - mean_x, 2) /
             (np.sum(np.power(x, 2)) - n * np.power(mean_x, 2)))))
    return p_x, confs
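# A minimal sketch of how conf_calc might be used alongside a straight-line
# least-squares fit; the data and the fit below are made up for illustration,
# and conf_calc above is assumed to be in scope with its numpy/scipy imports.
import numpy as np

x = np.linspace(0, 10, 30)
y = 2.0 * x + 1.0 + np.random.normal(0, 1.0, x.size)
slope, intercept = np.polyfit(x, y, 1)
y_err = y - (slope * x + intercept)      # residuals of the fit

p_x, confs = conf_calc(x, y_err, c_limit=0.975)
band_upper = slope * p_x + intercept + confs
band_lower = slope * p_x + intercept - confs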
def t_student(n, alfa):
    '''
    Computes t_{alfa/2, n} of the Student's t distribution.
    '''
    from scipy.stats import t
    return t.ppf(1 - 1.0 * alfa / 2, n)
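# Quick usage sketch: two-sided critical value at alfa = 0.05 with 10 degrees
# of freedom, assuming t_student above is in scope.
crit = t_student(10, 0.05)
print(crit)  # approximately 2.23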
def _t(u, rho, nu):
    """
    Generates values of the T copula

    Inputs:
    u -- u is an N-by-P matrix of values in [0,1], representing N
         points in the P-dimensional unit hypercube.
    rho -- a P-by-P correlation matrix.
    nu -- degrees of freedom for T Copula

    Outputs:
    y -- the value of the T Copula
    """
    n = u.shape[0]
    p = u.shape[1]

    loIntegrationVal = -40
    lo = np.full((1, p), loIntegrationVal)  # more accuracy, but slower :/
    hi = t.ppf(u, nu)

    mu = np.zeros(p)
    y = np.zeros(n)
    for ii in np.arange(n):
        x = hi[ii, :]
        x[x < -40] = -40
        p = mvt.mvstdtprob(lo[0], x, rho, nu)
        y[ii] = p
    return y
def predicate(cls, tasks, user_id, cost):
    if len(tasks) < 3:
        return None, None, None, None, None

    # use only same user tasks?
    same_user_tasks = filter_user_id(tasks, user_id)
    if len(same_user_tasks) > 3:
        tasks = same_user_tasks

    # use only same cost tasks?
    same_cost_tasks = filter_cost(tasks, cost)
    if len(same_cost_tasks) > 3:
        tasks = same_cost_tasks

    # use only last N tasks
    tasks = tasks[-8:]

    sample = np.array([x['actualWorkTime'] / x['cost'] for x in tasks])
    n = sample.size
    mu = np.mean(sample)
    s2 = np.var(sample, ddof=1)
    t45 = sci_t.ppf(0.95, n - 1)
    mlow, mhigh = mu + np.array([-t45, t45]) * (np.sqrt(s2) / np.sqrt(n))
    chi45a = sci_chi2.ppf(0.95, n - 1)
    shigh = np.sqrt((n - 1) * s2 / chi45a)
    low, high = mlow - shigh, mhigh + shigh
    return (mlow + mhigh) / 2 * cost, mlow * cost, mhigh * cost, low * cost, high * cost
def _interval(self, X, alpha, pred):
    """
    Helper for computing prediction/confidence intervals.
    """
    # Comments from QR decomposition solution to Ax = y:
    #
    # Rather than A'A we have R from the QR decomposition of A, but
    # R'R equals A'A.  Note that R is not upper triangular since we
    # have already multiplied it by the permutation matrix, but it
    # is invertible.  Rather than forming the product R'R which is
    # ill-conditioned, we can rewrite x' inv(A'A) x as the equivalent
    # x' inv(R) inv(R') x = t t', for t = x' inv(R)
    #
    # We have since switched to an SVD solver, which gives us
    #
    #    invC = A' A  = (USV')' USV' = VSU' USV' = V S S V'
    #    C = inv(A'A) = inv(VSSV') = inv(V') inv(S S) inv(V)
    #      = V inv(S S) V' = V inv(S) inv(S) V'
    #
    # Substituting, we get
    #
    #    x' inv(A'A) x = t t', for t = x' V inv(S)
    #
    # Since x is a vector, t t' is the inner product sum(t**2).
    # Note that LAPACK allows us to do this simultaneously for many
    # different x using sqrt(sum(T**2, axis=1)), with T = X' V inv(S).
    #
    # Note: sqrt(F(1-a;1,df)) = T(1-a/2;df)
    #
    from scipy.stats import t  # lazy import in case scipy not present
    y = np.dot(X, self.x).ravel()
    s = t.ppf(1 - alpha / 2, self.DoF) * self.rnorm / np.sqrt(self.DoF)
    t = np.dot(X, self._SVinv)
    dy = s * np.sqrt(pred + np.sum(t**2, axis=1))
    return y, dy
def accept(self):
    self.con = float(self.con_edit.text())
    first_data = []
    second_data = []
    samples = self.appropriate[self.currentGroup]
    group_values, counts = self.dataset.GetNumericValues(self.currentVar)
    for i in range(len(group_values)):
        if self.dataset.GetValue(self.currentGroup, i + 1) == samples[0]:
            element = group_values[i]
            first_data.append(element)
        elif self.dataset.GetValue(self.currentGroup, i + 1) == samples[1]:
            element = group_values[i]
            second_data.append(element)
    self.t_score, self.pvalue = ttest_ind(first_data, second_data,
                                          equal_var=self.equal_variances)
    mean1 = sum(first_data) / len(first_data)
    mean2 = sum(second_data) / len(second_data)
    self.means = {samples[0]: mean1, samples[1]: mean2}
    if len(first_data) < len(second_data):
        self.df = len(first_data) - 1
    else:
        self.df = len(second_data) - 1
    if self.radio_noteq.isChecked():
        pass
    elif self.radio_greater.isChecked():
        self.pvalue /= 2
    elif self.radio_less.isChecked():
        self.pvalue /= 2
    self.P_obs = t.ppf(1 - self.con, self.df)
def solve(self):
    de = t.ppf(0.05, 24)
    result = (7.73 - 8) / (0.77 / np.sqrt(25))
    if de <= result:
        return [24, round(result, 2), True]
    else:
        return [24, round(result, 2), False]
def sampling_distribution():
    fig, ax = plt.subplots(1, 1)

    # display the probability density function
    df = 10
    x = np.linspace(t.ppf(0.01, df), t.ppf(0.99, df), 100)
    ax.plot(x, t.pdf(x, df))

    # simulate the sampling distribution
    y = []
    for i in range(1000):
        r = norm.rvs(loc=5, scale=2, size=df + 1)
        rt = (np.mean(r) - 5) / np.sqrt(np.var(r) / df)
        y.append(rt)
    ax.hist(y, density=True, alpha=0.2)  # 'density' replaces the deprecated 'normed'
    plt.savefig('sampling_distribution.png')
def confidence_int(self, conf_level=95):
    """
    Calculate the confidence interval of the mean time measured.

    Parameters
    ----------
    conf_level: float
        Confidence level desired for the confidence interval, in percent.
        This is transformed into the quantile needed to get the t value
        from the t distribution. Default is a 95% confidence interval.

    Returns
    -------
    lower_mean : float
        lower confidence interval boundary
    mean : float
        mean value
    upper_mean : float
        upper confidence interval boundary
    """
    # calculate quantile from confidence level in percent
    t_quantile = 1 - (1 - conf_level / 100.0) / 2.0
    # get t value from distribution
    t_val = t.ppf(t_quantile, self.n - self.ddof)
    # calculate standard error for estimated values
    std_err = self.stdev / np.sqrt(self.n)
    lower_mean = self.mean - t_val * std_err
    upper_mean = self.mean + t_val * std_err
    return lower_mean, self.mean, upper_mean
def confidence_interval(standard_deviation, observations, confidence):
    confidence_fraction = 1 - (100 - float(confidence)) / 200
    if observations > 30:
        total_length_of_confidence_interval = (standard_deviation * 2 *
            norm.ppf(confidence_fraction) / np.sqrt(observations))
    else:
        total_length_of_confidence_interval = (standard_deviation * 2 *
            t.ppf(confidence_fraction, observations) / np.sqrt(observations))
    return total_length_of_confidence_interval
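# A quick usage sketch for confidence_interval; 'confidence' is given in percent,
# and the numbers below are purely illustrative.
width = confidence_interval(standard_deviation=2.5, observations=12, confidence=95)
print(width)  # total width of the (t-based) 95% confidence interval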
def filter_tvals(self, alpha):
    """
    Utility function to set tvalues with an absolute value smaller than
    the absolute value of the alpha (critical) value to 0.

    Parameters
    ----------
    alpha : scalar
        critical value to determine which tvalues are associated with
        statistically significant parameter estimates

    Returns
    -------
    filtered : array
        n*k; new set of n tvalues for each of k variables where absolute
        tvalues less than the absolute value of alpha have been set to 0.
    """
    alpha = np.abs(alpha) / 2.0
    n = self.n
    critical = t.ppf(1 - alpha, n - 1)
    subset = (self.tvalues < critical) & (self.tvalues > -1.0 * critical)
    tvalues = self.tvalues.copy()
    tvalues[subset] = 0
    return tvalues
def regression_analysis(self, key, info):
    '''
    Calculates all the values we will need for simple linear regression
    analysis, and does the analysis itself.
    '''
    # not the most efficient, but we want to keep these values
    # to calculate standard errors
    info = list(info)

    # calculate sums
    sumx, sumy, sumxx, sumyy, sumxy, n = (0.0, 0.0, 0.0, 0.0, 0.0, 0)
    for (x, y) in info:
        sumx += x
        sumy += y
        sumxx += x * x
        sumyy += y * y
        sumxy += x * y
        n += 1

    # calculate correlation
    corr = 0
    corr_denom = math.sqrt((n * sumxx - sumx**2) * (n * sumyy - sumy**2))
    if corr_denom < 0.0001:
        yield False, "Could not calculate coefficients"
    corr_num = n * sumxy - sumx * sumy
    corr = corr_num / corr_denom
    if abs(corr) < 0.0001:
        yield False, "Could not calculate coefficients"

    # calculate regression coefficients
    beta1 = (sumxy - sumx * sumy / n) / (sumxx - sumx**2 / n)
    beta0 = (sumy - beta1 * sumx) / n

    # calculate standard errors
    # (fitted values use the x values, and the residuals are squared)
    y_reals = [y for (x, y) in info]
    y_hats = [beta0 + beta1 * x for (x, y) in info]
    s_num = sum([(y - yhat) ** 2 for (y, yhat) in zip(y_reals, y_hats)])
    s = math.sqrt(s_num / (n - 2))
    se_denom = n * sumxx - sumx**2
    se_beta0 = s * math.sqrt(sumxx / se_denom)
    se_beta1 = s * math.sqrt(n / se_denom)

    # calculate t-values
    t0 = beta0 / se_beta0
    t1 = beta1 / se_beta1

    # calculate 2-sided p-values
    alpha = 0.05
    t_stat = t.ppf(1 - alpha / 2, n - 2)
    beta0_p_value = t.sf(abs(t0), n - 2) * 2
    beta1_p_value = t.sf(abs(t1), n - 2) * 2

    # output most important values in a human-readable format
    print("Correlation: {}".format(corr))
    print("Beta 0: {}, p-value: {}".format(beta0, beta0_p_value))
    print("Beta 1: {}, p-value: {}".format(beta1, beta1_p_value))
def get_intervals(values, alpha):
    n = len(values)
    mean = sum(values) / n
    stddev = sqrt(sum(map(lambda x: (x - mean)**2, values)) / n)
    delta = t.ppf(1.0 - (1.0 - alpha) / 2, n - 1) * stddev / sqrt(n)
    lower = mean - delta
    upper = mean + delta
    return lower, upper
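# A quick usage sketch for get_intervals; here alpha is the confidence level
# (e.g. 0.95), the data are illustrative, and the function above is assumed in scope.
from math import sqrt
from scipy.stats import t

values = [10.2, 9.8, 10.5, 10.1, 9.9, 10.3]
lower, upper = get_intervals(values, alpha=0.95)
print(lower, upper)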
def conf_int(self, alpha=.05, cols=None, dispersion=None):
    '''
    Returns the confidence interval of the specified theta estimates.

    Parameters
    ----------
    alpha : float, optional
        The `alpha` level for the confidence interval, i.e.
        `alpha` = .05 returns a 95% confidence interval.
    cols : tuple, optional
        `cols` specifies which confidence intervals to return.

    Returns : array
        Each item contains [lower, upper]

    Example
    -------
    >>> import numpy as np
    >>> from numpy.random import standard_normal as stan
    >>> import nipy.fixes.scipy.stats.models as SSM
    >>> x = np.hstack((stan((30,1)), stan((30,1)), stan((30,1))))
    >>> beta = np.array([3.25, 1.5, 7.0])
    >>> y = np.dot(x, beta) + stan((30))
    >>> model = SSM.regression.OLSModel(x, hascons=False).fit(y)
    >>> model.conf_int(cols=(1,2))

    Notes
    -----
    TODO:
    tails : string, optional
        `tails` can be "two", "upper", or "lower"
    '''
    if cols is None:
        lower = self.theta - t.ppf(1 - alpha / 2, self.df_resid) * \
            np.diag(np.sqrt(self.vcov(dispersion=dispersion)))
        upper = self.theta + t.ppf(1 - alpha / 2, self.df_resid) * \
            np.diag(np.sqrt(self.vcov(dispersion=dispersion)))
    else:
        lower = []
        upper = []
        for i in cols:
            lower.append(self.theta[i] - t.ppf(1 - alpha / 2, self.df_resid) *
                         np.diag(np.sqrt(self.vcov(dispersion=dispersion)))[i])
            upper.append(self.theta[i] + t.ppf(1 - alpha / 2, self.df_resid) *
                         np.diag(np.sqrt(self.vcov(dispersion=dispersion)))[i])
    return np.asarray(list(zip(lower, upper)))  # list() so this also works on Python 3
def equal_stdev(self, alpha):
    n1, n2, y1, y2 = self.n1, self.n2, self.y1, self.y2
    Sp = np.sqrt(((n1 - 1) * self.S1**2 + (n2 - 1) * self.S2**2) / (n1 + n2 - 2))
    t0 = (y1 - y2) / (Sp * np.sqrt(1. / n1 + 1. / n2))

    # hypothesis testing
    H1a = t.ppf(1 - alpha / 2., n1 + n2 - 2) < np.abs(t0)
    H1b = t.ppf(1 - alpha, n1 + n2 - 2) < t0
    H1c = t.ppf(alpha, n1 + n2 - 2) > t0

    # p-value
    p1a = t.sf(np.abs(t0), n1 + n2 - 2) * 2
    p1b = t.sf(t0, n1 + n2 - 2)
    p1c = t.cdf(t0, n1 + n2 - 2)

    c1 = y1 - y2 - t.ppf(1 - alpha / 2., n1 + n2 - 2) * Sp * np.sqrt(1. / n1 + 1. / n2)
    c2 = y1 - y2 + t.ppf(1 - alpha / 2., n1 + n2 - 2) * Sp * np.sqrt(1. / n1 + 1. / n2)
    return H1a, H1b, H1c, p1a, p1b, p1c, (c1, c2)
# List 6-6  Test for the difference of population means (population variances unknown):
# Welch's approximation / t-test
import math
import numpy as np
from scipy.stats import t

X = [75, 70, 89, 65, 95, 82, 62, 77, 90, 58]
Y = [58, 75, 80, 70, 66, 63, 70, 76, 82, 65]
m = len(X)
n = len(Y)
meanX = np.average(X)
meanY = np.average(Y)
sX = np.std(X, ddof=1)  # sample standard deviation of X
sY = np.std(Y, ddof=1)  # sample standard deviation of Y

# compute nu (Welch-Satterthwaite degrees of freedom)
nu = (((sX**2) / m + (sY**2) / n)**2) / \
     (((((sX**2) / m)**2) / (m - 1)) + ((((sY**2) / n)**2) / (n - 1)))
nuasta = round(nu)

tt = (meanX - meanY) / math.sqrt((sX**2) / m + (sY**2) / n)
t_lower = t.ppf(0.025, nuasta)  # percentile of the t distribution with nu* degrees of freedom
t_upper = t.ppf(0.975, nuasta)  # percentile of the t distribution with nu* degrees of freedom
print('t=', tt.round(4), 'reject=', (tt < t_lower) or (t_upper < tt))

# to compute the p-value for t
p = t.cdf(-np.abs(tt), nuasta) * 2
print('p-value=', p.round(4))

# Output:
# t= 1.2376 reject= False
# p-value= 0.2349
def consumption_method_1_month_cycle(product): product_data = product.consumptiondata_set.all() arr = [] for p in product_data: if p.consumptionQty != None: arr.append(p.consumptionQty) data_np_arr = np.array(arr) code = product.code desc = product.description price = float(product.price) planned_qty_2_month_mean = float(product.planned_Qty_2Month_Mean) planned_qty_1_month_mean = planned_qty_2_month_mean / 2 planned_qty_1_month_order_cost = price * planned_qty_1_month_mean planned_qty_1_month_order_cost_thousands = planned_qty_1_month_order_cost / 1000 annual_plan_cost = planned_qty_1_month_order_cost * 12 if arr != []: p_sum = data_np_arr.sum() p_mean = data_np_arr.mean() p_length = len(data_np_arr) x2 = [] for value in data_np_arr: x2.append(pow(value, 2)) x2_np_arr = np.array(x2) x2_sum = x2_np_arr.sum() square_n = (pow(p_sum, 2)) / (p_length) if p_length == 1: std_error = 0 else: var = ((x2_sum) - (square_n)) / (p_length - 1) std = np.sqrt(var) std_error = np.float(std) / np.sqrt(p_length) pb_t_test = 0.95 df = p_length - 1 t_statistic = t.ppf(pb_t_test, df) confidence_interval = t_statistic * std_error deviation_from_mean = (confidence_interval / float(p_mean)) * 100 amc = float(p_mean) amc_in_packs = amc no_of_stock_outs = 0 amc_adjusted_for_stock_outs = amc_in_packs / ( 1 - (no_of_stock_outs / 30.5)) percentage_change_in_consumption = deviation_from_mean min_amc = amc_in_packs * ( (100 - percentage_change_in_consumption) / 100) max_amc = amc_in_packs * ( (100 + percentage_change_in_consumption) / 100) poisson_mode_quantity = round(amc_in_packs) poisson_mode_qty_adjusted_for_changes_in_use = poisson_mode_quantity safety_stock = poisson_mode_qty_adjusted_for_changes_in_use * 0.5 qty_to_procure = poisson_mode_qty_adjusted_for_changes_in_use * ( 0.5 + 1) + safety_stock eff_qty_to_procure = qty_to_procure calc_1_month_cycle_qty_to_procure_adjusted_for_losses = eff_qty_to_procure calculated_1_month_cycle_cost_of_procurement = price * calc_1_month_cycle_qty_to_procure_adjusted_for_losses calculated_1_month_cycle_cost_of_procurement_thousand = calculated_1_month_cycle_cost_of_procurement / 1000 consumption_annual_procurement_cost = 12 * calculated_1_month_cycle_cost_of_procurement budget_deficit_in_plan = calculated_1_month_cycle_cost_of_procurement - float( planned_qty_1_month_order_cost) if calculated_1_month_cycle_cost_of_procurement != 0: percentage_available_funding = float( planned_qty_1_month_order_cost) / ( calculated_1_month_cycle_cost_of_procurement / 100) else: percentage_available_funding = np.nan return dict([('code', code), ('desc', desc), ('price', price), ('planned_qty_1_month_mean', planned_qty_1_month_mean), ('planned_qty_1_month_order_cost', planned_qty_1_month_order_cost), ('planned_qty_1_month_order_cost_thousands', planned_qty_1_month_order_cost_thousands), ('annual_plan_cost', annual_plan_cost), ('amc_in_packs', np.round(amc_in_packs, 2)), ('amc_adjusted_for_stock_outs', np.round(amc_adjusted_for_stock_outs, 2)), ('percentage_change_in_consumption', np.round(percentage_change_in_consumption, 2)), ('min_amc', np.round(min_amc, 2)), ('max_amc', np.round(max_amc, 2)), ('poisson_mode_quantity', poisson_mode_quantity), ('safety_stock', safety_stock), ('qty_to_procure', qty_to_procure), ('calculated_1_month_cycle_cost_of_procurement', calculated_1_month_cycle_cost_of_procurement), ('calculated_1_month_cycle_cost_of_procurement_thousand', calculated_1_month_cycle_cost_of_procurement_thousand), ('consumption_annual_procurement_cost', consumption_annual_procurement_cost), 
('budget_deficit_in_plan', budget_deficit_in_plan), ('percentage_available_funding', np.round(percentage_available_funding, 2))]) else: return dict([ ('code', code), ('desc', desc), ('price', price), ('planned_qty_1_month_mean', planned_qty_1_month_mean), ('planned_qty_1_month_order_cost', planned_qty_1_month_order_cost), ('planned_qty_1_month_order_cost_thousands', planned_qty_1_month_order_cost_thousands), ('annual_plan_cost', annual_plan_cost), ('amc_in_packs', None), ('amc_adjusted_for_stock_outs', None), ('percentage_change_in_consumption', None), ('min_amc', None), ('max_amc', None), ('poisson_mode_quantity', None), ('safety_stock', None), ('qty_to_procure', None), ('calculated_1_month_cycle_cost_of_procurement', None), ('calculated_1_month_cycle_cost_of_procurement_thousand', None), ('consumption_annual_procurement_cost', None), ('budget_deficit_in_plan', None), ('percentage_available_funding', None) ])
def evaluate(self, current_configuration: Configuration, experiment: Experiment): """ Return number of measurements to finish Configuration or 0 if it finished. In other case - compute result as average between all experiments. :param current_configuration: instance of Configuration class :param experiment: instance of 'experiment' is required for experiment-awareness. :return: int min_tasks_per_configuration if Configuration was not measured at all or 1 if Configuration was not measured precisely or 0 if it finished """ tasks_data = current_configuration.get_tasks() if len(tasks_data) == 0: return 1 c_c_results = current_configuration.results c_s_results = experiment.get_current_solution().results c_c_results_l = [] c_s_results_l = [] for key in experiment.get_objectives(): c_c_results_l.append(c_c_results[key]) c_s_results_l.append(c_s_results[key]) if len(tasks_data) < self.min_tasks_per_configuration: if self.is_experiment_aware: ratios = [ cur_config_dim / cur_solution_dim for cur_config_dim, cur_solution_dim in zip( c_c_results_l, c_s_results_l) ] if all([ ratio >= ratio_max for ratio, ratio_max in zip(ratios, self.ratios_max) ]): return 0 return self.min_tasks_per_configuration - len(tasks_data) elif len(tasks_data) >= self.max_tasks_per_configuration: return 0 else: # Calculating standard deviation all_dim_std = current_configuration.get_standard_deviation() # The number of Degrees of Freedom generally equals the number of observations (Tasks) minus # the number of estimated parameters. degrees_of_freedom = len(tasks_data) - len(c_c_results_l) # Calculate the critical t-student value from the t distribution student_coefficients = [ t.ppf(c_l, df=degrees_of_freedom) for c_l in self.confidence_levels ] # Calculating confidence interval for each dimension, that contains a confidence intervals for # singular measurements and confidence intervals for multiple measurements. # First - singular measurements errors: conf_intervals_sm = [] for c_l, d_s_a, d_a_c, avg in zip(self.confidence_levels, self.device_scale_accuracies, self.device_accuracy_classes, c_c_results_l): d = sqrt((c_l * d_s_a / 2)**2 + (d_a_c * avg / 100)**2) conf_intervals_sm.append(c_l * d) # Calculation of confidence interval for multiple measurements: conf_intervals_mm = [] for student_coefficient, dim_skd in zip(student_coefficients, all_dim_std): conf_intervals_mm.append(student_coefficient * dim_skd / sqrt(len(tasks_data))) # confidence interval, or in other words absolute error absolute_errors = [] for c_i_ss, c_i_mm in zip(conf_intervals_sm, conf_intervals_mm): absolute_errors.append(sqrt(pow(c_i_ss, 2) + pow(c_i_mm, 2))) # Calculating relative error for each dimension relative_errors = [] for interval, avg_res in zip(absolute_errors, c_c_results_l): if not avg_res: # it is 0 or 0.0 # if new use-cases appear with the same behaviour. if interval == 0: avg_res = 1 # Anyway relative error will be 0 and avg will not be changed. else: return 1 relative_errors.append(interval / avg_res * 100) # Thresholds for relative errors that should not be exceeded for accurate measurement. 
thresholds = [] if self.is_experiment_aware: # We adapt thresholds objectives_minimization = experiment.get_objectives_minimization( ) for i in range(len(objectives_minimization)): if objectives_minimization[i]: if not c_s_results_l[i]: ratio = 1 else: ratio = c_c_results_l[i] / c_s_results_l[i] else: if not c_c_results_l[i]: ratio = 1 else: ratio = c_s_results_l[i] / c_c_results_l[i] adopted_threshold = \ self.base_acceptable_errors[i] \ + (self.max_acceptable_errors[i] - self.base_acceptable_errors[i]) \ / (1 + exp(- (10 / self.ratios_max[i]) * (ratio - self.ratios_max[i] / 2))) thresholds.append(adopted_threshold) else: # Or we don't adapt thresholds for acceptable_error in self.base_acceptable_errors: thresholds.append(acceptable_error) # Simple implementation of possible multi-dim Repeater decision making: # If any of resulting dimensions are not accurate - just terminate. for threshold, error in zip(thresholds, relative_errors): if error > threshold: return 1 return 0
UR2_sim = np.random.normal(UR20, sig_UR2, N)
R1_sim = np.random.normal(R10, sig_R1, N)
R2_sim = np.random.normal(R20, sig_R2, N)

# Compute the target quantity and its statistical characteristics
gamma02_sim = gamma01_sim*(1+alpha0*(temp1_sim-T_0))/(1+alpha0*(temp2_sim-T_0))*\
    (UR2_sim/UR1_sim)*((U10*R1_sim)/(U20*R2_sim))
Gmean = np.mean(gamma02_sim)
Gstd = np.std(gamma02_sim, ddof=1)
Gplot = np.arange(1 - 0.05, 1 + 0.05, 0.001)
fsim = norm.pdf(Gplot, Gmean, Gstd)
Fsim = norm.cdf(Gplot, Gmean, Gstd)

# Tolerance as a prediction interval (mean and variance unknown)
c1 = t.ppf((1 - GAMMA) / 2, N - 1)
c2 = t.ppf((1 + GAMMA) / 2, N - 1)
TGMC1 = Gstd * np.sqrt(1 + 1 / N) * (c2 - c1)
print(' ')
print('Tolerance range for Monte Carlo simulation with prediction interval: ',
      round(TGMC1, 4))

""" Graphical representation of the simulation """
fig = plt.figure(3, figsize=(12, 4))
fig.suptitle('Results of the statistical simulation')
ax1, ax2 = fig.subplots(1, 2)
ax1.plot(gamma02_sim, 'r+')
# ax1.axis([0, N, 2.35, 2.65])
ax1.set_xlabel('Sample $n$')
ax1.set_ylabel('Output voltage $U$ / V')
ax1.grid(True)
ax2.hist(gamma02_sim, int(np.sqrt(N)), density=True, facecolor='b')
def _two_sample_ttest_for_stacked_data(table, response_cols, factor_col, alternatives, first=None , second=None , hypo_diff=0, equal_vari='pooled', confi_level=0.95): if(type(table[factor_col][0]) != str): if(type(table[factor_col][0]) == bool): if(first != None): first = bool(first) if(second != None): second = bool(second) else: if(first != None): first = float(first) if(second != None): second = float(second) if(first == None or second == None): tmp_factors = [] if(first != None): tmp_factors += [first] if(second != None): tmp_factors += [second] for i in range(len(table[factor_col])): if(table[factor_col][i] != None and table[factor_col][i] not in tmp_factors): if(len(tmp_factors) == 2): raise Exception("There are more that 2 factors.") else: tmp_factors += [table[factor_col][i]] if(first == None): if(tmp_factors[0] != second): first = tmp_factors[0] else: first = tmp_factors[1] if(second == None): if(tmp_factors[0] != first): second = tmp_factors[0] else: second = tmp_factors[1] table_first = table[table[factor_col] == first] table_second = table[table[factor_col] == second] tmp_table = [] rb = BrtcReprBuilder() rb.addMD(strip_margin(""" ## Two Sample T Test for Stacked Data Result | - Hypothesized mean = {hypo_diff} | - Confidence level = {confi_level} """.format(hypo_diff=hypo_diff, confi_level=confi_level))) for response_col in response_cols: tmp_model = [] number1 = len(table_first[response_col]) number2 = len(table_second[response_col]) mean1 = (table_first[response_col]).mean() mean2 = (table_second[response_col]).mean() std1 = (table_first[response_col]).std() std2 = (table_second[response_col]).std() start_auto = 0 if(equal_vari == 'auto'): start_auto = 1 f_value = (std1 ** 2) / (std2 ** 2) f_test_p_value_tmp = stats.f.cdf(1 / f_value, number1 - 1, number2 - 1) if(f_test_p_value_tmp > 0.5): f_test_p_value = (1 - f_test_p_value_tmp) * 2 else: f_test_p_value = f_test_p_value_tmp * 2 if(f_test_p_value < 0.05): equal_vari = 'unequal' else: equal_vari = 'pooled' ttestresult = ttest_ind(table_first[response_col], table_second[response_col], 'larger', usevar=equal_vari, value=hypo_diff) if 'larger' in alternatives: ttestresult = ttest_ind(table_first[response_col], table_second[response_col], 'larger', usevar=equal_vari, value=hypo_diff) df = ttestresult[2] if(equal_vari == 'pooled'): std_number1number2 = sqrt(((number1 - 1) * (std1) ** 2 + (number2 - 1) * (std2) ** 2) / (number1 + number2 - 2)) margin = t.ppf((confi_level) , df) * std_number1number2 * sqrt(1 / number1 + 1 / number2) if(equal_vari == 'unequal'): margin = t.ppf((confi_level) , df) * sqrt(std1 ** 2 / (number1) + std2 ** 2 / (number2)) tmp_model += [['true difference in means > 0.0'] + [ttestresult[1]] + [(mean1 - mean2 - margin, math.inf)]] tmp_table += [['%s by %s(%s,%s)' % (response_col, factor_col, first, second)] + ['true difference in means > 0.0'] + ['t statistic, t distribution with %f degrees of freedom under the null hypothesis' % ttestresult[2]] + [ttestresult[0]] + [ttestresult[1]] + [confi_level] + [mean1 - mean2 - margin] + [math.inf]] if 'smaller' in alternatives: ttestresult = ttest_ind(table_first[response_col], table_second[response_col], 'smaller', usevar=equal_vari, value=hypo_diff) df = ttestresult[2] if(equal_vari == 'pooled'): std_number1number2 = sqrt(((number1 - 1) * (std1) ** 2 + (number2 - 1) * (std2) ** 2) / (number1 + number2 - 2)) margin = t.ppf((confi_level) , df) * std_number1number2 * sqrt(1 / number1 + 1 / number2) if(equal_vari == 'unequal'): margin = t.ppf((confi_level) , df) * 
sqrt(std1 ** 2 / (number1) + std2 ** 2 / (number2)) tmp_model += [['true difference in means < 0.0'] + [ttestresult[1]] + [(-math.inf, mean1 - mean2 + margin)]] tmp_table += [['%s by %s(%s,%s)' % (response_col, factor_col, first, second)] + ['true difference in means < 0.0'] + ['t statistic, t distribution with %f degrees of freedom under the null hypothesis' % ttestresult[2]] + [ttestresult[0]] + [ttestresult[1]] + [confi_level] + [-math.inf] + [mean1 - mean2 + margin]] if 'two-sided' in alternatives: ttestresult = ttest_ind(table_first[response_col], table_second[response_col], 'two-sided', usevar=equal_vari, value=hypo_diff) df = ttestresult[2] if(equal_vari == 'pooled'): std_number1number2 = sqrt(((number1 - 1) * (std1) ** 2 + (number2 - 1) * (std2) ** 2) / (number1 + number2 - 2)) margin = t.ppf((confi_level + 1) / 2 , df) * std_number1number2 * sqrt(1 / number1 + 1 / number2) if(equal_vari == 'unequal'): margin = t.ppf((confi_level + 1) / 2 , df) * sqrt(std1 ** 2 / (number1) + std2 ** 2 / (number2)) tmp_model += [['true difference in means != 0.0'] + [ttestresult[1]] + [(mean1 - mean2 - margin, mean1 - mean2 + margin)]] tmp_table += [['%s by %s(%s,%s)' % (response_col, factor_col, first, second)] + ['true difference in means != 0.0'] + ['t statistic, t distribution with %f degrees of freedom under the null hypothesis' % ttestresult[2]] + [ttestresult[0]] + [ttestresult[1]] + [confi_level] + [mean1 - mean2 - margin] + [mean1 - mean2 + margin]] result_model = pd.DataFrame.from_records(tmp_model) result_model.columns = ['alternative hypothesis', 'p-value', '%g%% confidence interval' % (confi_level * 100)] rb.addMD(strip_margin(""" | #### Data = {response_col} by {factor_col}({first},{second}) | - Statistics = t statistic, t distribution with {ttestresult2} degrees of freedom under the null hypothesis | - t-value = {ttestresult0} | | {result_model} | """.format(ttestresult2=ttestresult[2], response_col=response_col, factor_col=factor_col, first=first, second=second, ttestresult0=ttestresult[0], result_model=pandasDF2MD(result_model)))) if(start_auto == 1): equal_vari = 'auto' result = pd.DataFrame.from_records(tmp_table) result.columns = ['data', 'alternative_hypothesis', 'statistics', 'estimates', 'p_value', 'confidence_level', 'lower_confidence_interval', 'upper_confidence_interval'] model = dict() model['_repr_brtc_'] = rb.get() return {'out_table' : result, 'model' : model}
# ax.set(ylabel="Packets", xlabel='Consumers') # ax.grid(axis='y') # fig.xticks = 'Consumers' # fig.yticks = 'Hits' # plt.xticks(rotation=0) # ax2 = ax.twinx() # plot = ax2.plot(ax.get_xticks(), out_hits[['prodRec']], marker='.', markeredgecolor='black') # ax2.set_ylabel(r"Names") # plot[0].get_figure().savefig('data/' + output + '_content.png', bbox_inches='tight') # Average content received per consumer # Confidence content retrieval # 95 confidence interval bw simulations - hit_c[mode][simulations] confidence = 0.95 hits_c[mode] = [ sem(hit_c[mode][s]) * t.ppf((1 + confidence) / 2, len(hit_c[mode][s]) - 1) for s in range(simulations) ] # a_con = [hits[mode][i] / consum[i] for i in range(simulations)] # # Plot content retrieved # out_hits2 = pd.DataFrame({'hits': a_con}, index=consum) # out_hits2_c = pd.DataFrame({'hits': hits_c[mode]}, index=consum) # out_hits2 = out_hits2.sort_index() # out_hits2_c = out_hits2_c.sort_index() # out_hits2.to_csv('data/' + output + '_a_content.csv') # # Create figure and plot first axis # fig2 = plt.figure(figsize=[12, 8]) # ax = out_hits2.plot.bar(yerr=out_hits2_c, title="Content retrieved per consumer", ax=fig2.add_subplot(111)) # ax.set(ylabel="Packets", xlabel='Consumers') # ax.grid(axis='y')
def coeffs_criterias(yi, x, y):
    global b, m
    k = len(x[0])
    mx = [[] for i in range(len(x) + 1)]
    mx[0].append(k)
    for i in range(1, len(x) + 1):
        suma = round(sum(x[i - 1]), 5)
        mx[0].append(suma)
        mx[i].append(suma)
        for j in range(0, len(x)):
            mx[i].append(
                round(sum([round(x[i - 1][l] * x[j][l], 5) for l in range(k)]), 5))
    det = numpy.linalg.det(mx)
    delta = round(det, 5)
    my = [round(sum(yi), 5)]
    for i in range(len(x)):
        my.append(round(sum([yi[j] * x[i][j] for j in range(k)]), 5))
    b = [copy.deepcopy(mx) for i in range(len(x) + 1)]
    for i in range(len(x) + 1):
        for j in range(len(x) + 1):
            b[i][j][i] = my[j]
        b[i] = round(numpy.linalg.det(b[i]) / delta, 5)
        print("b" + str(i) + ": " + str(b[i]))
    S2 = []
    for i in range(len(y)):
        S2.append(sum([(y[i][j] - yi[i])**2 for j in range(len(y[i]))]))
        S2[i] = round(S2[i] / len(y[i]), 3)
    print("S2: " + str(S2))
    Gp = round(max(S2) / sum(S2), 3)
    print("Gp: " + str(Gp))
    f1 = m - 1
    f2 = k
    print("f1:" + str(f1))
    print("f2:" + str(f2))
    alpha = 0.05
    Gcr = round(cochran(f1, f2, alpha), 4)
    print("Gcr: " + str(Gcr))
    if Gp < Gcr:
        print("Cochran's C: OK")
    else:
        print("Cochran's C: :(")
        m += 1
        return generate_y(x)
    S2v = sum(S2) / 4
    S2b = round(S2v / (4 * m), 3)
    Sb = round(math.sqrt(S2b), 3)
    f3 = f1 * f2
    print("f3: " + str(f3))
    tcr = round(t.ppf(1 - alpha / 2, df=f3), 3)
    print("t: " + str(tcr))
    bs = []
    ts = []
    d = 0
    bs.append(round(sum([yi[j] for j in range(len(yi))]) / len(yi), 3))
    ts.append(round(bs[0] / Sb, 3))
    if ts[0] < 0:
        ts[0] *= -1
    if ts[0] > tcr:
        ts[0] = True
        d += 1
    else:
        ts[0] = False
    for i in range(len(x)):
        bs.append(
            round(sum([yi[j] * x[i][j] for j in range(len(yi))]) / len(yi), 3))
        ts.append(round(bs[i + 1] / Sb, 3))
        if ts[i + 1] < 0:
            ts[i + 1] *= -1
        if ts[i + 1] > tcr:
            ts[i + 1] = True
            d += 1
        else:
            ts[i + 1] = False
    print("Which b are significant: " + str(ts))
    f4 = k - d
    print("f4: " + str(f4))
    yj = []
    b0 = []
    for i in range(len(b)):
        if ts[i]:
            b0.append(b[i])
        else:
            b0.append(0)
    for j in range(k):
        yj.append(
            round(
                b0[0] + sum([x[i - 1][j] * b0[i] for i in range(1, len(b0))]), 3))
    print("yj: " + str(yj))
    S2ad = round(m * sum([(yj[i] - yi[i])**2 for i in range(4)]) / f4, 3)
    Fp = round(S2ad / S2v, 3)
    print("Fp: " + str(Fp))
    Fcr = round(f.ppf(1 - alpha, f4, f3), 1)
    print("Fcr: " + str(Fcr))
    if Fp < Fcr:
        print("F-criteria: OK")
    else:
        print("F-criteria: :(")
        start(x)
def main(n, m): x1min = -30 x1max = 0 x2min = 10 x2max = 60 x3min = 10 x3max = 35 x01 = (x1max + x1min) / 2 x02 = (x2max + x2min) / 2 x03 = (x3max + x3min) / 2 deltax1 = x1max - x01 deltax2 = x2max - x02 deltax3 = x3max - x03 xn = [[-1, -1, -1, +1, +1, +1, -1, +1, +1, +1], [-1, -1, +1, +1, -1, -1, +1, +1, +1, +1], [-1, +1, -1, -1, +1, -1, +1, +1, +1, +1], [-1, +1, +1, -1, -1, +1, -1, +1, +1, +1], [+1, -1, -1, -1, -1, +1, +1, +1, +1, +1], [+1, -1, +1, -1, +1, -1, -1, +1, +1, +1], [+1, +1, -1, +1, -1, -1, -1, +1, +1, +1], [+1, +1, +1, +1, +1, +1, +1, +1, +1, +1], [-1.73, 0, 0, 0, 0, 0, 0, 2.9929, 0, 0], [+1.73, 0, 0, 0, 0, 0, 0, 2.9929, 0, 0], [0, -1.73, 0, 0, 0, 0, 0, 0, 2.9929, 0], [0, +1.73, 0, 0, 0, 0, 0, 0, 2.9929, 0], [0, 0, -1.73, 0, 0, 0, 0, 0, 0, 2.9929], [0, 0, +1.73, 0, 0, 0, 0, 0, 0, 2.9929], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]] x1 = [x1min, x1min, x1min, x1min, x1max, x1max, x1max, x1max, -1.73 * deltax1 + x01, 1.73 * deltax1 + x01, x01, x01, x01, x01, x01] x2 = [x2min, x2min, x2max, x2max, x2min, x2min, x2max, x2max, x02, x02, -1.73 * deltax2 + x02, 1.73 * deltax2 + x02, x02, x02, x02] x3 = [x3min, x3max, x3min, x3max, x3min, x3max, x3min, x3max, x03, x03, x03, x03, -1.73 * deltax3 + x03, 1.73 * deltax3 + x03, x03] x1x2 = [0] * 15 x1x3 = [0] * 15 x2x3 = [0] * 15 x1x2x3 = [0] * 15 x1kv = [0] * 15 x2kv = [0] * 15 x3kv = [0] * 15 for i in range(15): x1x2[i] = x1[i] * x2[i] x1x3[i] = x1[i] * x3[i] x2x3[i] = x2[i] * x3[i] x1x2x3[i] = x1[i] * x2[i] * x3[i] x1kv[i] = x1[i] ** 2 x2kv[i] = x2[i] ** 2 x3kv[i] = x3[i] ** 2 list_for_a = round_matrix(list(zip(x1, x2, x3, x1x2, x1x3, x2x3, x1x2x3, x1kv, x2kv, x3kv))) planning_matrix_with_naturalized_coeffs_x = PrettyTable() planning_matrix_with_naturalized_coeffs_x.title = 'Матриця планування з натуралізованими коефіцієнтами X' planning_matrix_with_naturalized_coeffs_x.field_names = ['X1', 'X2', 'X3', 'X1X2', 'X1X3', 'X2X3', 'X1X2X3', 'X1X1', 'X2X2', 'X3X3'] planning_matrix_with_naturalized_coeffs_x.add_rows(list_for_a) print(planning_matrix_with_naturalized_coeffs_x) Y = round_matrix( [[function(list_for_a[j][0], list_for_a[j][1], list_for_a[j][2]) for i in range(m)] for j in range(15)]) planning_matrix_y = PrettyTable() planning_matrix_y.title = 'Матриця планування Y' planning_matrix_y.field_names = ['Y1', 'Y2', 'Y3'] planning_matrix_y.add_rows(Y) print(planning_matrix_y) Y_average = [] for i in range(len(Y)): Y_average.append(np.mean(Y[i], axis=0)) print("Середні значення відгуку за рядками:") for i in range(15): print("{:.3f}".format(Y_average[i]), end=" ") dispersions = [] for i in range(len(Y)): a = 0 for k in Y[i]: a += (k - np.mean(Y[i], axis=0)) ** 2 dispersions.append(a / len(Y[i])) def find_known(num): a = 0 for j in range(15): a += Y_average[j] * list_for_a[j][num - 1] / 15 return a def a(first, second): a = 0 for j in range(15): a += list_for_a[j][first - 1] * list_for_a[j][second - 1] / 15 return a my = sum(Y_average) / 15 mx = [] for i in range(10): number_lst = [] for j in range(15): number_lst.append(list_for_a[j][i]) mx.append(sum(number_lst) / len(number_lst)) det1 = [ [1, mx[0], mx[1], mx[2], mx[3], mx[4], mx[5], mx[6], mx[7], mx[8], mx[9]], [mx[0], a(1, 1), a(1, 2), a(1, 3), a(1, 4), a(1, 5), a(1, 6), a(1, 7), a(1, 8), a(1, 9), a(1, 10)], [mx[1], a(2, 1), a(2, 2), a(2, 3), a(2, 4), a(2, 5), a(2, 6), a(2, 7), a(2, 8), a(2, 9), a(2, 10)], [mx[2], a(3, 1), a(3, 2), a(3, 3), a(3, 4), a(3, 5), a(3, 6), a(3, 7), a(3, 8), a(3, 9), a(3, 10)], [mx[3], a(4, 1), a(4, 2), a(4, 3), a(4, 4), a(4, 5), a(4, 6), a(4, 7), a(4, 8), a(4, 9), 
a(4, 10)], [mx[4], a(5, 1), a(5, 2), a(5, 3), a(5, 4), a(5, 5), a(5, 6), a(5, 7), a(5, 8), a(5, 9), a(5, 10)], [mx[5], a(6, 1), a(6, 2), a(6, 3), a(6, 4), a(6, 5), a(6, 6), a(6, 7), a(6, 8), a(6, 9), a(6, 10)], [mx[6], a(7, 1), a(7, 2), a(7, 3), a(7, 4), a(7, 5), a(7, 6), a(7, 7), a(7, 8), a(7, 9), a(7, 10)], [mx[7], a(8, 1), a(8, 2), a(8, 3), a(8, 4), a(8, 5), a(8, 6), a(8, 7), a(8, 8), a(8, 9), a(8, 10)], [mx[8], a(9, 1), a(9, 2), a(9, 3), a(9, 4), a(9, 5), a(9, 6), a(9, 7), a(9, 8), a(9, 9), a(9, 10)], [mx[9], a(10, 1), a(10, 2), a(10, 3), a(10, 4), a(10, 5), a(10, 6), a(10, 7), a(10, 8), a(10, 9), a(10, 10)]] det2 = [my, find_known(1), find_known(2), find_known(3), find_known(4), find_known(5), find_known(6), find_known(7), find_known(8), find_known(9), find_known(10)] beta = solve(det1, det2) print("\nОтримане рівняння регресії:") print("{:.3f} + {:.3f} * X1 + {:.3f} * X2 + {:.3f} * X3 + {:.3f} * Х1X2 + {:.3f} * Х1X3 + {:.3f} * Х2X3" "+ {:.3f} * Х1Х2X3 + {:.3f} * X11^2 + {:.3f} * X22^2 + {:.3f} * X33^2 = ŷ" .format(beta[0], beta[1], beta[2], beta[3], beta[4], beta[5], beta[6], beta[7], beta[8], beta[9], beta[10])) y_i = [0] * 15 print("Експериментальні значення:") for k in range(15): y_i[k] = beta[0] + beta[1] * list_for_a[k][0] + beta[2] * list_for_a[k][1] + beta[3] * list_for_a[k][2] + \ beta[4] * list_for_a[k][3] + beta[5] * list_for_a[k][4] + beta[6] * list_for_a[k][5] + beta[7] * \ list_for_a[k][6] + beta[8] * list_for_a[k][7] + beta[9] * list_for_a[k][8] + beta[10] * list_for_a[k][ 9] for i in range(15): print("{:.3f}".format(y_i[i]), end=" ") start1 = time.time() print("\n\nПеревірка за критерієм Кохрена") Gp = max(dispersions) / sum(dispersions) Gt = 0.3346 print("Gp =", Gp) if Gp < Gt: print("Дисперсія однорідна") else: print("Дисперсія неоднорідна") end1 = time.time() start2 = time.time() print("\nПеревірка значущості коефіцієнтів за критерієм Стьюдента") sb = sum(dispersions) / len(dispersions) sbs = (sb / (15 * m)) ** 0.5 F3 = (m - 1) * n coefs1 = [] coefs2 = [] d = 11 res = [0] * 11 for j in range(11): t_pract = 0 for i in range(15): if j == 0: t_pract += Y_average[i] / 15 else: t_pract += Y_average[i] * xn[i][j - 1] res[j] = beta[j] if fabs(t_pract / sbs) < t.ppf(q=0.975, df=F3): coefs2.append(beta[j]) res[j] = 0 d -= 1 else: coefs1.append(beta[j]) print("Значущі коефіцієнти регресії:", [round(i, 3) for i in coefs1]) print("Незначущі коефіцієнти регресії:", [round(i, 3) for i in coefs2]) y_st = [] for i in range(15): y_st.append(res[0] + res[1] * x1[i] + res[2] * x2[i] + res[3] * x3[i] + res[4] * x1x2[i] + res[5] * x1x3[i] + res[6] * x2x3[i] + res[7] * x1x2x3[i] + res[8] * x1kv[i] + res[9] * x2kv[i] + res[10] * x3kv[i]) print("Значення з отриманими коефіцієнтами:") for i in range(15): print("{:.3f}".format(y_st[i]), end=" ") end2 = time.time() start3 = time.time() print("\n\nПеревірка адекватності за критерієм Фішера") Sad = m * sum([(y_st[i] - Y_average[i]) ** 2 for i in range(15)]) / (n - d) Fp = Sad / sb F4 = n - d print("Fp =", Fp) if Fp < f.ppf(q=0.95, dfn=F4, dfd=F3): print("Рівняння регресії адекватне при рівні значимості 0.05") else: print("Рівняння регресії неадекватне при рівні значимості 0.05") end3 = time.time() print('-----------------------------------------------------------------------------------------------------') time_cohren = end1 - start1 time_student = end2 - start2 time_fisher = end3 - start3 print("Час початку перевірки за критерієм Кохрена", start1) print("Час закінчення перевірки за критерієм Кохрена", end1) print("--- Час виконання перевірки 
за критерієм Кохрена: %s seconds ---" % time_cohren) print() print("Час початку перевірки за критерієм Стьюдента", start2) print("Час закінчення перевірки за критерієм Стьюдента", end2) print("--- Час виконання перевірки за критерієм Стьюдента: %s seconds ---" % time_student) print() print("Час початку перевірки за критерієм Фішера", start3) print("Час закінчення перевірки за критерієм Фішера", end3) print("--- Час виконання перевірки за критерієм Фішера: %s seconds ---" % time_fisher) print()
report_path = sys.argv[2] tables_path = sys.argv[3] # read report files for each ml algorithm df_dict = {'knn' : pd.read_csv(report_path+'precision_recall_knn_experiments.csv'), 'dt' : pd.read_csv(report_path+'precision_recall_dt_experiments.csv'), 'rf' : pd.read_csv(report_path+'precision_recall_rf_experiments.csv')} # generate LaTeX code and save in text file for ml in ['knn','dt','rf']: precision = df_dict[ml][[f'test{x} precision' for x in range(1,noOfTests+1)]] recall = df_dict[ml][[f'test{x} recall' for x in range(1,noOfTests+1)]] # calculate mean and 99.9% error interval for precision and recall df_dict[ml]['precision mean'] = precision.mean(axis = 1) df_dict[ml]['precision error'] = t.ppf(.999, noOfTests-1) * ( precision.std(axis = 1) / np.sqrt(noOfTests)) df_dict[ml]['recall mean'] = recall.mean(axis = 1) df_dict[ml]['recall error'] = t.ppf(.999, noOfTests-1) * ( recall.std(axis = 1) / np.sqrt(noOfTests)) # LaTeX code generation head = '\\begin{table}[htpb]\n\centering\n\\resizebox{\\textwidth}{!}{%\n' table = '\\begin{tabular}{l'+'c'*(noOfTests*2)+'cc}\n\cline{2-'+str(noOfTests*2+3)+'}\n' title1 = '\multicolumn{1}{c}{\\textbf{}} & ' title2 = '\\textbf{Features}' for i in range(1,noOfTests+1): title1 += '\multicolumn{2}{c}{\\textbf{Test '+str(i)+'}} & ' title2 += ' & \\textbf{Precision} & \\textbf{Recall}' title1 += '\multicolumn{2}{c}{\\textbf{\\begin{tabular}[c]{@{}c@{}}Confidence\\\\ Interval 99\%\end{tabular}}} \\\\ \hline \n' title2 += ' & \\textbf{Precision} & \\textbf{Recall} \\\\ \hline \hline \n' body = ''
(-1) + y8av8 * 1) / 8
beta7 = (y1av1 * (-1) + y2av2 * 1 + y3av3 * 1 + y4av4 * (-1) +
         y5av5 * 1 + y6av6 * (-1) + y7av7 * (-1) + y8av8 * 1) / 8

t0 = abs(beta0) / sbs
t1 = abs(beta1) / sbs
t2 = abs(beta2) / sbs
t3 = abs(beta3) / sbs
t4 = abs(beta4) / sbs
t5 = abs(beta5) / sbs
t6 = abs(beta6) / sbs
t7 = abs(beta7) / sbs

f3 = f1 * f2
ttabl = round(abs(t.ppf(q / 2, f3)), 4)
d = 8
if t0 < ttabl:
    print("t0 < ttabl, b0 is not significant")
    b0 = 0
    d = d - 1
if t1 < ttabl:
    print("t1 < ttabl, b1 is not significant")
    b1 = 0
    d = d - 1
if t2 < ttabl:
    print("t2 < ttabl, b2 is not significant")
    b2 = 0
    d = d - 1
if t3 < ttabl:
def mean_confidence_interval(data, confidence=0.95):
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), sem(a)  # sem uses ddof=1 (sample standard error) by default
    h = se * t.ppf((1 + confidence) / 2., n - 1)
    return m, m - h, m + h
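# A quick usage sketch for mean_confidence_interval, assuming the function above
# is in scope; the data below are illustrative.
import numpy as np
from scipy.stats import sem, t

data = [12.1, 11.8, 12.5, 12.0, 11.9, 12.3, 12.2]
m, lo, hi = mean_confidence_interval(data, confidence=0.95)
print(m, lo, hi)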
i += 1 log[j * 50].append(cost) #seconds #%% """ DATA MINING """ data = pd.DataFrame.from_dict(log).swapaxes(0, 1) mean = data.mean(numeric_only=True, axis=1) std = data.std(axis=1) data["Mean (sec.)"] = mean data["STD"] = std data.reset_index(inplace=True) data = data.rename(columns={'index': 'Size(D)'}) data = data[['Size(D)', 'Mean (sec.)', "STD"]] data["Sm"] = (data["STD"] / np.sqrt((data['Size(D)']))) data["h_95"] = (data["Sm"] * t.ppf((1 + 0.95) / 2, data["Size(D)"] - 1)) data["h_90"] = (data["Sm"] * t.ppf((1 + 0.90) / 2, data["Size(D)"] - 1)) data.plot.line(x="Size(D)", y="Mean (sec.)") #%% """ ISCOVER CHECKS IF IT IS COVER FOR THE SAKE OF CORRECTNESS MAIN_TEST GUARANTEES CORRECT INPUT """ def isCover(universe, Cover): for subset in Cover: universe = universe - subset
def get_student_value(f3, significance):
    from decimal import Decimal  # public module; _pydecimal is an implementation detail
    from scipy.stats import t
    return Decimal(abs(t.ppf(significance / 2, f3))).quantize(Decimal('.0001')).__float__()
Sb = sum(syList) / N
S = math.sqrt(Sb / (N * m))
bettaList = [
    sum([syList[i] * normValuesOfX0[i] for i in range(N)]) / N,
    sum([syList[i] * normValuesOfX1[i] for i in range(N)]) / N,
    sum([syList[i] * normValuesOfX2[i] for i in range(N)]) / N,
    sum([syList[i] * normValuesOfX3[i] for i in range(N)]) / N
]
bettaList = [round(i, 2) for i in bettaList]
tList = [bettaList[i] * S for i in range(N)]
for i in range(N):
    if tList[i] < t.ppf(q=0.975, df=f3):  # Student's t-criterion check using scipy
        bList[i] = 0
        d -= 1
        print('Excluding coefficient b' + str(i) + ' from the equation')
print("y = " + str(bList[0]) + ' + (' + str(bList[1]) + ") * x1 + (" +
      str(bList[2]) + ") * x2 + (" + str(bList[3]) + ") * x3")

# Fisher's criterion
print("=================Fisher's criterion=================")
f4 = N - d
S_ad = (m * sum([(bList[0] + bList[1] * x1List[i] + bList[2] * x2List[i] +
                  bList[3] * x3List[i] - avgYList[i])**2
                 for i in range(N)]) / f4)
Fp = S_ad / Sb
#Get predictions from training data Y_train_pred=[y(val[0]) for val in X_train] #Get degrees of freedom deg_f=len(Y_train_pred)-3 #Compute MSres from training data pred_true_df=pd.DataFrame({'Pred.':Y_train_pred, 'True':[val[0] for val in Y_train]}) pred_true_df['Resid_sqr']=pred_true_df.apply(lambda row: (row['Pred.']-row['True'])**2, axis=1) RSS=sum(pred_true_df['Resid_sqr']) MSres=RSS/deg_f #Get tc critical value from t distribution t_c=t.ppf(.025, df=deg_f) #Save training data for plotting Y_train_plot=[val[0] for val in Y_train] X_train_plot=[val[0] for val in X_train] #Propend column of 1s to data array and interpret as matrix X_train=np.asmatrix([[1,val[0],val[1]] for val in X_train]) X_train_T=X_train.transpose() C=np.dot(X_train_T,X_train).I #Upper confidence window for y def y_up(x): A=np.asmatrix([1,x,x**2]) se=math.sqrt(MSres*np.dot(np.dot(A,C),A.transpose())) return(y(x)+t_c*se)
# grid for the degrees of freedom parameter nu_vec_cop = np.arange(nu_min_copula, nu_max_copula + 1) l_ = len(nu_vec_cop) # initialize variables rho2_copula_vec = np.zeros((i_, i_, l_)) llike_nu = np.zeros(l_) epsi_tilde = np.zeros((t_, i_, l_)) db_estimation_copula = {} for l in range(l_): # calculate standardized invariants for i in range(i_): epsi_tilde[:, i, l] = tstu.ppf(u[:, i], nu_vec_cop[l]) # estimate copula parameters with maximum likelihood _, sig2 = \ fit_locdisp_mlfp_difflength(epsi_tilde[:, :, l], p=p_copula, nu=nu_vec_cop[l], threshold=10 ** -3, maxiter=1000) # shrinkage: factor analysis beta, delta2 = factor_analysis_paf(sig2, k_) sig2_fa = beta @ beta.T + np.diag(delta2) # compute correlation matrix rho2_copula_vec[:, :, l], _ = cov_2_corr(sig2_fa)
import numpy as np
from scipy.stats import t

melons = [7.72, 9.58, 12.38, 7.77, 11.27, 8.80, 11.10, 7.80, 10.17, 6.00]
melons = np.array(melons)

xbar = np.mean(melons)
s_x = np.std(melons, ddof=1)
alpha = 0.05
n = np.size(melons)

t_statistic = t.ppf(alpha / 2.0, n - 1)
confid_lower = xbar - abs(t_statistic) * s_x / np.sqrt(n)
confid_upper = xbar + abs(t_statistic) * s_x / np.sqrt(n)

print("T-statistic: ", t_statistic)
print("Confid. interval: ", [confid_lower, confid_upper])
# We compute the coeff. beta_0, beta_1 = np.linalg.lstsq(XX, yy, rcond=None)[0] beta = [beta_0, beta_1] # Calculate the SSE SSE = np.linalg.lstsq(XX, yy, rcond=None)[1] # We get confidence interval alpha = 0.05 x0 = np.linspace(7, 15, 50) X0 = np.array([np.ones(len(x0)), x0]).T aux_t_conf = np.sqrt(SSE / (n - p) * (np.diag(X0 @ np.linalg.inv(XX.T @ XX) @ X0.T))) yy0_hat = X0 @ np.array([beta_0, beta_1]) upp_conf = yy0_hat + t.ppf(1 - alpha / 2, n - p) * aux_t_conf low_conf = yy0_hat - t.ppf(1 - alpha / 2, n - p) * aux_t_conf # We get prediction interval aux_t_pred = np.sqrt(SSE / (n - p) * (1 + np.diag(X0 @ np.linalg.inv(XX.T @ XX) @ X0.T))) yy0_hat = X0 @ np.array([beta_0, beta_1]) upp_pred = yy0_hat + t.ppf(1 - alpha / 2, n - p) * aux_t_pred low_pred = yy0_hat - t.ppf(1 - alpha / 2, n - p) * aux_t_pred plt.figure(figsize=(10, 5)) plt.plot(dat[:, 0], yy, 'o', label='Original data', markersize=5) plt.plot(x0, beta_0 + beta_1 * x0, 'r', label='Fitted line') plt.fill_between(x0, low_pred, upp_pred,
def getStudentVal(f3, q):
    return Decimal(abs(t.ppf(q / 2, f3))).quantize(Decimal('.0001')).__float__()
def CI_vs_samples(distributions, samples_list, samples_to_plot): avgs_nr = 10000 fig, ax = plt.subplots(3, 3, figsize=(16, 15)) results = [] for idx, distribution in enumerate(distributions): CI_df_cols = [ 'n', 'Normal', 'Exp. Sigma', 'Sigma', 'CI_norm', 'CI_norm_score', 'CI_t', 'CI_t_score' ] CI_df = pd.DataFrame(columns=CI_df_cols) real_avg = np.mean(random_data_avg(distribution, 1000, avgs_nr)) # Repeat for each fo the samples number for i, samples in enumerate(samples_list): data_avgs = [] CI_norm_score = [] CI_t_score = [] # Repeat 10k for each samples number for j in range(avgs_nr): # Generates random data data = random_data(distribution, samples) data_avg = np.mean(data) data_avgs.append(data_avg) # Computes the CI assuming normal data_std = np.std(data) CI = 1.96 * data_std / np.sqrt(samples) lower = data_avg - CI upper = data_avg + CI CI_norm_score.append(lower <= real_avg <= upper) # Computes the CI assuming t-distribution confidence = 0.95 std_err = sem(data) h = std_err * t.ppf((1 + confidence) / 2, samples - 1) lower = data_avg - h upper = data_avg + h CI_t_score.append(lower <= real_avg <= upper) # Plots the histogram if samples in samples_to_plot: label = "n = {}".format(samples) color = color_lin_gradient( np.array([1, 0, 0]), np.array([0.2, 0, 1]), len(samples_to_plot))[samples_to_plot.index(samples)] ax[idx, 0].hist(data_avgs, bins=50, label=label, color=color) qqplot(np.array(data_avgs), fit=True, line='45', ax=ax[idx, 1], label=label, color=color) # Computes the std deviation is_normal = normaltest(data_avgs)[-1] > 0.05 data_std = np.std(random_data(distribution, samples)) expected_avgs_std = data_std / np.sqrt(samples) real_avgs_std = np.std(data_avgs) # Update the series and add to the dataframe CI_series = pd.Series(index=CI_df_cols, data=[ samples, is_normal, expected_avgs_std, real_avgs_std, 2 * CI, np.mean(CI_norm_score), 2 * h, np.mean(CI_t_score) ]) CI_df = CI_df.append(CI_series, ignore_index=True) # Plots the graphs ax[idx, 0].set_xlabel("Value") ax[idx, 0].set_ylabel("Count") ax[idx, 0].set_xlim(0, 100) ax[idx, 0].legend() ax[idx, 1].legend() if distribution == 'exponential': ax[idx, 1].set_xlim(-4, 4) ax[idx, 1].set_ylim(-4, 6) ax[idx, 2].set_xlabel("n") ax[idx, 2].set_ylabel("CI score") ax[idx, 2].plot(samples_list, CI_df['CI_norm_score'].values, 'o', label="Normal Approx (Eq.2)", color="crimson") ax[idx, 2].plot(samples_list, CI_df['CI_t_score'].values, 'o', label="T-Distr Approx (Eq.3)", color="blue") ax[idx, 2].plot(samples_list, len(samples_list) * [0.95], '--', label='Theoretical CI', color='black') ax[idx, 2].set_xscale('log') ax[idx, 2].legend() results.append(CI_df) return results
def calculate_fdc( input_ts="-", columns=None, start_date=None, end_date=None, clean=False, skiprows=None, index_type="datetime", names=None, percent_point_function=None, plotting_position="weibull", source_units=None, target_units=None, sort_values="ascending", sort_index="ascending", add_index=False, include_sd=False, include_cl=False, ci=0.9, ): """Return the frequency distribution curve.""" sort_values = bool(sort_values == "ascending") tsd = tsutils.common_kwds( tsutils.read_iso_ts(input_ts, skiprows=skiprows, names=names, index_type=index_type), start_date=start_date, end_date=end_date, pick=columns, source_units=source_units, target_units=target_units, clean=clean, ) ppf = tsutils.set_ppf(percent_point_function) newts = pd.DataFrame() for col in tsd: tmptsd = tsd[col].dropna() if len(tmptsd) > 1: xdat = ppf( tsutils.set_plotting_position(tmptsd.count(), plotting_position)) tmptsd.sort_values(ascending=sort_values, inplace=True) tmptsd.index = xdat * 100 tmptsd = pd.DataFrame(tmptsd) if include_sd is True or include_cl is True: sd = (xdat * (1 - xdat) / len(xdat))**0.5 if include_sd is True: tmptsd[col + "_sd"] = sd if include_cl is True: tval = t.ppf(ci, df=len(xdat) - 1) ul = 2 * (1 - xdat) * tval * sd ll = 2 * xdat * tval * sd tmptsd[col + "_ul"] = (xdat + ul) * 100 tmptsd[col + "_ll"] = (xdat - ll) * 100 tmptsd[col + "_vul"] = tmptsd[col] + ul * tmptsd[col] tmptsd[col + "_vll"] = tmptsd[col] - ll * tmptsd[col] else: tmptsd = pd.DataFrame() newts = newts.join(tmptsd, how="outer") newts.index.name = "Plotting_position" newts = newts.groupby(newts.index).first() if sort_index == "descending": return newts.iloc[::-1] if add_index is True: newts.reset_index(inplace=True) return newts
    ]) / N
b_1 = sum([
    globals()['y' + str(i + 1) + '_abs'] * matrix[i][1]
    for i in range(len(matrix))
]) / N
b_2 = sum([
    globals()['y' + str(i + 1) + '_abs'] * matrix[i][2]
    for i in range(len(matrix))
]) / N
b_3 = sum([
    globals()['y' + str(i + 1) + '_abs'] * matrix[i][3]
    for i in range(len(matrix))
]) / N

f3 = f1 * f2
t_kr = t.ppf(df=f3, q=(1 + 0.95) / 2)
d = 0
# Note: this list reuses the name `t`, shadowing scipy.stats.t from here on
t = [abs(globals()['b_' + str(i)]) / s_beta for i in range(N)]
for i in range(len(t)):
    if t[i] < t_kr:
        t[i] = 0
    else:
        t[i] = 1
        d += 1
print("b{} is insignificant".format([i for i in range(len(t)) if t[i] == 0]))

# Equations with significant coefficients
def student_value(f3, significance):
    # Two-sided critical value of the Student t-distribution with f3 degrees
    # of freedom, rounded to four decimal places.
    return float(Decimal(float(abs(t.ppf(significance / 2, f3)))).quantize(Decimal('.0001')))
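# Quick sanity check of the helper above (assuming `from decimal import Decimal`
# and `from scipy.stats import t` are in scope): with 8 degrees of freedom and a
# 5% significance level, the two-sided critical value is about 2.306.
print(student_value(8, 0.05))  # 2.306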
# We compute the coefficients
B, C, D = np.linalg.lstsq(XX, yy, rcond=None)[0]
params = [B, C, D]

# Calculate the SSE
SSE = np.linalg.lstsq(XX, yy, rcond=None)[1]

# We get the confidence interval
alpha = 0.05
x0 = np.linspace(-1.5, 3.5, 50)
X0 = np.vstack([x0**2, x0, np.ones(len(x0))]).T
aux_t_conf = np.sqrt(SSE / (n - p) *
                     (np.diag(X0 @ np.linalg.inv(XX.T @ XX) @ X0.T)))
yy0_hat = X0 @ np.array(params)
upp_conf = yy0_hat + t.ppf(1 - alpha / 2, n - p) * aux_t_conf
low_conf = yy0_hat - t.ppf(1 - alpha / 2, n - p) * aux_t_conf

# We get the prediction interval
aux_t_pred = np.sqrt(SSE / (n - p) *
                     (1 + np.diag(X0 @ np.linalg.inv(XX.T @ XX) @ X0.T)))
yy0_hat = X0 @ np.array(params)
upp_pred = yy0_hat + t.ppf(1 - alpha / 2, n - p) * aux_t_pred
low_pred = yy0_hat - t.ppf(1 - alpha / 2, n - p) * aux_t_pred

plt.figure(figsize=(7.5, 7.5))
plt.plot(points_x, points_y, 'o', label='Original data', markersize=5)
plt.plot(x0, B * x0**2 + C * x0 + D, 'r', label='Fitted parabola')
plt.fill_between(x0, low_pred, upp_pred,
def find_critvals(n: int, r: int, alpha: float) -> list:
    """Computes critical values :math:`\lambda_i` for the generalized
    extreme Studentized deviate (ESD) test.

    Parameters
    ----------
    n:
        Number of data points.
    r:
        Maximum number of outliers.
    alpha:
        Significance level for the statistical test.

    Returns
    -------
    :
        Critical values.

    Notes
    -----
    The :math:`\lambda_i` values are calculated as follows:

    .. math::

        \lambda_i = \\frac{ (n-i)\ t_{p, n-i-1} }
        { \sqrt{ (n-i-1+t_{p, n-i-1}^2)(n-i+1) } }
        \quad i \in \{1, 2, \dots, r\}

    .. math::

        p = 1 - \\frac{\\alpha}{2(n-i+1)}

    Where

    - :math:`n` : number of points in the array.
    - :math:`\\alpha` : significance level.
    - :math:`t_{p,v}` : percent point function of the t-distribution at
      :math:`p` value and :math:`v` degrees of freedom.
    - :math:`r` : maximum number of outliers.

    Example
    -------
    >>> from araucaria.stats import find_critvals
    >>> n = 54       # number of points
    >>> r = 5        # max number of outliers
    >>> alpha = 0.05 # significance level
    >>> lambd = find_critvals(n, r, alpha)
    >>> for val in lambd:
    ...     print('%1.3f' % val)
    3.159
    3.151
    3.144
    3.136
    3.128
    """
    critvals = []  # container for critical values
    for i in range(1, r + 1):
        p = 1 - (alpha / (2 * (n - i + 1)))
        # t value corresponding to the probability that a sample within the
        # data set is itself an outlying point
        tval = t.ppf(p, n - i - 1)
        val = ((n - i) * tval) / (((n - i - 1 + (tval**2)) * (n - i + 1))**(1 / 2))
        critvals.append(val)
    return critvals
import numpy as np
from scipy.stats import norm, chi2, t

# print("Laboratory work No. 8; Completed by: Фомина Дарья\n")
selections = [norm.rvs(size=20), norm.rvs(size=100)]
gamma = 0.95
for sel in selections:
    xm = np.mean(sel)
    s = np.sqrt(np.mean(sel*sel) - xm**2)
    n = len(sel)
    print(f'\n\tSample size: {n}')

    ct = t.ppf((1 + gamma) / 2, n - 1)
    chi_low = chi2.ppf((1 + gamma) / 2, n - 1)
    chi_high = chi2.ppf((1 - gamma) / 2, n - 1)
    print('\n\tClassical interval estimates')
    dx = s*ct*(n - 1)**(-0.5)
    print(f'm in ({xm-dx}; {xm+dx})')
    print(f's in ({s*(n/chi_low)**(0.5)}; {s*(n/chi_high)**(0.5)})')

    print('\n\tAsymptotic interval estimates')
    cu = norm.ppf((1 + gamma) / 2)
    dx = s * cu * (n**(-0.5))
    m4 = np.mean((sel - xm)**4)
    e = m4/(s**4) - 3
    U = cu*np.sqrt((e+2)/n)
    print(f'm in ({xm-dx}; {xm+dx})')
    print(f's in ({s*(1+U)**(-0.5)}; {s*(1-U)**(-0.5)})')
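# Optional cross-check sketch for the classical mean interval above, run in
# the same session so that sel, xm, s, n and gamma are still in scope: since
# s / sqrt(n - 1) equals the unbiased sample standard deviation divided by
# sqrt(n), scipy's t.interval should reproduce (xm - dx, xm + dx).
low, high = t.interval(gamma, n - 1, loc=xm, scale=s / np.sqrt(n - 1))
print(f'm in ({low}; {high})')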
def get_tdist_hw(x):
    # Half-width of a two-sided (1 - alpha) confidence interval for the mean;
    # `alpha` and `tDist` (scipy.stats.t) are assumed to be defined at module
    # level.  Note that np.std defaults to ddof=0; pass ddof=1 for the usual
    # sample standard deviation.
    n = len(x)
    return tDist.ppf(1 - alpha / 2.0, n - 1) * np.std(x) / np.sqrt(n)
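# Minimal usage sketch for get_tdist_hw with made-up data, assuming the module
# defines `alpha = 0.05` and `from scipy.stats import t as tDist` as the helper
# expects.
import numpy as np

x = np.array([9.8, 10.2, 10.1, 9.9, 10.4, 9.7])
hw = get_tdist_hw(x)
print(np.mean(x) - hw, np.mean(x) + hw)  # approximate 95% CI for the mean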
def filter_cells(adata: AnnData,
                 device="cpu",
                 p_level=None,
                 subset=True,
                 plot=False,
                 copy=False):
    """\
    Filter cells using the gene/molecule relationship.

    Code has been translated from the pagoda2 R function
    gene.vs.molecule.cell.filter.

    Parameters
    ----------
    adata
        Annotated data matrix.
    device
        Run gene and molecule counting on either `cpu` or on `gpu`.
    p_level
        Statistical confidence level for deviation from the main trend,
        used for cell filtering (default=min(1e-3,1/adata.shape[0])).
    subset
        If False, add a column `outlier` to adata.obs; otherwise subset
        the adata.
    plot
        Plot the molecule distribution and the gene/molecule dependency fit.
    copy
        Return a copy instead of writing to adata.

    Returns
    -------
    adata : anndata.AnnData
        If `copy=True` and `subset=True` it returns the subsetted adata
        (outliers removed), otherwise it adds fields to `adata`:

        `.obs['outlier']`
            whether a cell is an outlier.
    """

    adata = adata.copy() if copy else adata

    logg.info("Filtering cells", reset=True)
    X = adata.X.copy()

    logg.info(" obtaining gene and molecule counts")
    if device == "cpu":
        log1p_total_counts = np.log1p(np.array(X.sum(axis=1))).ravel()
        X.data = np.ones_like(X.data)
        log1p_n_genes_by_counts = np.log1p(np.array(X.sum(axis=1))).ravel()
    elif device == "gpu":
        import cupy as cp
        from cupyx.scipy.sparse import csr_matrix as csr_matrix_gpu

        X = csr_matrix_gpu(X)
        log1p_total_counts = cp.log1p(X.sum(axis=1)).get().ravel()
        X.data = cp.ones_like(X.data)
        log1p_n_genes_by_counts = cp.log1p(X.sum(axis=1)).get().ravel()

    df = pd.DataFrame(
        {
            "log1p_total_counts": log1p_total_counts,
            "log1p_n_genes_by_counts": log1p_n_genes_by_counts,
        },
        index=adata.obs_names,
    )

    logg.info(" fitting RLM")
    rlm_model = sm.RLM.from_formula(
        "log1p_n_genes_by_counts ~ log1p_total_counts",
        df,
    ).fit()

    p_level = min(1e-3, 1 / adata.shape[0]) if p_level is None else p_level

    SSE_line = ((df.log1p_n_genes_by_counts - rlm_model.predict())**2).sum()
    MSE = SSE_line / df.shape[0]
    z = t.ppf((p_level / 2, 1 - p_level / 2), df.shape[0])

    se = np.zeros(df.shape[0])
    get_SE(MSE, df.log1p_total_counts.values, se)
    pr = pd.DataFrame(
        {
            0: rlm_model.predict(),
            1: rlm_model.predict() + se * z[0],
            2: rlm_model.predict() + se * z[1],
        },
        index=adata.obs_names,
    )

    logg.info(" finished", time=True, end=" " if settings.verbosity > 2 else "\n")

    outlier = (df.log1p_n_genes_by_counts < pr[1]) | (df.log1p_n_genes_by_counts > pr[2])

    if plot:
        fig, ax = plt.subplots()
        idx = df.sort_values("log1p_total_counts").index
        ax.fill_between(
            df.log1p_total_counts[[idx[0], idx[-1]]],
            pr[1][[idx[0], idx[-1]]],
            pr[2][[idx[0], idx[-1]]],
            color="yellow",
            alpha=0.3,
        )
        df.loc[~outlier].plot.scatter(x="log1p_total_counts",
                                      y="log1p_n_genes_by_counts",
                                      c="k",
                                      ax=ax,
                                      s=1)
        df.loc[outlier].plot.scatter(x="log1p_total_counts",
                                     y="log1p_n_genes_by_counts",
                                     c="grey",
                                     ax=ax,
                                     s=1)

    if subset:
        adata._inplace_subset_obs(adata.obs_names[~outlier])
        logg.hint("subsetted adata.")
    else:
        adata.obs["outlier"] = outlier
        logg.hint("added \n"
                  " .obs['outlier'], boolean column indicating outliers.")

    return adata if copy else None
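# Hypothetical usage sketch (not from the original source), assuming an AnnData
# object `adata` with a sparse counts matrix is already loaded: flag outliers
# instead of dropping them, then inspect the counts.
filter_cells(adata, p_level=1e-3, subset=False, plot=True)
print(adata.obs["outlier"].value_counts())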
fplot = norm.pdf(xplot,xquer,s)
fig = plt.figure(1, figsize=(12, 4))
ax1, ax2 = fig.subplots(1,2)
ax1.hist(X, 10, density=True, facecolor='b')
ax1.plot(xplot,fplot,'r')
ax1.set_xlabel('Volume flow Q / m³/h')
ax1.set_ylabel('Probability density')
ax1.grid(False)
ax1.axis([0.48,0.54,0,80])
ax2.boxplot(X)
ax2.set_ylabel('Volume flow Q / m³/h')

""" Calculation and output of the parameters with confidence intervals """

gamma = 0.95
c1 = t.ppf((1-gamma)/2,N-1)
c2 = t.ppf((1+gamma)/2,N-1)
mu = round(xquer,3)
muc1 = round(xquer - c2*s/np.sqrt(N),3)
muc2 = round(xquer - c1*s/np.sqrt(N),3)

c1 = chi2.ppf((1-gamma)/2,N-1)
c2 = chi2.ppf((1+gamma)/2,N-1)
sig = round(s,3)
sigc1 = round(s*np.sqrt(N/c2),3)
sigc2 = round(s*np.sqrt(N/c1),3)

print(' ')
print('Confidence intervals')
print('Mean               : ', muc1, '<=', mu, '<=', muc2)
print('Standard deviation : ', sigc1, '<=', sig, '<=', sigc2)

""" Performing the hypothesis test """
print("\t Standard Error of the Mean = {:.5f}".format(std_err1)) # standard error (SE) of mean of sample2 std_err2 = s2_stdv / np.sqrt(n2) print("\nSample 2: \n\t Number of Observations = {} \n\t Mean = {:.5f}".format( n2, s2_mean)) print("\t Standard Deviation = {:.5f}".format(s2_stdv)) print("\t Standard Error of the Mean = {:.5f}".format(std_err2)) # calculation of t-statistic and degrees of freedom tstatistic, dof, sp = ttest_and_variance(s1_stdv, s2_stdv) print("\nt-statistic: {:.5f}".format(tstatistic)) # calculation of Critical values tcritical_l = t.ppf(q=los / 2, df=dof) tcritical_u = -tcritical_l print("\nCritical values are {:.5f}, {:.5f}".format(tcritical_l, tcritical_u)) # decision making: t-statistic and Critical values if tstatistic < tcritical_l or tstatistic > tcritical_u: print("Reject the Null hypothesis.") else: print("Fail to reject the Null hypothesis.") # calculation of p-value pvalue = 2 * t.cdf(tstatistic, df=dof) print("\np-value: {:.5f}".format(pvalue)) # decision making: p-value and level of significance if pvalue < los: print("Reject the Null hypothesis.")