def statistic_tests(name_of_file):
    """Print a series of statistical test results for a two-column file.

    The rows are loaded with ``read_file`` and distributed into two lists
    by ``fill_tables``. Each test is announced by a heading line followed
    by its result.

    Parameters
    ----------
    name_of_file
        Passed unchanged to ``read_file``.
    """
    first_column = []
    second_column = []
    rows = read_file(name_of_file)
    fill_tables(first_column, second_column, rows)

    print('Rank-Sum')
    rank_first = rank_sum(first_column)
    rank_second = rank_sum(second_column)
    print('ranksum column 1:', rank_first, 'column 2:', rank_second)

    print('Kruskal')
    outcome = kruskal(first_column, second_column)
    print(outcome)

    print('ANOVA')
    outcome = f_oneway(first_column, second_column)
    print(outcome)

    print('Brunner')
    outcome = brunnermunzel(first_column, second_column)
    print(outcome)

    print('Whitney')
    outcome = mannwhitneyu(first_column, second_column)
    print(outcome)

    print('Barlet')
    outcome = barlet_test(first_column, second_column)
    print(outcome)

    print('Levene')
    outcome = levene_test(first_column, second_column)
    print(outcome)

    print('Shapiro')
    shapiro_first = shapiro(first_column)
    shapiro_second = shapiro(second_column)
    print('shapiro column 1:', shapiro_first, 'column 2:', shapiro_second)

    print('T-Student')
    outcome = ttest_ind(first_column, second_column)
    print(outcome)

    print('Lilliefors')
    lillie_first = lilliefors(first_column)
    lillie_second = lilliefors(second_column)
    print('liliefors', 'column 1:', lillie_first, 'column 2:', lillie_second)
def test_normality(self):
    """Check Lilliefors and Anderson-Darling results against R reference values.

    The tests are run on ``self.res.resid``, on its square, and on its
    first 20 entries. The expected numbers are the hard-coded outputs of
    the R commands quoted in the ``#>`` comments.
    """
    resid = self.res.resid

    # --- Lilliefors (Kolmogorov-Smirnov) normality test -----------------
    #> library(nortest)
    #> lt = lillie.test(residuals(fm))
    #> mkhtest(lt, "lilliefors", "-")
    expected_lf_full = {
        'statistic': 0.0723390908786589,
        'pvalue': 0.01204113540102896,
        'parameters': (),
        'distr': '-',
    }

    #> lt = lillie.test(residuals(fm)**2)
    #> mkhtest(lt, "lilliefors", "-")
    expected_lf_squared = {
        'statistic': 0.301311621898024,
        'pvalue': 1.004305736618051e-51,
        'parameters': (),
        'distr': '-',
    }

    #> lt = lillie.test(residuals(fm)[1:20])
    #> mkhtest(lt, "lilliefors", "-")
    expected_lf_head = {
        'statistic': 0.1333956004203103,
        'pvalue': 0.455683,
        'parameters': (),
        'distr': '-',
    }

    got_lf_full = smsdia.lilliefors(resid)
    got_lf_squared = smsdia.lilliefors(resid**2)
    got_lf_head = smsdia.lilliefors(resid[:20])

    compare_t_est(got_lf_full, expected_lf_full, decimal=(14, 14))
    compare_t_est(got_lf_squared, expected_lf_squared, decimal=(14, 14))
    # The p-value is very small, so it is also checked with a relative
    # tolerance.
    assert_allclose(got_lf_squared[1], expected_lf_squared['pvalue'],
                    rtol=1e-10)
    # R uses a different approximation for the p-value in this last case,
    # hence only one decimal is compared.
    compare_t_est(got_lf_head, expected_lf_head, decimal=(14, 1))

    # --- Anderson-Darling normality test ---------------------------------
    #> ad = ad.test(residuals(fm))
    #> mkhtest(ad, "ad3", "-")
    expected_ad_full = {
        'statistic': 1.602209621518313,
        'pvalue': 0.0003937979149362316,
        'parameters': (),
        'distr': '-',
    }

    #> ad = ad.test(residuals(fm)**2)
    #> mkhtest(ad, "ad3", "-")
    # Kept as a record of the R output; only the infinite statistic is
    # asserted below.
    expected_ad_squared = {
        'statistic': np.inf,
        'pvalue': np.nan,
        'parameters': (),
        'distr': '-',
    }

    #> ad = ad.test(residuals(fm)[1:20])
    #> mkhtest(ad, "ad3", "-")
    expected_ad_head = {
        'statistic': 0.3017073732210775,
        'pvalue': 0.5443499281265933,
        'parameters': (),
        'distr': '-',
    }

    got_ad_full = smsdia.normal_ad(resid)
    compare_t_est(got_ad_full, expected_ad_full, decimal=(11, 13))

    got_ad_squared = smsdia.normal_ad(resid**2)
    assert_(np.isinf(got_ad_squared[0]))

    got_ad_head = smsdia.normal_ad(resid[:20])
    compare_t_est(got_ad_head, expected_ad_head, decimal=(11, 12))
def check_normality():
    """Demonstrate several normality tests on simulated normal data.

    Draws 1000 values from a normal distribution (mean 0, SD 3) with a
    fixed seed, shows a histogram and a probability plot, and computes the
    p-values of four normality tests for the full sample and for its first
    100 values. Both sets of p-values are printed.

    Returns
    -------
    The Kolmogorov-Smirnov p-value for the full sample.
    """
    # Set the parameters
    num_data = 1000
    my_mean = 0
    my_sd = 3

    # A fixed seed gives reproducible values
    np.random.seed(1234)

    # Generate and show random data
    data = stats.norm.rvs(my_mean, my_sd, size=num_data)
    few_data = data[:100]
    plt.hist(data)
    plt.show()

    # --- >>> START stats <<< ---
    # Graphical test: if the data lie on a line, they are pretty much
    # normally distributed
    _ = stats.probplot(data, plot=plt)
    plt.show()

    # An explicit dtype avoids the deprecated dtype-less empty Series and
    # keeps the p-values as floats instead of objects.
    p_vals = pd.Series(dtype=float)
    p_few_vals = pd.Series(dtype=float)

    # The scipy normaltest is based on D'Agostino and Pearson's test that
    # combines skew and kurtosis to produce an omnibus test of normality.
    _, p_vals['Omnibus'] = stats.normaltest(data)
    _, p_few_vals['Omnibus'] = stats.normaltest(few_data)

    # Shapiro-Wilk test
    _, p_vals['Shapiro-Wilk'] = stats.shapiro(data)
    _, p_few_vals['Shapiro-Wilk'] = stats.shapiro(few_data)

    # Or you can check for normality with the Lilliefors test
    _, p_vals['Lilliefors'] = lilliefors(data)
    _, p_few_vals['Lilliefors'] = lilliefors(few_data)

    # Alternatively with the original Kolmogorov-Smirnov test, applied to
    # the data standardised with the sample mean and sample SD
    _, p_vals['Kolmogorov-Smirnov'] = stats.kstest(
        (data - np.mean(data)) / np.std(data, ddof=1), 'norm')
    _, p_few_vals['Kolmogorov-Smirnov'] = stats.kstest(
        (few_data - np.mean(few_data)) / np.std(few_data, ddof=1), 'norm')

    print(f'p-values for all {len(data)} data points: ----------------')
    print(p_vals)
    print('p-values for the first 100 data points: ----------------')
    print(p_few_vals)

    if p_vals['Omnibus'] > 0.05:
        print('Data are normally distributed')
    # --- >>> STOP stats <<< ---

    return p_vals['Kolmogorov-Smirnov']
def check_normality(data: np.ndarray, show_flag: bool = True) -> pd.Series:
    """Check if the distribution is normal.

    Four normality tests are applied to ``data`` and to a subsample made
    of every 10th value; both sets of p-values are printed.

    Parameters
    ----------
    data : vector of data to be tested
    show_flag : if true, a probability plot of ``data`` is displayed

    Returns
    -------
    ps : Series of p-values for the full sample, indexed by test name
        ('Omnibus', 'Shapiro-Wilk', 'Lilliefors', 'Kolmogorov-Smirnov')
    """
    few_data = data[::10]

    # --- >>> START stats <<< ---
    # Graphical test: if the data lie on a line, they are pretty much
    # normally distributed
    if show_flag:
        _ = stats.probplot(data, plot=plt)
        plt.show()

    # An explicit dtype avoids the deprecated dtype-less empty Series and
    # keeps the p-values as floats instead of objects.
    p_vals = pd.Series(dtype=float)
    p_few_vals = pd.Series(dtype=float)

    # The scipy normaltest is based on D'Agostino and Pearson's test that
    # combines skew and kurtosis to produce an omnibus test of normality.
    _, p_vals['Omnibus'] = stats.normaltest(data)
    _, p_few_vals['Omnibus'] = stats.normaltest(few_data)

    # Shapiro-Wilk test
    _, p_vals['Shapiro-Wilk'] = stats.shapiro(data)
    _, p_few_vals['Shapiro-Wilk'] = stats.shapiro(few_data)

    # Or you can check for normality with the Lilliefors test
    _, p_vals['Lilliefors'] = lilliefors(data)
    _, p_few_vals['Lilliefors'] = lilliefors(few_data)

    # Alternatively with the original Kolmogorov-Smirnov test, applied to
    # the data standardised with the sample mean and sample SD
    _, p_vals['Kolmogorov-Smirnov'] = stats.kstest(
        (data - np.mean(data)) / np.std(data, ddof=1), 'norm')
    _, p_few_vals['Kolmogorov-Smirnov'] = stats.kstest(
        (few_data - np.mean(few_data)) / np.std(few_data, ddof=1), 'norm')

    print(f'p-values for all {len(data)} data points: ----------------')
    print(p_vals)
    # The subsample is every 10th value, not the first 100 values, so the
    # heading states what was actually tested.
    print(f'p-values for every 10th data point ({len(few_data)} points): '
          '----------------')
    print(p_few_vals)

    if p_vals['Omnibus'] > 0.05:
        print('Data are normally distributed')
    # --- >>> STOP stats <<< ---

    return p_vals
def test_normality(self):
    """Check Lilliefors and Anderson-Darling results against R reference values.

    The tests are run on ``self.res.resid``, on its square, and on its
    first 20 entries. The expected numbers are the hard-coded outputs of
    the R commands quoted in the ``#>`` comments.
    """
    resid = self.res.resid

    # --- Lilliefors (Kolmogorov-Smirnov) normality test -----------------
    #> library(nortest)
    #> lt = lillie.test(residuals(fm))
    #> mkhtest(lt, "lilliefors", "-")
    expected_lf_full = {
        'statistic': 0.0723390908786589,
        'pvalue': 0.01204113540102896,
        'parameters': (),
        'distr': '-',
    }

    #> lt = lillie.test(residuals(fm)**2)
    #> mkhtest(lt, "lilliefors", "-")
    expected_lf_squared = {
        'statistic': 0.301311621898024,
        'pvalue': 1.004305736618051e-51,
        'parameters': (),
        'distr': '-',
    }

    #> lt = lillie.test(residuals(fm)[1:20])
    #> mkhtest(lt, "lilliefors", "-")
    expected_lf_head = {
        'statistic': 0.1333956004203103,
        'pvalue': 0.20,
        'parameters': (),
        'distr': '-',
    }

    got_lf_full = smsdia.lilliefors(resid)
    got_lf_squared = smsdia.lilliefors(resid**2)
    got_lf_head = smsdia.lilliefors(resid[:20])

    compare_t_est(got_lf_full, expected_lf_full, decimal=(14, 14))
    compare_t_est(got_lf_squared, expected_lf_squared, decimal=(14, 14))
    # The p-value is very small, so it is also checked with a relative
    # tolerance.
    assert_allclose(got_lf_squared[1], expected_lf_squared['pvalue'],
                    rtol=1e-10)
    # R uses a different approximation for the p-value in this last case,
    # hence only one decimal is compared.
    compare_t_est(got_lf_head, expected_lf_head, decimal=(14, 1))

    # --- Anderson-Darling normality test ---------------------------------
    #> ad = ad.test(residuals(fm))
    #> mkhtest(ad, "ad3", "-")
    expected_ad_full = {
        'statistic': 1.602209621518313,
        'pvalue': 0.0003937979149362316,
        'parameters': (),
        'distr': '-',
    }

    #> ad = ad.test(residuals(fm)**2)
    #> mkhtest(ad, "ad3", "-")
    # Kept as a record of the R output; only the infinite statistic is
    # asserted below.
    expected_ad_squared = {
        'statistic': np.inf,
        'pvalue': np.nan,
        'parameters': (),
        'distr': '-',
    }

    #> ad = ad.test(residuals(fm)[1:20])
    #> mkhtest(ad, "ad3", "-")
    expected_ad_head = {
        'statistic': 0.3017073732210775,
        'pvalue': 0.5443499281265933,
        'parameters': (),
        'distr': '-',
    }

    got_ad_full = smsdia.normal_ad(resid)
    compare_t_est(got_ad_full, expected_ad_full, decimal=(11, 13))

    got_ad_squared = smsdia.normal_ad(resid**2)
    assert_(np.isinf(got_ad_squared[0]))

    got_ad_head = smsdia.normal_ad(resid[:20])
    compare_t_est(got_ad_head, expected_ad_head, decimal=(11, 12))
def normTest(self, p=0.05):
    """Pick a normality test by sample size and report the verdict.

    The test applied to ``self.data`` depends on ``len(self.data)``:

    * up to 20 values: Shapiro-Wilk
    * 21 to 50 values: D'Agostino-Pearson (``stats.normaltest``)
    * 51 to 300 values: Lilliefors
    * more than 300 values: Kolmogorov-Smirnov against ``'norm'``

    Parameters
    ----------
    p : float
        Significance level; normality is rejected when the test's p-value
        is below it.

    Returns
    -------
    bool
        ``False`` when normality is rejected, ``True`` otherwise.
    """
    n_obs = len(self.data)
    if 20 < n_obs <= 50:
        # D'Agostino-Pearson test, sample size 20-50
        p_value = stats.normaltest(self.data)[1]
        name = 'normaltest (D Agostino-Pearson)'
    elif n_obs <= 20:
        p_value = stats.shapiro(self.data)[1]
        name = 'shapiro'
    elif n_obs <= 300:
        # Hubert Lilliefors. lilliefors() returns (statistic, p-value);
        # only the p-value may be compared with the significance level.
        p_value = lilliefors(self.data)[1]
        name = 'lillifors'
    else:
        # NOTE(review): the data are compared with 'norm' without being
        # standardised first — confirm this is intended.
        p_value = stats.kstest(self.data, 'norm')[1]
        name = 'KStest'

    print('-' * 10, ' NORMAL TEST ', '-' * 10)
    print("USE: ", name)
    if p_value < p:
        print("Conclusion: data are not normally distributed")
        return False
    print("Conclusion: data are normally distributed")
    return True
def normal_test(sample, alpha=0.05, verbose=False):
    """Run five tests on ``sample`` and optionally print each verdict.

    The tests are called in this order: ``shapiro``, ``chisquare``,
    ``lilliefors``, ``kstest`` against ``'norm'`` and ``normaltest``.
    When ``verbose`` is true, each test prints whether its p-value
    exceeds ``alpha``.

    Returns
    -------
    tuple
        The p-value of the last test (``normaltest``) and ``alpha``.
    """

    def announce(label, p_value):
        # Print the verdict for one test; silent unless verbose.
        if not verbose:
            return
        if p_value > alpha:
            print(f'{label} this is Gaussian', p_value)
        else:
            print(f'{label} this is NOT Gaussian', p_value)

    # Shapiro-Wilk
    _, p = shapiro(sample)
    announce('Shapiro', p)

    # chisquare
    _, p = chisquare(sample)
    announce('Chisquare', p)

    # lilliefors
    _, p = lilliefors(sample)
    announce('Lilliefors', p)

    # kolmogorov
    _, p = kstest(sample, 'norm')
    announce('Kolmogorov', p)

    # D'Agostino (spelled 'Angostino' in the printed label)
    _, p = normaltest(sample)
    announce('Angostino', p)

    return p, alpha
def load_analysis(data, L, method, plot=True):
    """Compute per-day errors, tabulate them per step and report on them.

    ``data`` is grouped by ``data.date.dt.day`` and every group's
    ``L.column`` values are passed to ``calculate_daily_error``. The
    stacked result is indexed with three axes; the last axis is the step,
    and the first two are flattened into one DataFrame column per step,
    labelled from 1.

    Parameters
    ----------
    data
        Object with a datetime ``date`` attribute and a ``groupby`` method.
    L
        Object providing ``column`` and ``N``; passed on to the helpers.
    method
        Callable passed to ``calculate_daily_error``; its ``__name__`` is
        used in the printed report and the plot helpers.
    plot : bool
        When true, the statistics are printed and the plot helpers run.

    Returns
    -------
    pandas.DataFrame
        One column per step holding the flattened errors.
    """
    daily_errors = np.asarray([
        calculate_daily_error(L, day_df[L.column], method)
        for _, day_df in data.groupby(data.date.dt.day)
    ])

    error_df = pd.DataFrame()
    for step in range(daily_errors.shape[2]):
        error_df[step + 1] = daily_errors[:, :, step].flatten()

    # NOTE(review): the original was collapsed onto one line; everything up
    # to plt.show() is assumed to belong to this branch — confirm.
    if plot:
        print("*" * 100)
        print(
            "Started load analysis with N={}, signal = {}, method = {}".format(
                L.N, L.column, method.__name__))
        print("*" * 100 + "\n")
        print("****** Statistics ******")
        print(error_df.describe())
        print("\n" + "*" * 30 + " Lilliefors " + "*" * 30)
        # lilliefors() returns (statistic, p-value). The previous code
        # printed element [1], the p-value, under the "test-statistic"
        # label; both values are now shown under their own names.
        ks_stat, p_value = lilliefors(daily_errors.flatten(), dist="norm")
        print("Lilliefors test-statistic:", ks_stat, "p-value:", p_value)
        estimate_rmse(error_df)
        plot_predictions(L, method)
        plot_boxplot(error_df, method.__name__)
        plot_daily_errors(daily_errors, method.__name__)
        plot_error_hist(daily_errors, method.__name__)
        plt.show()
    return error_df
def norm_test(self):
    """Pick a normality test by sample size and report the verdict.

    The test applied to ``self.data`` depends on ``len(self.data)``:

    * up to 20 values: Shapiro-Wilk
    * 21 to 50 values: D'Agostino-Pearson (``stats.normaltest``)
    * 51 to 300 values: Lilliefors
    * more than 300 values: Kolmogorov-Smirnov against ``'norm'``

    Normality is rejected when the p-value is below 0.05.

    Returns
    -------
    bool
        ``False`` when normality is rejected, ``True`` otherwise.
    """
    n_obs = len(self.data)
    if 20 < n_obs <= 50:
        # D'Agostino-Pearson test, sample size 20-50
        p_value = stats.normaltest(self.data)[1]
        name = 'normaltest'
    elif n_obs <= 20:
        p_value = stats.shapiro(self.data)[1]
        name = 'shapiro'
    elif n_obs <= 300:
        # Hubert Lilliefors. lilliefors() returns (statistic, p-value);
        # only the p-value may be compared with the significance level.
        p_value = lilliefors(self.data)[1]
        name = 'lillifors'
    else:
        # NOTE(review): the data are compared with 'norm' without being
        # standardised first — confirm this is intended.
        p_value = stats.kstest(self.data, 'norm')[1]
        name = 'KSTEST'

    # A single parenthesised argument prints the same text under both the
    # Python 2 print statement and the Python 3 print function.
    print("USE  " + name)
    if p_value < 0.05:
        print("DATA ARE NOT NORMALLY DISTRIBUTED")
        return False
    print("DATA ARE NORMALLY DISTRIBUTED")
    return True
def normality_test(self, test_type='ks'):
    """Run a normality test on every variable in ``self._variables``.

    Missing values are dropped from each column of ``self._data`` before
    testing.

    Parameters
    ----------
    test_type : str
        Which normality test to use: 'ks' calls ``lilliefors`` with
        ``pvalmethod='approx'``, 'sw' calls ``shapiro``.

    Returns
    -------
    pandas.DataFrame
        One row per variable with the columns 'statistic' and 'p-value'.

    Raises
    ------
    ValueError
        If ``test_type`` is neither 'ks' nor 'sw'.
    """
    if test_type != 'ks' and test_type != 'sw':
        raise ValueError(
            "Unknown normality test type. Possible values: 'ks' (Kolmogorov-Smirnov) ans 'sw' (Shapiro-Wilk)"
        )

    per_variable = {}
    for name in self._variables:
        observed = self._data[name].dropna()
        if test_type == 'ks':
            statistic, p_value = lilliefors(observed, pvalmethod='approx')
        else:
            statistic, p_value = shapiro(observed)
        per_variable[name] = [statistic, p_value]

    table = pd.DataFrame(per_variable, index=['statistic', 'p-value'])
    return table.T
def compute_and_print_lilliefors(all_data_raw, label, p_value):
    """Run the Lilliefors test on the last column and print the verdict.

    Parameters
    ----------
    all_data_raw
        Two-dimensional array; its last column is tested.
    label
        Name used in the printed message.
    p_value
        Threshold: a test p-value below it is reported as not normal.

    Returns
    -------
    The last column of ``all_data_raw``.
    """
    last_column = all_data_raw[:, -1]
    _, pval = lilliefors(last_column)
    message = (
        f'{label} IS NOT normally distributed (p={pval})'
        if pval < p_value
        else f'{label} IS normally distributed (p={pval})'
    )
    print(message)
    return last_column
def fit(self, x, dist='norm', pvalmethod='table'):
    """Run the Lilliefors test on ``x`` and store the result.

    The two values returned by ``lilliefors`` are saved as
    ``self._statistic`` and ``self._p``.

    Parameters
    ----------
    x : array_like, 1d
        Data to test.
    dist : str
        Forwarded to ``lilliefors`` as ``dist``.
    pvalmethod : str
        Forwarded to ``lilliefors`` as ``pvalmethod``.
    """
    outcome = lilliefors(x, dist=dist, pvalmethod=pvalmethod)
    self._statistic, self._p = outcome
def testy_norm(lista_gestosci, indeks):
    """Print four normality tests for one data series.

    Each result line starts with ``dane_10_lat[indeks][0]`` (a
    module-level name) followed by the test output for
    ``lista_gestosci[indeks]``.

    Parameters
    ----------
    lista_gestosci
        Indexable collection of samples.
    indeks
        Index of the sample to test, also used to look up its label.
    """
    nazwa = dane_10_lat[indeks][0]
    probka = lista_gestosci[indeks]

    print('Shapiro-Wilk')
    # Author's note: Shapiro-Wilk — the data are not normal
    print(nazwa, stats.shapiro(probka))

    print('Lilliefors')
    # Author's note (same wording as the Shapiro-Wilk one): not normal
    print(nazwa, lilliefors(probka))

    print('D’Agostino’s K^2 Test')
    # D'Agostino's K^2 test: sample looks Gaussian (fail to reject H0)
    print(nazwa, stats.normaltest(probka))

    print('Anderson-Darling test')
    # 'norm' is the distribution argument of anderson(); it used to sit
    # outside the call and was printed as a stray third value.
    print(nazwa, stats.anderson(probka, 'norm'))
def check_normality(testData, alpha=0.05):
    """Pick a normality test by sample size and report the verdict.

    The test depends on ``len(testData)``:

    * up to 20 values: Shapiro-Wilk
    * 21 to 49 values: D'Agostino-Pearson (``stats.normaltest``)
    * 50 to 300 values: Lilliefors
    * more than 300 values: Kolmogorov-Smirnov against ``'norm'``

    The statistic and p-value are printed, followed by the name of the
    test used and the conclusion.

    Parameters
    ----------
    testData
        Sample to test; must support ``len``.
    alpha : float
        Significance level; normality is rejected when the p-value is
        below it.

    Returns
    -------
    bool
        ``False`` when normality is rejected, ``True`` otherwise.
    """
    n_obs = len(testData)
    if 20 < n_obs < 50:
        # For 20 < sample size < 50 use normaltest
        statistic, p_value = stats.normaltest(testData)
        used = 'use normaltest'
    elif n_obs <= 20:
        # For the remaining samples below 50 use Shapiro-Wilk
        statistic, p_value = stats.shapiro(testData)
        used = "use shapiro:"
    elif n_obs <= 300:
        statistic, p_value = lilliefors(testData)
        used = "use lillifors:"
    else:
        # Uses the same `stats` name as the other branches, so the block
        # no longer also depends on a top-level `scipy` name.
        # NOTE(review): the data are compared with 'norm' without being
        # standardised first — confirm this is intended.
        statistic, p_value = stats.kstest(testData, 'norm')
        used = "use kstest:"

    print(statistic, p_value)
    print(used)
    if p_value < alpha:
        print('data are not normal distributed')
        return False
    print('data are normal distributed')
    return True
def secondtextchanged(self):
    """Summarise the numbers typed in ``plainTextEdit_11``.

    The text is split on whitespace and converted to floats. The count,
    mean ± standard deviation, skewness, kurtosis and four normality
    tests (Shapiro-Wilk, D'Agostino K-squared, Lilliefors, Jarque-Bera)
    are written to ``plainTextEdit_12``.

    Any failure (including unparsable input or a missing library) is
    printed and shown in a warning box; on success an information box is
    shown.
    """
    try:
        # Imports stay inside the try so that a missing package is
        # reported through the warning dialog like any other failure.
        import statistics
        from scipy.stats import jarque_bera
        from scipy.stats import kurtosis
        from scipy.stats import normaltest
        from scipy.stats import shapiro
        from scipy.stats import skew
        from statsmodels.stats.diagnostic import lilliefors

        xa = [float(x) for x in self.plainTextEdit_11.toPlainText().split()]

        # Only tests whose results are displayed are computed; the
        # previously computed chi-square and Kolmogorov-Smirnov results
        # were never shown.
        stat1, p1 = shapiro(xa)
        stat2, p2 = normaltest(xa)
        stat4, p4 = lilliefors(xa)
        stat5, p5 = jarque_bera(xa)
        val1 = round(skew(xa), 3)
        val2 = round(kurtosis(xa), 3)
        mm = f'{round(statistics.mean(xa), 3)} ± {round(statistics.stdev(xa), 3)}'

        text = f"Count of data is {len(xa)}\n\n"
        text += f"Mean ± standard deviation: {mm}\n\n"
        text += f"skewness = {val1}\n"
        text += f"kurtosis = {val2}\n\n"
        text += f"Shapiro-Wilk Test:\nstat= {round(stat1, 4)}, p-value= {round(p1, 4)}\n\n"
        text += f"D’Agostino’s K-squared test:\nstat= {round(stat2, 4)}, p-value= {round(p2, 4)}\n\n"
        text += f"Lilliefors Test for Normality:\nstat= {round(stat4, 4)}, p-value= {round(p4, 4)}\n\n"
        text += f"Jarque–Bera test for Normality:\nstat= {round(stat5, 4)}, p-value= {round(p5, 4)}\n\n"
        self.plainTextEdit_12.setPlainText(text)
    except Exception as e:
        # UI boundary: report the problem to the user instead of crashing.
        print(e)
        QMessageBox.warning(self, "Warning", f"The output not obtained because {e}")
        return
    QMessageBox.information(self, "Information", "The output data generated successfully")
def preprocess(self, test_size=0.3):
    """Clean, resample and optionally Box-Cox transform ``self.raw_df``.

    Sets ``self.step``, ``self.df`` (columns "ds" and "y"), ``self.index``,
    ``self.lilliefors_D`` and ``self.KS_stat_05``; when the Box-Cox step
    runs it also sets ``self.optimal_lambda`` and rewrites ``self.df.y``.

    Parameters
    ----------
    test_size : float
        Fraction of the resampled rows; ``self.index`` is that fraction
        of ``len(self.df)`` truncated to an int.
    """
    # cleaning the data
    df = self.raw_df.dropna().drop_duplicates()
    # more than 365 rows -> resample with rule "W", otherwise "D"
    if len(df) > 365:
        self.step = "W"
    else:
        self.step = "D"
    # aggregating data by summing within each resampling period
    df = df.resample(self.step).apply(sum)
    self.df = df.reset_index()
    self.df.columns = ["ds", "y"]
    self.index = int(len(self.df)*test_size)
    # ----------------------------------------
    #---------- Test of normality ------------
    # ----------------------------------------
    # Lilliefors Test (the p-value is unpacked but not used below)
    self.lilliefors_D, p = lilliefors(self.df.y)
    # Kolmogorov-Smirnov goodness-of-fit critical value, 1.36 / sqrt(n);
    # the message printed below refers to alpha = 0.05
    self.KS_stat_05 = 1.36 / len(self.df)**0.5
    if self.lilliefors_D > self.KS_stat_05:
        print("[ The H0 normality hypothesis at alpha = 0.05 is rejected ]")
        print("[ Lilliefors test statistic: {:.5f}, Kolmogorov-Smirnov ".format(self.lilliefors_D) + "critical value: {:.5f} ]".format(self.KS_stat_05))
        self.normalize = True
    # Box-Cox transformation
    # NOTE(review): self.normalize is assigned above only when normality is
    # rejected but is read here in every case — presumably it is
    # initialised elsewhere (e.g. in __init__); confirm.
    # NOTE(review): the original was collapsed onto one line; this block is
    # assumed to sit at method level rather than inside the if above.
    if self.normalize:
        # keep strictly positive values only
        self.df = self.df[self.df.y > 0]
        # lambda is fitted on all rows except the last self.index ones
        # NOTE(review): if self.index is 0 the slice [:-0] is empty — confirm
        # callers never use a test_size that small.
        x, self.optimal_lambda = stats.boxcox(self.df.y[:-self.index])
        print("[ Applying Box-Cox Transformation. Optimal lambda: {:.5f} ]".format(self.optimal_lambda))
        # the fitted lambda is then applied to the whole column
        self.df.y = stats.boxcox(self.df.y, self.optimal_lambda)
def lilliefors_kolmogorov(data):
    """Return the p-value of Lilliefors' test for ``data``.

    Lilliefors' test is a Kolmogorov-Smirnov test with estimated
    parameters. ``lilliefors`` is called with its default settings.

    Parameters
    ----------
    data : Array of sample data.

    Returns
    -------
    p-value : The p-value of the test.
    """
    _statistic, p_value = lilliefors(data)
    return p_value
def check_normality(data):
    """Collect Kolmogorov-Smirnov, Lilliefors and Shapiro-Wilk results.

    Returns
    -------
    dict
        Keyed by test name; each value is a dict with 'statistic', 'df'
        (``len(data)``) and 'pvalue'.
    """
    ks_result = kstest(data, 'norm')
    sw_result = shapiro(data)
    lf_result = lilliefors(data)
    n_obs = len(data)

    summary = {}
    summary['Kolmogorov-Smirnov'] = {
        'statistic': ks_result.statistic,
        'df': n_obs,
        'pvalue': ks_result.pvalue,
    }
    summary['Lilliefors'] = {
        'statistic': lf_result[0],
        'df': n_obs,
        'pvalue': lf_result[1],
    }
    summary['Shapiro-Wilk'] = {
        'statistic': sw_result.statistic,
        'df': n_obs,
        'pvalue': sw_result.pvalue,
    }
    return summary
def fit(self, x, dist='norm', pvalmethod='table'):
    """Run the Lilliefors test and store its statistic and p-value.

    The results are saved as ``self._statistic`` and ``self._p``.

    Parameters
    ----------
    x : array_like, 1d
        Data to test.
    dist : {'norm', 'exp'}, optional
        The assumed distribution.
    pvalmethod : {'approx', 'table'}, optional
        The method used to compute the p-value of the test statistic.
        In general, 'table' is preferred and makes use of a very large
        simulation. 'approx' is only valid for normality; if
        ``dist='exp'``, the table is always used. 'approx' uses the
        approximation formula of Dalal and Wilkinson, valid for
        p-values < 0.1. If the p-value is larger than 0.1, the table
        result is returned.
    """
    outcome = lilliefors(x, dist=dist, pvalmethod=pvalmethod)
    self._statistic, self._p = outcome
# Box plot of the 'NU_NOTA_MT' column grouped by 'TP_SEXO'
# (the y-axis label is Portuguese for "Mathematics score").
sns.boxplot(x=enem['TP_SEXO'], y=enem['NU_NOTA_MT'])
plt.xlabel("")
plt.ylabel("Nota de Matemática")
plt.show()
from scipy import stats
from statsmodels.stats import diagnostic
# Split the scores by the 'TP_SEXO' value, dropping rows with missing data.
sexo = enem[['TP_SEXO', 'NU_NOTA_MT']]
sexo_f = sexo.query('TP_SEXO == "F"').drop('TP_SEXO',axis=1).dropna()
sexo_m = sexo.query('TP_SEXO == "M"').drop('TP_SEXO',axis=1).dropna()
# Group sizes
print(sexo_f.shape[0])
print(sexo_m.shape[0])
# Lilliefors test for each group (each is a one-column DataFrame)
print('sexo_f:',diagnostic.lilliefors(sexo_f))
print('sexo_m:',diagnostic.lilliefors(sexo_m))
# Two-sided Mann-Whitney U test between the groups.
# NOTE(review): the result is neither stored nor printed — presumably this
# came from a notebook cell where the value was displayed; confirm.
stats.mannwhitneyu(sexo_f, sexo_m, alternative='two-sided')
# Same box plot grouped by 'Q025' (x-axis label: "Has internet at home?")
sns.boxplot(x=enem['Q025'], y=enem['NU_NOTA_MT'])
plt.xlabel("Tem internet em casa?")
plt.ylabel("Nota de Matemática")
plt.show()
# Split the scores by the 'Q025' answer ("Não" = no, "Sim" = yes).
internet = enem[['Q025', 'NU_NOTA_MT']]
internet_n = internet.query('Q025 == "Não"').drop('Q025',axis=1).dropna()
internet_s = internet.query('Q025 == "Sim"').drop('Q025',axis=1).dropna()
# Group sizes
print(internet_n.shape[0])
print(internet_s.shape[0])
columnData, alternative='greater') if print_pval == 1: print('Statistic = %.5f, p=%.20f' % (stat, p)) # p_total = p_total+p alpha = alpha_c #/100 #apply bonferroni correction for 100 tests if p > alpha: print( 'Samples are not significantly different (fail to reject H0)' ) else: print('Samples are significantly different (reject H0)') #check if individual distributions are normally distributed with lilliefors stat_lillie, p_lillie = lilliefors(columnData, pvalmethod='table') # print('Statistics=%.3f, p=%.3f' % (stat, p)) # interpret if p_lillie > 0.05: distribution = 'normal' #print('world ' + layer + ' looks Gaussian (fail to reject H0)') else: distribution = 'not normal' #print('world ' + layer + ' does not look Gaussian (reject H0)') #calculate medians of each random sample set medians = statistics.median(columnData) #store p values and statistic in a dataframe, with names of the compared layers if '139' in variable_geopark: layer = variable_geopark.replace('_stats_139', '') else:
# estatistica mu, std = scs.norm.fit(lwt) # Plot the PDF. xmin, xmax = pl.xlim() x = np.linspace(xmin, xmax, 100) p = scs.norm.pdf(x, mu, std) pl.plot(x, p, 'r--', linewidth=2) # Teste de hipotese de normalidade com 5% de significancia: # H0: A amostra provem de uma população normal # H1: A amostra nao provem de uma distribuicao normal # Testes de shapiro e lillefors: s = scs.shapiro(lwt) lil = lilliefors(lwt) pl.text(225, 0.018, 'Shapiro: '+str(round(s[1], 5) )+'\nLilliefors: '+str(round(lil[1], 5)), bbox=dict(facecolor='red', alpha=0.4), zorder=4 ) pl.savefig("imgs/lowbw/ajuste_normal_lwt.pdf") pl.show() bwt.hist(histtype='bar', density=True, ec='black', zorder=2) pl.xticks(range(709, 5010, 428)) pl.xlabel("Peso da mãe na época da última menstruação (lb)") pl.ylabel("Frequência") pl.title("Ajuste de modelo normal a variável bwt") pl.grid(axis='x') # pl.xticks(range(10,101,10))
def eval_lilliefors(data, name: str = ""):
    """Run the Lilliefors test for normality and report the outcome.

    The statistic and p-value are handed to ``print_hypothesis`` together
    with the module-level ``alpha`` and ``name``.
    """
    outcome = lilliefors(data, dist="norm")
    statistic, p_value = outcome
    print_hypothesis(alpha, name, statistic, p_value)
def plot_waveforms_wavelet_tranform(waveforms, cluster_ids=None, save_file=None, n_pc=4):
    """Scatter-plot pairs of the least-normal Haar wavelet coefficients.

    All waveforms are stacked and decomposed with ``pywt.wavedec``
    ('haar', along axis 1). Each coefficient column is tested with the
    Lilliefors test; the ``n_pc`` columns with the lowest p-values are
    kept, and every pair of them is drawn in its own subplot, one colour
    per entry of ``waveforms``.

    Parameters
    ----------
    waveforms
        Sequence of 2-D arrays that can be stacked with ``np.vstack``;
        rows of each array are plotted as one group.
    cluster_ids
        Legend labels, one per entry of ``waveforms``. Defaults to
        ``0 .. len(waveforms) - 1``.
    save_file
        If truthy, the figure is saved to this path.
    n_pc : int
        Number of coefficients to keep.

    Returns
    -------
    tuple
        ``(None, None)`` when ``save_file`` is given, otherwise
        ``(fig, axes)`` with ``axes`` shaped ``(n_rows, n_cols)``.
    """
    all_waves = np.vstack(waveforms)
    coeffs = pywt.wavedec(all_waves, 'haar', axis=1)
    all_coeffs = np.column_stack(coeffs)

    # Columns that cannot be tested keep p = 1 and therefore sort last.
    p_vals = np.ones((all_coeffs.shape[1], ))
    for i, coef in enumerate(all_coeffs.T):
        if len(np.unique(coef)) == 1:  # to avoid nans
            continue
        try:
            _, p_vals[i] = lilliefors(coef, dist='norm')
        except ValueError:
            continue

    # pick best coefficients as ones that are least normally distributed
    # that is lowest p-values from Lilliefors K-S test
    order = np.argsort(p_vals)
    best_coeffs = all_coeffs[:, order[:n_pc]]

    # Cut the stacked rows back into one chunk per original waveform array.
    data = []
    for wave in waveforms:
        n_rows_wave = wave.shape[0]
        data.append(best_coeffs[:n_rows_wave])
        best_coeffs = best_coeffs[n_rows_wave:]

    if cluster_ids is None:
        cluster_ids = list(range(len(waveforms)))

    colors = [plt.cm.jet(x) for x in np.linspace(0, 1, len(waveforms))]
    pairs = list(it.combinations(range(n_pc), 2))

    # Smallest square-ish grid that holds one subplot per pair.
    n_cols = 1
    while np.power(n_cols, 2) < len(pairs):
        n_cols += 1
    n_rows = int(np.ceil(len(pairs) / n_cols))

    # squeeze=False always yields a 2-D array of axes; without it a 1x1
    # grid (n_pc=2) returns a bare Axes object and the reshape below fails.
    fig, ax = plt.subplots(nrows=n_rows, ncols=n_cols,
                           figsize=(5 * (n_cols + 1), 5 * n_rows),
                           squeeze=False)
    ax = ax.reshape(ax.size)

    for i, p in enumerate(pairs):
        for x, y, z in zip(data, cluster_ids, colors):
            ax[i].scatter(x[:, p[0]], x[:, p[1]], s=3, alpha=0.5,
                          color=z, label=y, marker='o')
        ax[i].set_xlabel('Coefficient %i' % p[0])
        ax[i].set_ylabel('Coefficient %i' % p[1])

    handles, labels = ax[0].get_legend_handles_labels()
    if n_rows * n_cols > len(pairs):
        # A spare subplot exists: use it to hold the legend.
        ax[-1].set_axis_off()
        ax[-1].legend(handles, labels, loc='center', shadow=True)
    else:
        # Otherwise hang the legend below the middle plot of the last row.
        idx = int(((n_cols * (n_rows - 1)) - 1) + np.ceil(n_cols / 2))
        ax[idx].legend(handles, labels, ncol=len(pairs), loc='upper center',
                       bbox_to_anchor=(0.5, -0.05), shadow=True)

    fig.suptitle('Wavelet transform coefficients')
    if save_file:
        fig.savefig(save_file)
        return None, None
    return fig, ax.reshape((n_rows, n_cols))
# Minimal example: run the Lilliefors test on 500 draws from scipy's
# `norm` distribution with its default parameters.
from scipy.stats import norm
from statsmodels.stats.diagnostic import lilliefors
my_data = norm.rvs(size=500)
# NOTE(review): the result is not stored or printed — presumably meant for
# an interactive session where the value is echoed; confirm.
lilliefors(my_data)
def distribution_hist_outlier_trat(X, trat=False, val_trat='', folder_name=''):
    """Save a histogram with a fitted normal curve for every column of X.

    Each figure shows a density histogram, the normal PDF fitted with
    ``scs.norm.fit`` and the Shapiro-Wilk and Lilliefors p-values. A CSV
    with ``describe()`` statistics, skewness and kurtosis of all columns
    is written next to the figures.

    Parameters
    ----------
    X : pandas.DataFrame
        Data whose columns are plotted; column names must be strings.
    trat : bool
        If true, outlier treatment is active: only values strictly
        between ``val_trat[col][0]`` and ``val_trat[col][1]`` are plotted,
        and the values outside are saved to a CSV file.
    val_trat
        Mapping from column name to a (low, high) pair of thresholds.
    folder_name : str
        Sub-folder of ``imgs`` that receives the output. Directories are
        created only when it is not empty.
    """
    hist_dir = 'imgs/' + folder_name + '/hists'
    csv_dir = hist_dir + '/csv_trat'
    if folder_name != '':
        # Creates imgs/<folder>/hists/csv_trat with all missing parents and
        # tolerates existing directories; the previous mkdir calls silently
        # swallowed every error, including a missing ./imgs.
        os.makedirs('./' + csv_dir, exist_ok=True)

    trat_tex = '_trat' if trat else ''
    summary = None

    for atr in X.columns:
        atr_name = ''.join(atr.split())
        pl.figure()

        Y = X[atr]
        if trat:
            try:
                low, high = val_trat[atr][0], val_trat[atr][1]
                inside = X[atr][(X[atr] > low) & (X[atr] < high)]
                outside = X[atr][(X[atr] < low) | (X[atr] > high)]
                outside.to_csv("./" + csv_dir + "/outliers_" + atr_name
                               + "_" + str(val_trat[atr]) + ".csv",
                               header=False)
                Y = inside
            except Exception:
                # Best effort, as before: if the thresholds are missing or
                # the outliers cannot be written, plot the column untreated.
                Y = X[atr]

        Y.hist(histtype='bar', density=True, ec='black', zorder=2)
        min_ = int(round(Y.min() - 0.5))
        max_ = int(round(Y.max() + 0.5))
        # Roughly ten ticks, never a step below 1.
        step = round((max_ - min_) / 10 + 0.5)
        pl.xticks(range(min_, max_, max(1, step)))
        pl.xlabel(atr)
        pl.ylabel("Frequência")
        pl.title("Histograma " + atr)
        pl.grid(axis='x')

        # Fit a normal distribution and plot its PDF over the x range.
        mu, std = scs.norm.fit(Y)
        xmin, xmax = pl.xlim()
        x = np.linspace(xmin, xmax, 100)
        p = scs.norm.pdf(x, mu, std)
        pl.plot(x, p, 'r--', linewidth=2)

        # Normality hypothesis test at the 5% significance level:
        # H0: the sample comes from a normal population
        # H1: the sample does not come from a normal distribution
        # Shapiro and Lilliefors tests:
        s = scs.shapiro(Y)
        lil = lilliefors(Y)
        ymin, ymax = pl.ylim()
        pl.text(xmin + xmin * 0.01, ymax - ymax * 0.12,
                'Shapiro: ' + str(round(s[1], 5))
                + '\nLilliefors: ' + str(round(lil[1], 5)),
                bbox=dict(facecolor='red', alpha=0.4), zorder=4)

        # Accumulate the descriptive statistics, one column per attribute.
        described = Y.describe()
        if summary is None:
            summary = pd.DataFrame(described)
        else:
            summary.loc[list(described.index), atr] = described
        summary.loc['skewness', atr] = scs.skew(Y)
        summary.loc['kurtosis', atr] = scs.kurtosis(Y, fisher=False)

        pl.tight_layout()
        pl.savefig(hist_dir + "/" + atr_name + "_" + trat_tex + ".png")
        pl.close()

    # Nothing to summarise when X has no columns.
    if summary is not None:
        summary.to_csv(hist_dir + '/descricao_resumo' + trat_tex + '.csv')
# Plot the PDF. xmin, xmax = pl.xlim() x = np.linspace(xmin, xmax, 100) p = scs.norm.pdf(x, mu, std) pl.plot(x, p, 'r--', linewidth=2) print(mu, std) print(x) # Teste de hipotese de normalidade com 5% de significancia: # H0: A amostra provem de uma população normal # H1: A amostra nao provem de uma distribuicao normal # Testes de shapiro e lillefors: s = scs.shapiro(Y) lil = lilliefors(Y) ymin, ymax = pl.ylim() pl.text(xmin + xmin * 0.01, ymax - ymax * 0.12, 'Shapiro: ' + str(round(s[1], 5)) + '\nLilliefors: ' + str(round(lil[1], 5)), bbox=dict(facecolor='red', alpha=0.4), zorder=4) pl.tight_layout() pl.savefig('teste_hipotese_' + Y.name + '.png') pl.show() pl.close() pl.figure(figsize=(10, 8)) Y = dados.iloc[:, -1]
# Compare the p-values of a Kolmogorov-Smirnov test that uses parameters
# estimated from the sample with those of the Lilliefors test, on samples
# drawn from a normal distribution.
import numpy as np
from scipy import stats
from statsmodels.stats.diagnostic import lilliefors
import matplotlib.pyplot as plt
# One sample of 100 values (loc 0, scale 10); print both test results.
x = stats.norm.rvs(0, 10, size=100)
print(stats.kstest(x, 'norm', args=(np.mean(x), np.sqrt(np.var(x)))))
print(lilliefors(x, dist='norm'))
# Repeat on 1000 fresh samples, keeping only the p-values (element [1]).
p_ks, p_l = [], []
for i in range(1000):
    x = stats.norm.rvs(0, 10, size=100)
    p_ks.append(
        stats.kstest(x, 'norm', args=(np.mean(x), np.sqrt(np.var(x))))[1])
    p_l.append(lilliefors(x, dist='norm')[1])
# Plot the sorted p-values against 1000 evenly spaced points in [0, 1].
X = np.linspace(0, 1, 1000)
p_ks, p_l = sorted(p_ks), sorted(p_l)
lab = ['ks', 'lilliefors']
# plt.subplots returns a (figure, axes) pair, so `fig` holds that tuple.
fig = plt.subplots(figsize=(18, 12), dpi=400)
plt.plot(X, p_ks)
plt.plot(X, p_l)
plt.legend(lab)
# Read whitespace-separated rows from `arq` until the first line that
# yields no fields (a blank line or end of file).
while True:
    r = arq.readline().split()
    if r != []:
        husbands.append(r)
    else:
        break
# The first collected row supplies the column names; the rest is the data,
# converted to integers.
hus = pd.DataFrame(husbands[1:], columns=husbands[0])
hus = hus.astype(int)
# In[]
pl.figure()
# Husband's age: keep only positive values
par = hus.ageh[hus.ageh > 0]
lil = lilliefors(par)
par.hist(density=True, histtype='bar', ec='black', color='w')
#pl.text(19, 27, 'Shapiro: '+str(round(scs.shapiro(par)[1], 5) ) +'\nKS:'+str(round(scs.kstest(par, 'norm')[1], 5) ), bbox=dict(facecolor='red', alpha=0.1) )
pl.grid()
# Annotate the plot with the Shapiro-Wilk and Lilliefors p-values
# (element [1] of each result).
pl.text(
    17,
    0.031,
    'Shapiro: ' + str(round(scs.shapiro(par)[1], 4))
    #+'\nKS: '+str(round(scs.kstest(par, 'norm')[1], 4))
    + '\nLillie: ' + str(round(lil[1], 4)),
    bbox=dict(facecolor='red', alpha=0.1))
# Title and labels are Portuguese: "Husbands' age", "Age", "Probability".
pl.title('Idade dos Maridos')
pl.xlabel('Idade')
pl.ylabel('Probabilidade')
def L(x, alfa=0.05):
    """Return 0 if the Lilliefors test rejects normality of ``x``, else 1.

    Normality is rejected when the test's p-value is below ``alfa``.
    """
    _statistic, p_value = lilliefors(x, 'norm')
    return 0 if p_value < alfa else 1
# NOTE(review): bare expressions in this fragment have no effect when run
# as a script — presumably it came from a notebook where they are echoed.
y[1]
# Rows with Age above 35; histogram of a random sample of 50 salary hikes.
df40 = df[df.Age > 35]
df40.PercentSalaryHike.sample(50).hist()
# Anderson-Darling test on a random sample of 30 values.
y = anderson(df40.PercentSalaryHike.sample(30))
# presumably statistic, critical values and significance levels in
# percent (hence the division by 100) — verify against scipy's anderson
print(y[0])
print(y[1])
print(y[2] / 100)
# Shapiro-Wilk on the full columns and on a random sample of 50.
shapiro(df.Age)
shapiro(df.Age.sample(50))
shapiro(df40.PercentSalaryHike)
# Lilliefors on the full columns and on random samples of 50.
lilliefors(df.Age)
lilliefors(df.Age.sample(50))
lilliefors(df40.PercentSalaryHike)
lilliefors(df40.PercentSalaryHike.sample(50))
#Min/Max value testing on skewed dist - sampled
import seaborn as sns
from scipy.stats import probplot
import matplotlib.pyplot as plt
from scipy.stats import kurtosis
# Distribution of the Anderson-Darling statistic over 999 random samples
# of size 30 (range(1, 1000) yields 999 iterations).
anderson_statistic_30 = []
for i in range(1, 1000):
    anderson_statistic_30.append(
        anderson(df40.PercentSalaryHike.sample(30))[0])
sns.distplot(anderson_statistic_30)