def check_significance(gt_dist, seg_dist):
    """Grade how significantly two samples differ.

    Both samples are checked with ``normaltest``. If neither rejects
    normality at the 5% level an independent two-sample t-test is used,
    otherwise the Mann-Whitney U test.

    :param gt_dist: first sample (array-like)
    :param seg_dist: second sample (array-like)
    :return: ``0`` if p > 0.05, ``1`` if 0.01 < p <= 0.05,
        ``2`` if 0.001 < p <= 0.01, ``3`` if p <= 0.001.
        The raw p-value is returned only when it cannot be graded (NaN).
    """
    normal_gt = normaltest(gt_dist)[1]
    normal_seg = normaltest(seg_dist)[1]

    # Both samples look normal -> parametric test, otherwise rank-based test.
    if normal_gt > 0.05 and normal_seg > 0.05:
        pvalue = ttest_ind(gt_dist, seg_dist)[1]
    else:
        pvalue = mannwhitneyu(gt_dist, seg_dist)[1]

    # Closed upper bounds so p-values exactly on a threshold get a level
    # instead of falling through every branch.
    if pvalue > 0.05:
        return 0
    if pvalue > 0.01:
        return 1
    if pvalue > 0.001:
        return 2
    if pvalue <= 0.001:
        return 3
    # Only reachable when every comparison is False, i.e. pvalue is NaN.
    return pvalue
def same_mean(series_1, series_2, significance):
    """
    Check the variance and distribution and then make hypothesis test for the mean

    A t-test requires normally distributed samples with equal variances.
    Normality is checked with ``normaltest`` and equality of variances with
    Levene's test; if either requirement is rejected at ``significance`` the
    Wilcoxon rank-sum test is used instead.

    :param series_1: Pandas Series for first attribute
    :type series_1: pandas.Series
    :param series_2: Pandas Series for second attribute
    :type series_2: pandas.Series
    :param significance: Test significance (normally 5%)
    :type significance: float
    :return: ``(same, statistic, p_value)`` where ``same`` is True when the
        hypothesis of equal means is not rejected
    :rtype: tuple
    """
    normal_p_1 = normaltest(series_1)[1]
    normal_p_2 = normaltest(series_2)[1]
    # Exact float equality of two sample variances is practically never
    # true, so test the equal-variance hypothesis statistically instead.
    equal_var_p = stats.levene(series_1, series_2)[1]

    # A *large* p-value means the assumption is not rejected.
    t_test_applicable = (equal_var_p > significance and
                         normal_p_1 > significance and
                         normal_p_2 > significance)
    if t_test_applicable:
        result, p_value = stats.ttest_ind(series_1, series_2)
    else:
        result, p_value = stats.ranksums(series_1, series_2)

    # A small p value means the probability that values like the ones occur
    # given that both series have the same mean is small
    # -> They don't have the same mean
    if p_value <= significance:
        return False, result, p_value
    return True, result, p_value
def explore_city_data(city_data):
    """Calculate the Boston housing statistics.

    Prints size, feature count, min/max/mean/median price, standard
    deviation, inter-quartile range and the ``normaltest`` result for the
    prices, then draws a histogram of the prices on the current figure.

    :param city_data: object exposing ``data`` (feature matrix) and
        ``target`` (prices) attributes
    :return: None
    """
    # Get the labels and features from the housing data
    housing_prices = city_data.target
    housing_features = city_data.data

    # Size of data (number of houses) and number of features
    houses_dims = housing_features.shape
    print("The number of houses is " + str(houses_dims[0]))
    print("The number of features is " + str(houses_dims[1]))

    # Price range
    print("The minimum price of a house is " + str(housing_prices.min()) + " thousand.")
    print("The maximum price of a house is " + str(housing_prices.max()) + " thousand.")

    # Central tendency
    print("The average price of a house is " + str(round(housing_prices.mean(), 1)) + " thousand.")
    print("The median price of a house is " + str(np.median(housing_prices)) + " thousand.")

    # Spread
    print("The standard deviation of housing prices is " + str(round(housing_prices.std(), 1)) + " thousand.")
    q75, q25 = np.percentile(housing_prices, [75, 25])
    print("The IQR deviation of housing prices is " + str(q75 - q25))

    print(normaltest(housing_prices))

    plt.hist(housing_prices)
    plt.title("Boston Housing Prices")
    plt.xlabel("Prices in $1000s")
    plt.ylabel("Frequency")
def test_axis_None(self): # Test axis=None (equal to axis=0 for 1-D input) x = np.array((-2,-1,0,1,2,3)*4)**2 assert_allclose(mstats.normaltest(x, axis=None), mstats.normaltest(x)) assert_allclose(mstats.skewtest(x, axis=None), mstats.skewtest(x)) assert_allclose(mstats.kurtosistest(x, axis=None), mstats.kurtosistest(x))
def normality():
    """Run D'Agostino-Pearson normality tests and export them to Excel.

    Reads the formula from ``var_formula`` (``x1+x2`` or ``x1+x2~group``).
    Each ``x`` column of the NaN-free ``data`` frame is tested, either as a
    whole or once per level of the grouping column. The results are written
    to ``../../Analysis/Normality.xlsx`` and the file is opened.

    Problems with the formula are reported through ``print_status`` and
    abort the run without writing anything.
    """
    data_dropped_na = data.dropna()
    column_name = "D’Agostino and Pearson’s Normality Test"

    formula = var_formula.get()
    if formula == '':
        print_status('Warning: Please, specify column names in formula.', 'red')
        return

    formula_parts = formula.split('~')
    x_list = formula_parts[0].split('+')
    # The grouping column is optional: "x1+x2" has no right-hand side.
    y = formula_parts[1] if len(formula_parts) > 1 else None

    test_list = []
    p_value_list = []
    index_list = []
    for x in x_list:
        if x not in data_dropped_na.columns:
            print_status("Warning: No such continuous column.", 'red')
            return
        if y is not None and y not in data_dropped_na.columns:
            print_status('Warning: No such categorical column.', 'red')
            return
        if y is None:
            test, p_value = normaltest(data_dropped_na[x])
            test_list.append(test)
            p_value_list.append(p_value)
            index_list.append(x)
        else:
            # unique() keeps first-appearance order, so the rows of the
            # report are reproducible (set iteration order is not).
            for level in data_dropped_na[y].unique():
                test, p_value = normaltest(
                    data_dropped_na[data_dropped_na[y] == level][x])
                test_list.append(test)
                p_value_list.append(p_value)
                # %s formatting also works for non-string group labels.
                index_list.append('%s[%s]' % (x, level))

    df = pd.DataFrame({
        column_name: test_list,
        "p Value": p_value_list
    }, index=index_list)
    # The context manager saves and closes the workbook even on error.
    with pd.ExcelWriter('../../Analysis/Normality.xlsx') as writer:
        df.to_excel(writer, sheet_name='Sheet1', startcol=1)
    print_status('Status: Normality test performed', 'black')
    os.startfile('../../Analysis\\Normality.xlsx')
def compare_best_to_baseline(X_train, y_train, X_test, y_test, base_estimator,
                             best_estimator):
    """Compare the baseline model to the best predictor on the test set.

    Runs 10-fold cross-validation of both estimators on the test data,
    prints normality tests of the fold scores, their Pearson correlation and
    an independent t-test, then prints the plain (non-CV) test MSE of each
    already-fitted estimator.

    :param X_train: unused, kept for interface compatibility
    :param y_train: unused, kept for interface compatibility
    :param X_test: DataFrame holding at least the columns in ``feature_list``
    :param y_test: test targets
    :param base_estimator: fitted baseline model using only ``ln_cum_char``
    :param best_estimator: fitted model using all columns in ``feature_list``
    :return: ``(y_test_base, y_test_best)`` predictions of both models
    """
    # Seed the generator; assigning to ``random.seed`` would replace the
    # function instead of seeding.
    random.seed(1)

    # Use baseline model to get CV data on test set
    n_test = X_test.shape[0]
    # Go through .values: pandas Series no longer implements reshape().
    X_test_base = X_test.loc[:, 'ln_cum_char'].values.reshape((n_test, 1))
    baseline_test_scores = cross_validation.cross_val_score(
        base_estimator, X_test_base, y_test,
        scoring='mean_squared_error', cv=10)

    # Use best model to get CV data on test set
    feature_list = ('ln_cum_char', 'percent_seen', 'mean_days_since',
                    'mean_term_freq', 'norm_t1', 'norm_t2', 'norm_t3')
    # A list (not a tuple) so .loc selects several columns rather than
    # treating the tuple as a single multi-level key.
    X_test_sub = X_test.loc[:, list(feature_list)]
    best_test_scores = cross_validation.cross_val_score(
        best_estimator, X_test_sub, y_test,
        scoring='mean_squared_error', cv=10)

    # Calculate statistics to compare samples from baseline and best model
    p_base_normality = normaltest(baseline_test_scores)[1]
    p_best_normality = normaltest(best_test_scores)[1]
    corr_p_value = pearsonr(baseline_test_scores, best_test_scores)
    t_P_value = ttest_ind(baseline_test_scores, best_test_scores)[1]
    print("Normality test for baseline CV MSE gives a p-value of %0.4f" % p_base_normality)
    print("Normality test for best model's CV MSE gives a p-value of %0.4f" % p_best_normality)
    print('''The Pearson correlation coefficient between the baseline and best model scores is %0.4F, and the correlation p-value is %0.4F''' % (
        corr_p_value[0], corr_p_value[1]))
    print("t-test for independece between baseline and best model gives a p-value of %0.4f" % t_P_value)

    # Baseline MSE on the test set for the model fitted on the training set
    y_test_base = base_estimator.predict(X_test_base)
    MSE_base = mean_squared_error(y_test, y_test_base)
    print("The non-CV MSE for the baseline is %0.4f" % MSE_base)

    # Best MSE on test set
    y_test_best = best_estimator.predict(X_test_sub)
    MSE_best = mean_squared_error(y_test, y_test_best)
    print("The non-CV MSE for the best model is %0.4f" % MSE_best)
    return (y_test_base, y_test_best)
def plot_effort_density(data, efforts):
    """Report and scatter-plot normalized density against wasted effort.

    :param data: iterable of dicts with ``parent`` and
        ``normalized_density`` keys
    :param efforts: mapping of class name to average wasted effort
    """
    def density_of(class_name):
        # First record whose parent is the class; IndexError if none.
        matching = [record for record in data
                    if record['parent'] == class_name]
        return float(matching[0]['normalized_density'])

    pairs = [(density_of(name), effort) for name, effort in efforts.items()]
    x, y = zip(*pairs)

    print("Normal test normalized density:", mst.normaltest(x))
    print("Normal test effort:", mst.normaltest(y))
    print("[Pearson]", st.pearsonr(x, y))
    print("[Spearman]", st.spearmanr(x, y))

    plt.scatter(x, y)
    plt.xlabel('Normalized density')
    plt.ylabel('Average wasted effort')
    plt.title('Normalized density vs. average wasted effort')
    plt.grid(True)
    plt.show()
def isNormalDistribution(df, alpha, shapiro=True):
    """Print, per column, whether normality is rejected at level ``alpha``.

    :param df: DataFrame whose columns are all tested
    :param alpha: significance level for rejecting the null hypothesis
        (the column is normally distributed)
    :param shapiro: use the Shapiro-Wilk test when True, otherwise the
        D'Agostino & Pearson test
    :return: None; a final message is printed when every column was rejected
    """
    print("\nChecking if the columns follow a normal distribution by d'Agostino & Pearson or Shpapiro test...\n")
    columns = list(df.columns.values)
    count = 0
    for i in columns:
        # Only run the test that was asked for.
        if shapiro:
            _, p_value = ss.shapiro(df[i])
        else:
            _, p_value = mstats.normaltest(df[i])
        if p_value < alpha:
            print(" The null hypothesis can be rejected; Column: ", i, "\n")
            count += 1
        else:
            print(" The null hypothesis can not be rejected; Column: ", i, "\n")
    # Every column was rejected, i.e. none of them looks normal.
    if count == len(columns):
        print("\n\n No column follows a normal distribution\n")
def spearman_with_errors(x, y, yerr, Nmc=1000, plotflag=False, verbose=False):
    """Monte-Carlo estimate of the Spearman correlation given errors on y.

    ``y`` is resampled ``Nmc`` times from a normal distribution centred on
    ``y`` with standard deviation ``yerr``; the Spearman rho and p-value of
    every realisation are collected and summary statistics are printed.

    :param x: independent values
    :param y: dependent values
    :param yerr: 1-sigma uncertainty of ``y`` (scalar or same length as y)
    :param Nmc: number of Monte-Carlo realisations
    :param plotflag: when True, draw histograms of rho and log10(p)
    :param verbose: unused, kept for interface compatibility
    :return: ``(rhosim, psim)`` float32 arrays of length ``Nmc``
    """
    rhosim = np.zeros(Nmc, 'f')
    psim = np.zeros(Nmc, 'f')
    for i in range(Nmc):
        ysim = np.random.normal(y, scale=yerr, size=len(y))
        rhosim[i], psim[i] = spearmanr(x, ysim)

    cave = np.mean(rhosim)
    cstd = np.std(rhosim)
    lower = np.percentile(rhosim, 50 - 34)  # median minus one sigma
    upper = np.percentile(rhosim, 50 + 34)  # median plus one sigma
    print('mean (median) = %5.2f (%5.2f), std = %5.2f' % (
        cave, np.median(rhosim), cstd))
    print('confidence interval from sorted list of MC fit values:')
    print('lower = %5.2f (%5.2f), upper = %5.2f (%5.2f)' % (
        lower, cave - cstd, upper, cave + cstd))
    k, pnorm = normaltest(rhosim)
    print('probability that distribution of slopes is normal = %5.2f' % (pnorm))

    if plotflag:
        plt.figure(figsize=(10, 4))
        plt.subplot(1, 2, 1)
        # ``density`` replaces the ``normed`` keyword removed from matplotlib.
        plt.hist(rhosim, bins=10, density=True)
        plt.xlabel(r'$Spearman \ \rho $')
        plt.axvline(x=cave, ls='-', color='k')
        plt.axvline(x=lower, ls='--', color='k')
        plt.axvline(x=upper, ls='--', color='k')
        plt.subplot(1, 2, 2)
        plt.hist(np.log10(psim), bins=10, density=True)
        plt.xlabel(r'$\log_{10}(p \ value)$')
    return rhosim, psim
def get_stationarity_statistics(df):
    """
    Collect stationarity and normality statistics for the data passed in.

    :param df: data handed unchanged to every test
    :return: 9-tuple ``(adfstat, pvalue, critvalues, resstore,
        dagostino_results, shapiro_results, ks_results, anderson_results,
        kpss_results)``
    """
    # Augmented Dickey-Fuller; with regresults=True four values come back.
    adf_outcome = adfuller(df, regression="nc", store=True, regresults=True)
    adfstat, pvalue, critvalues, resstore = adf_outcome

    # Normality checks, evaluated in the original order:
    # D'Agostino-Pearson, Shapiro-Wilk, Kolmogorov-Smirnov, Anderson-Darling.
    normality_results = (
        normaltest(df),
        shapiro(df),
        kstest(df, cdf='norm'),
        anderson(df),
    )

    # Kwiatkowski-Phillips-Schmidt-Shin (a stationarity test).
    kpss_results = KPSS(df)

    return (adfstat, pvalue, critvalues, resstore) + normality_results + (kpss_results,)
def testNormality():
    """Run ``normaltest`` on the values of every entry named in ``names``.

    :return: ``(ks, ps)`` lists of test statistics and p-values, in the
        order of ``names``
    """
    ks = []
    ps = []
    for name in names:
        statistic, p_value = normaltest(dataDict[name].values)
        ks.append(statistic)
        ps.append(p_value)
    return ks, ps
def test_maskedarray_input(self): # Add some masked values, test result doesn't change x = np.array((-2, -1, 0, 1, 2, 3) * 4) ** 2 xm = np.ma.array(np.r_[np.inf, x, 10], mask=np.r_[True, [False] * x.size, True]) assert_allclose(mstats.normaltest(xm), stats.normaltest(x)) assert_allclose(mstats.skewtest(xm), stats.skewtest(x)) assert_allclose(mstats.kurtosistest(xm), stats.kurtosistest(x))
def spearman_with_errors(x, y, yerr, Nmc=1000, plotflag=False, verbose=False):
    """Monte-Carlo estimate of the Spearman correlation given errors on y.

    ``y`` is resampled ``Nmc`` times from a normal distribution centred on
    ``y`` with standard deviation ``yerr``; the Spearman rho and p-value of
    every realisation are collected and summary statistics are printed.

    :param x: independent values
    :param y: dependent values
    :param yerr: 1-sigma uncertainty of ``y`` (scalar or same length as y)
    :param Nmc: number of Monte-Carlo realisations
    :param plotflag: when True, draw histograms of rho and log10(p)
    :param verbose: unused, kept for interface compatibility
    :return: ``(rhosim, psim)`` float32 arrays of length ``Nmc``
    """
    rhosim = np.zeros(Nmc, 'f')
    psim = np.zeros(Nmc, 'f')
    for i in range(Nmc):
        ysim = np.random.normal(y, scale=yerr, size=len(y))
        rhosim[i], psim[i] = spearmanr(x, ysim)

    cave = np.mean(rhosim)
    cstd = np.std(rhosim)
    lower = np.percentile(rhosim, 50 - 34)  # median minus one sigma
    upper = np.percentile(rhosim, 50 + 34)  # median plus one sigma
    print('mean (median) = %5.2f (%5.2f), std = %5.2f' % (
        cave, np.median(rhosim), cstd))
    print('confidence interval from sorted list of MC fit values:')
    print('lower = %5.2f (%5.2f), upper = %5.2f (%5.2f)' % (
        lower, cave - cstd, upper, cave + cstd))
    k, pnorm = normaltest(rhosim)
    print('probability that distribution of slopes is normal = %5.2f' % (pnorm))

    if plotflag:
        plt.figure(figsize=(10, 4))
        plt.subplot(1, 2, 1)
        # ``density`` replaces the ``normed`` keyword removed from matplotlib.
        plt.hist(rhosim, bins=10, density=True)
        plt.xlabel(r'$Spearman \ \rho $')
        plt.axvline(x=cave, ls='-', color='k')
        plt.axvline(x=lower, ls='--', color='k')
        plt.axvline(x=upper, ls='--', color='k')
        plt.subplot(1, 2, 2)
        plt.hist(np.log10(psim), bins=10, density=True)
        plt.xlabel(r'$\log_{10}(p \ value)$')
    return rhosim, psim
def plot_effort_num_of_components(data, efforts):
    """Report and scatter-plot number of components against wasted effort.

    :param data: iterable of dicts with ``parent`` and
        ``number_of_components`` keys
    :param efforts: mapping of class name to average wasted effort
    """
    def component_count(class_name):
        # First record whose parent is the class; IndexError if none.
        matching = [record for record in data
                    if record['parent'] == class_name]
        return float(matching[0]['number_of_components'])

    pairs = [(component_count(name), effort)
             for name, effort in efforts.items()]
    x, y = zip(*pairs)

    print("Normal test number_of_components:", mst.normaltest(x))
    print("Normal test effort:", mst.normaltest(y))
    print("[Pearson]", st.pearsonr(x, y))
    print("[Spearman]", st.spearmanr(x, y))

    plt.scatter(x, y)
    plt.xlabel('number_of_components')
    plt.ylabel('Average wasted effort')
    plt.title('number_of_components vs. average wasted effort')
    plt.grid(True)
    plt.show()
def normaltest_data(category):
    """Print whether the rating data of ``category`` looks normal.

    The data is loaded with ``load_rating_data`` and checked with
    ``mstats.normaltest``; normality is rejected at the 1% level.

    :param category: category name passed to ``load_rating_data``
    """
    data, population = load_rating_data(category)
    z, pval = mstats.normaltest(data)
    print(category + " p value is " + str(pval))
    if pval < 0.01:
        print("Not normal distribution")
    else:
        print("normal")
def normaltest_data(category):
    """Print whether the rating data of ``category`` looks normal.

    The data is loaded with ``load_rating_data`` and checked with
    ``mstats.normaltest``; normality is rejected at the 1% level.

    :param category: category name passed to ``load_rating_data``
    """
    data, population = load_rating_data(category)
    z, pval = mstats.normaltest(data)
    print(category + " p value is " + str(pval))
    if pval < 0.01:
        print("Not normal distribution")
    else:
        print("normal")
def test_maskedarray_input(self): # Add some masked values, test result doesn't change x = np.array((-2,-1,0,1,2,3)*4)**2 xm = np.ma.array(np.r_[np.inf, x, 10], mask=np.r_[True, [False] * x.size, True]) assert_allclose(mstats.normaltest(xm), stats.normaltest(x)) assert_allclose(mstats.skewtest(xm), stats.skewtest(x)) assert_allclose(mstats.kurtosistest(xm), stats.kurtosistest(x))
def is_normal(a):
    """Return False when ``mstats.normaltest`` rejects normality at 5%.

    :param a: sample to test (array-like)
    :return: True unless the p-value is below 0.05
    """
    from scipy.stats import mstats

    _, pval = mstats.normaltest(a)
    return not (pval < 0.05)
def is_normal(a):
    """Return False when ``mstats.normaltest`` rejects normality at 5%.

    :param a: sample to test (array-like)
    :return: True unless the p-value is below 0.05
    """
    from scipy.stats import mstats

    _, pval = mstats.normaltest(a)
    return not (pval < 0.05)
def effort_correlation(data, efforts):
    """Print normality tests and correlations of three metrics vs. effort.

    Classes whose normalized density, diversity or uniqueness is falsy are
    left out.

    :param data: iterable of dicts with ``parent``, ``normalized_density``,
        ``diversity`` and ``uniqueness`` keys
    :param efforts: mapping of class name to effort value
    """
    rows = []
    for class_name, percentage in efforts.items():
        # First record whose parent is the class; IndexError if none.
        record = [entry for entry in data if entry['parent'] == class_name][0]
        density = record['normalized_density']
        diversity = record['diversity']
        uniqueness = record['uniqueness']
        if not (density and diversity and uniqueness):
            continue
        rows.append((float(density), float(diversity), float(uniqueness),
                     percentage))

    nd, d, u, e = zip(*rows)
    metrics = (("density", nd), ("diversity", d), ("uniqueness", u))

    for label, values in metrics + (("effort", e),):
        print("Normal test %s:" % label, mst.normaltest(values))
    for label, values in metrics:
        print("[Spearman] %s vs. effort" % label, st.spearmanr(values, e))
    for label, values in metrics:
        print("[Pearson] %s vs. effort" % label, st.pearsonr(values, e))
def plot_uniqueness_vs_num_of_components(data):
    """Report and scatter-plot uniqueness against a per-class count.

    Rows with a falsy uniqueness or count are dropped.

    :param data: input handed to ``_get_column``
    """
    uniqueness = _get_column(data, 'uniqueness')
    # NOTE(review): the column read here is 'number_of_tests' although every
    # label below says "components" - confirm which one is intended.
    num_of_components = _get_column(data, 'number_of_tests')

    cleaned = [(float(raw_u), int(raw_c))
               for raw_u, raw_c in zip(uniqueness, num_of_components)
               if raw_u and raw_c]
    u, c = zip(*cleaned)

    print("Normal test uniqueness:", mst.normaltest(u))
    print("Normal test components:", mst.normaltest(c))
    print("[Pearson]", st.pearsonr(u, c))
    print("[Spearman]", st.spearmanr(u, c))

    plt.scatter(u, c)
    plt.xlabel('Uniqueness')
    plt.ylabel('Number of components')
    plt.title('Uniqueness vs. number of components')
    plt.grid(True)
    plt.show()
def test_vs_nonmasked(self): x = np.array((-2, -1, 0, 1, 2, 3) * 4) ** 2 assert_array_almost_equal(mstats.normaltest(x), stats.normaltest(x)) assert_array_almost_equal(mstats.skewtest(x), stats.skewtest(x)) assert_array_almost_equal(mstats.kurtosistest(x), stats.kurtosistest(x)) funcs = [stats.normaltest, stats.skewtest, stats.kurtosistest] mfuncs = [mstats.normaltest, mstats.skewtest, mstats.kurtosistest] x = [1, 2, 3, 4] for func, mfunc in zip(funcs, mfuncs): assert_raises(ValueError, func, x) assert_raises(ValueError, mfunc, x)
def plot_effort_ddu(data, efforts):
    """Report and scatter-plot DDU against wasted effort with a linear fit.

    :param data: iterable of dicts with ``parent`` and ``ddu`` keys
    :param efforts: mapping of class name to average wasted effort
    """
    def ddu_of(class_name):
        # First record whose parent is the class; IndexError if none.
        matching = [record for record in data
                    if record['parent'] == class_name]
        return float(matching[0]['ddu'])

    pairs = [(ddu_of(name), effort) for name, effort in efforts.items()]
    x, y = zip(*pairs)

    print("Normal test DDU:", mst.normaltest(x))
    print("Normal test effort:", mst.normaltest(y))
    print("[Pearson]", st.pearsonr(x, y))
    print("[Spearman]", st.spearmanr(x, y))

    plt.scatter(x, y)
    plt.xlabel('DDU')
    plt.ylabel('Average wasted effort')
    plt.title('DDU vs. average wasted effort')
    plt.grid(True)
    plt.xlim(0, 1.0)
    plt.ylim(0, 1.0)

    # Overlay the least-squares straight line.
    trend = numpy.poly1d(numpy.polyfit(x, y, 1))
    plt.plot(x, trend(x), "r-")
    plt.show()
def plot_uniqueness_and_tests(data):
    """
    Test correlation between uniqueness and number of tests only for
    classes that have two or more components.

    Rows with a falsy uniqueness, component count or test count are dropped
    before the filter on components is applied.

    :param data: input handed to ``_get_column``
    """
    uniqueness = _get_column(data, 'uniqueness')
    components = _get_column(data, 'number_of_components')
    tests = _get_column(data, 'number_of_tests')

    rows = [(float(u), int(c), int(t))
            for u, c, t in zip(uniqueness, components, tests)
            if u and c and t]
    # Components are only used as a filter; tests are what gets correlated.
    rows = [(u, t) for u, c, t in rows if c > 1]
    u, t = zip(*rows)

    # Labels name the number of tests, which is the quantity plotted.
    print("Normal test uniqueness:", mst.normaltest(u))
    print("Normal test tests:", mst.normaltest(t))
    print("[Pearson]", st.pearsonr(u, t))
    print("[Spearman]", st.spearmanr(u, t))

    plt.scatter(u, t)
    plt.xlabel('Uniqueness')
    plt.ylabel('Number of tests')
    plt.title('Uniqueness vs. number of tests')
    plt.grid(True)
    plt.show()
def compare_best_to_baseline(X_train, y_train, X_test, y_test, base_estimator,
                             best_estimator):
    """Compare the baseline model to the best predictor on the test set.

    Runs 10-fold cross-validation of both estimators on the test data,
    prints normality tests of the fold scores, their Pearson correlation and
    an independent t-test, then prints the plain (non-CV) test MSE of each
    already-fitted estimator.

    :param X_train: unused, kept for interface compatibility
    :param y_train: unused, kept for interface compatibility
    :param X_test: DataFrame holding at least the columns in ``feature_list``
    :param y_test: test targets
    :param base_estimator: fitted baseline model using only ``ln_cum_char``
    :param best_estimator: fitted model using all columns in ``feature_list``
    :return: ``(y_test_base, y_test_best)`` predictions of both models
    """
    # Seed the generator; assigning to ``random.seed`` would replace the
    # function instead of seeding.
    random.seed(1)

    # Use baseline model to get CV data on test set
    n_test = X_test.shape[0]
    # Go through .values: pandas Series no longer implements reshape().
    X_test_base = X_test.loc[:, 'ln_cum_char'].values.reshape((n_test, 1))
    baseline_test_scores = cross_validation.cross_val_score(
        base_estimator, X_test_base, y_test,
        scoring='mean_squared_error', cv=10)

    # Use best model to get CV data on test set
    feature_list = ('ln_cum_char', 'percent_seen', 'mean_days_since',
                    'mean_term_freq', 'norm_t1', 'norm_t2', 'norm_t3')
    # A list (not a tuple) so .loc selects several columns rather than
    # treating the tuple as a single multi-level key.
    X_test_sub = X_test.loc[:, list(feature_list)]
    best_test_scores = cross_validation.cross_val_score(
        best_estimator, X_test_sub, y_test,
        scoring='mean_squared_error', cv=10)

    # Calculate statistics to compare samples from baseline and best model
    p_base_normality = normaltest(baseline_test_scores)[1]
    p_best_normality = normaltest(best_test_scores)[1]
    corr_p_value = pearsonr(baseline_test_scores, best_test_scores)
    t_P_value = ttest_ind(baseline_test_scores, best_test_scores)[1]
    print("Normality test for baseline CV MSE gives a p-value of %0.4f" % p_base_normality)
    print("Normality test for best model's CV MSE gives a p-value of %0.4f" % p_best_normality)
    print('''The Pearson correlation coefficient between the baseline and best model scores is %0.4F, and the correlation p-value is %0.4F''' % (
        corr_p_value[0], corr_p_value[1]))
    print("t-test for independece between baseline and best model gives a p-value of %0.4f" % t_P_value)

    # Baseline MSE on the test set for the model fitted on the training set
    y_test_base = base_estimator.predict(X_test_base)
    MSE_base = mean_squared_error(y_test, y_test_base)
    print("The non-CV MSE for the baseline is %0.4f" % MSE_base)

    # Best MSE on test set
    y_test_best = best_estimator.predict(X_test_sub)
    MSE_best = mean_squared_error(y_test, y_test_best)
    print("The non-CV MSE for the best model is %0.4f" % MSE_best)
    return (y_test_base, y_test_best)
def test_vs_nonmasked(self): x = np.array((-2,-1,0,1,2,3)*4)**2 assert_array_almost_equal(mstats.normaltest(x), stats.normaltest(x)) assert_array_almost_equal(mstats.skewtest(x), stats.skewtest(x)) assert_array_almost_equal(mstats.kurtosistest(x), stats.kurtosistest(x)) funcs = [stats.normaltest, stats.skewtest, stats.kurtosistest] mfuncs = [mstats.normaltest, mstats.skewtest, mstats.kurtosistest] x = [1, 2, 3, 4] for func, mfunc in zip(funcs, mfuncs): assert_raises(ValueError, func, x) assert_raises(ValueError, mfunc, x)
def test_normality(self, data): """ Tests whether a sample differs from a normal distribution. Returns a 2-tuple of the chi-squared statistic, and the associated p-value. Given the null hypothesis that x came from a normal distribution, If the p-val is very small (alpha level of 0.05 normally), it means it is unlikely that the data came from a normal distribution. Other possible way: https://docs.scipy.org/doc/scipy-0.15.1/reference/generated/scipy.stats.chisquare.html """ # equivalent: print stats.normaltest(data) print "z value and p value: "#, z, pval z,pval = mstats.normaltest(data) if(pval < 0.05): print "Not normal distribution" return z, pval
def run_correlation(df):
    """Compare every feature between the two levels of every intention.

    For each ``intent_current_<intention>`` column the rows with value 0 and
    1 form two groups per feature. If both groups pass ``mstats.normaltest``
    at ``P_SIGNIFICANT`` a one-way ANOVA is run, otherwise Kruskal-Wallis.

    :param df: DataFrame with the feature and intention columns
    :return: list of dicts with keys ``feature``, ``intention``, ``test``,
        ``statistic``, ``p``, ``mean_0`` and ``mean_1``
    """
    results = []
    for intention in INTENTION_COLUMNS:
        for feature in FEATURES:
            flag = "intent_current_" + intention
            group1 = df[df[flag] == 0][feature].tolist()
            group2 = df[df[flag] == 1][feature].tolist()

            _, p_group1 = mstats.normaltest(group1)
            _, p_group2 = mstats.normaltest(group2)
            are_norm = (p_group1 > P_SIGNIFICANT) and (p_group2 > P_SIGNIFICANT)

            if are_norm:
                test_name = 'One-way ANOVA'
                s, p = stats.f_oneway(group1, group2)
            else:
                test_name = 'Kruskal-Wallis'
                s, p = mstats.kruskalwallis(group1, group2)

            results.append({
                'feature': feature,
                'intention': intention,
                'test': test_name,
                'statistic': s,
                'p': p,
                'mean_0': np.mean(group1),
                'mean_1': np.mean(group2),
            })
    return results
def run_correlation(df, feature, outcome):
    """Test whether ``feature`` differs between the levels of ``outcome``.

    The rows are grouped by outcome level. If every group passes
    ``mstats.normaltest`` at the 5% level a parametric test is used (t-test
    for up to two groups, one-way ANOVA otherwise), else a rank-based one
    (Mann-Whitney U for up to two groups, Kruskal-Wallis otherwise).

    :param df: DataFrame holding both columns
    :param feature: name of the numeric column to compare
    :param outcome: name of the grouping column
    :return: dict with ``test``, ``statistic``, ``p`` and one ``mean_<n>``
        entry per group; ``n`` follows the order in which the outcome levels
        first appear in ``df``
    """
    P_SIGNIFICANT = .05

    # unique() keeps first-appearance order, so mean_0, mean_1, ... always
    # refer to the same outcome level (set iteration order is arbitrary).
    outcomes = list(df[outcome].unique())
    n_outcomes = len(outcomes)
    groups = [df[df[outcome] == oc][feature].tolist() for oc in outcomes]

    are_norm = True
    for g in groups:
        (s, p) = mstats.normaltest(g)
        are_norm = are_norm and (p > P_SIGNIFICANT)

    result = {}
    if are_norm:
        if n_outcomes <= 2:
            (s, p) = stats.ttest_ind(groups[0], groups[1])
            result['test'] = 't-test'
        else:
            (s, p) = stats.f_oneway(*groups)
            result['test'] = 'One-way ANOVA'
    else:
        if n_outcomes <= 2:
            (s, p) = stats.mannwhitneyu(groups[0], groups[1])
            result['test'] = 'Mann-Whitney'
        else:
            (s, p) = mstats.kruskalwallis(*groups)
            result['test'] = 'Kruskal-Wallis'

    result['statistic'] = s
    result['p'] = p
    for n, g in enumerate(groups):
        result['mean_%d' % n] = np.mean(g)
    return result
def plot_distribution_along_axis(X_embedded, X, axes):
    """Histogram the embedded records projected onto each axis.

    For every axis a 50-bin histogram of the projections is drawn as a bar
    chart, a curve fitted with ``gaus`` is overlaid, the ``normaltest``
    result is printed and the figure is shown.

    :param X_embedded: embedded records, one vector per record
    :param X: original records; only used to pair up with ``X_embedded``
    :param axes: iterable of direction vectors to project onto
    """
    for axis in axes:
        nr_categories = 1
        colors = cm.rainbow(np.linspace(0, 1, nr_categories))
        for category_index, c in zip(range(nr_categories), colors):
            # zip() stops at the shorter of the two inputs.
            x_projected = [np.dot(axis, embedded)
                           for embedded, _record in zip(X_embedded, X)]

            hist, bins = np.histogram(x_projected, bins=50)
            bar_width = 0.7 * (bins[1] - bins[0])
            center = (bins[:-1] + bins[1:]) / 2
            plt.bar(center, hist, align='center', width=bar_width)

            popt, pcov = curve_fit(gaus, center, hist, p0=[1.0, 0.0, 1.0])
            fitted = gaus(center, *popt)
            plt.plot(center, fitted, color='red', linewidth=2)

            print(normaltest(x_projected))
            plt.show()
def generate_histogram(df, columns, normality_value):
    """Plot the reduced column against a normal curve and test normality.

    Every row is reduced to one value with ``column_reduction`` and stored
    in a new ``red_col`` column of ``df`` itself (the frame is modified in
    place and returned).

    :param df: input DataFrame; gains a ``red_col`` column
    :param columns: unused, kept for interface compatibility
    :param normality_value: unused, kept for interface compatibility
    :return: ``(similarity_value, df)`` where ``similarity_value`` is the
        ``normaltest`` statistic of ``red_col``
    """
    # Column reduction (same object as df, not a copy)
    df_col_reduction = df
    df_col_reduction['red_col'] = df_col_reduction.apply(column_reduction, axis=1)

    # Perform normality test
    normality_result = normaltest(df_col_reduction['red_col'])
    similarity_value = normality_result[0]
    pvalue = normality_result[1]

    print("Histogram for the data set.")
    histogram_df = df_col_reduction['red_col']
    plt.figure(figsize=__FIG_SIZE__)
    plt.gcf().clear()
    # ``density`` replaces the ``normed`` keyword removed from matplotlib.
    histogram_df.hist(density=True)
    histogram_df.plot(kind='kde', linewidth=2,
                      color='r', label='Distribution Of Dataset')

    # Normal curve with the sample mean and standard deviation on [-3, 3].
    grid = np.linspace(-3, 3, len(histogram_df))
    norm_fit = stats.norm.pdf(grid, np.mean(histogram_df), np.std(histogram_df))
    plt.plot(grid, norm_fit, label="Normal Distribution", color='k', linewidth=2)

    plt.xlabel("Dataset Distribution")
    plt.ylabel("Frequency")
    plt.title("Similarity to normal distribution: " + str(similarity_value) + ", pvalue: " + str(pvalue))
    plt.legend()
    plt.show()
    return similarity_value, df_col_reduction
def __init__(self, values, stdDevs):
    """Parse the raw values and compute summary statistics and outliers.

    Empty strings and entries that cannot be parsed are skipped. Strings
    containing a "." are parsed as float, all others as int.

    :param values: iterable of raw string values
    :param stdDevs: number of standard deviations beyond which a value
        counts as an outlier (anything ``Decimal`` accepts)
    """
    new_values = []
    for i in values:
        if i != '':
            try:
                if "." in i:
                    new_values.append(float(i))
                else:
                    new_values.append(int(i))
            except (TypeError, ValueError):
                pass  # already picked up by error checks
    values = new_values
    super().__init__(values)
    self.stDevOutliers = []
    standardDeviations = Decimal(stdDevs)

    # With fewer than 8 values no test is run; 100 marks "not available".
    if len(values) >= 8:
        self.pval = mstats.normaltest(array(values))[1]
    else:
        self.pval = 100

    self.min = min(values)
    self.max = max(values)
    self.mean = Decimal(mean(values)).quantize(Decimal('.00000'))
    self.median_low = median_low(values)
    self.median = median(values)
    self.median_high = median_high(values)
    self.stdev = Decimal(stdev(values)).quantize(Decimal('.00'))

    # Check the sentinel first: 100 is also > 0.055, so testing it second
    # made the 'N/A' branch unreachable.
    if self.pval == 100:
        self.normDist = 'N/A'
    elif self.pval > 0.055:
        self.normDist = 'Yes'
    else:
        self.normDist = 'No'

    if self.normDist == 'Yes':
        lower_bound = self.mean - standardDeviations * self.stdev
        upper_bound = self.mean + standardDeviations * self.stdev
        outlier_count = 0
        for x, value in enumerate(values):
            if value < lower_bound or value > upper_bound:
                # Too many outliers: replace the list with a summary string.
                if outlier_count > max_Outliers:
                    self.stDevOutliers = ">%d outliers" % max_Outliers
                    break
                self.stDevOutliers.append("Row: %d Value: %s" % (x, value))
                outlier_count += 1
def __init__(self, values, stdDevs):
    """Parse the raw values as floats and compute summary statistics.

    Empty strings and entries that cannot be parsed are skipped. The
    statistics are stored after conversion with ``self.int_to_sci``.

    :param values: iterable of raw values
    :param stdDevs: number of standard deviations beyond which a value
        counts as an outlier
    """
    standardDeviations = stdDevs
    new_values = []
    for i in values:
        if i != '':
            try:
                new_values.append(float(i))
            except (TypeError, ValueError):
                pass  # already picked up in error checks
    values = new_values
    super().__init__(values)
    self.stDevOutliers = []

    # With fewer than 8 values no test is run; 100 marks "not available".
    if len(values) >= 8:
        self.pval = mstats.normaltest(array(values))[1]
    else:
        self.pval = 100

    if self.mode != 'N/A':
        self.mode = self.int_to_sci(self.mode)
    self.min = self.int_to_sci(min(values))
    self.max = self.int_to_sci(max(values))
    self.mean = self.int_to_sci(mean(values))
    self.median_low = self.int_to_sci(median_low(values))
    self.median = self.int_to_sci(median(values))
    self.median_high = self.int_to_sci(median_high(values))
    self.stdev = self.int_to_sci(stdev(values))

    # The null hypothesis of normaltest is "the data is normal", so a
    # p-value *above* the threshold means normality is not rejected.
    if self.pval == 100:
        self.normDist = 'N/A'
    elif self.pval > 0.055:
        self.normDist = 'Yes'
    else:
        self.normDist = 'No'

    if self.normDist == 'Yes':
        center = float(self.mean)
        spread = standardDeviations * float(self.stdev)
        outlier_count = 0
        for x, value in enumerate(values):
            if value < (center - spread) or value > (center + spread):
                # Too many outliers: replace the list with a summary string.
                if outlier_count > max_Outliers:
                    self.stDevOutliers = ">%d outliers" % max_Outliers
                    break
                self.stDevOutliers.append("Row: %d Value: %s" % (x, value))
                outlier_count += 1
def bivariateNormalTest(self, df):
    """Print a normality test of two pooled columns and walk column pairs.

    The third and fourth entries of ``COMMON_COLUMNS`` are concatenated and
    passed to ``normaltest``. An n-by-n matrix is allocated and every pair
    of columns is stacked into a temporary DataFrame, but nothing is stored
    or returned.
    NOTE(review): the pairwise loop looks unfinished - confirm the intent.

    :param df: DataFrame containing the ``COMMON_COLUMNS`` columns
    """
    # get the data
    n = len(COMMON_COLUMNS)

    # get columns
    first = np.array(df[COMMON_COLUMNS[2]])
    second = np.array(df[COMMON_COLUMNS[3]])
    print(first)
    print(second)

    pooled = np.append(first, second)
    print(pooled.shape)
    print(normaltest(pooled))

    # set up return matrix
    mat = np.empty([n, n])

    # iterate through matrix
    for row in range(n):
        for col in range(n):
            row_column = df[COMMON_COLUMNS[row]]
            col_column = df[COMMON_COLUMNS[col]]
            pooled = pd.DataFrame([row_column, col_column])
def test(log):
    """Print a normality test and iterated Grubbs outlier tests per hop.

    Hop statistics come from ``statistics(log)``. Only hops with a positive
    ``d_rtt_m`` are considered. The Grubbs test is repeated, dropping the
    largest value each time, until it no longer flags an outlier.

    :param log: input handed to ``statistics``
    """
    hop_stats = statistics(log)

    samples = [
        s for s in hop_stats
        if s and 'd_rtt_m' in s and s['d_rtt_m'] != '*' and s['d_rtt_m'] > 0
    ]
    samples_rtt = [s['d_rtt_m'] for s in samples]

    print("== Test de normalidad ==\n")
    print("p-value = {}\n".format(normaltest(samples_rtt)[1]))

    for k in range(len(samples)):
        print("MAX: {}".format(max(samples_rtt)))
        G, a, G_crit = grubbs(samples_rtt)

        # Hop owning the largest value that has not been removed yet.
        hop_to = max((s for s in samples if s['d_rtt_m'] in samples_rtt),
                     key=lambda s: s['d_rtt_m'])['ip']

        # Reset on every round so a missing predecessor prints as None
        # instead of raising NameError or reusing a stale value.
        hop_from = None
        for i in range(len(hop_stats) - 1):
            following = hop_stats[i + 1]
            if following and 'ip' in following and following['ip'] == hop_to:
                hop_from = hop_stats[i]['ip']

        print("""
== Test de outliers de Grubbs #{} ==
G = {}
a = {}
G_crit = {}
Hop: {} -> {}
""".format(k, G, a, G_crit, hop_from, hop_to))

        if G > G_crit:
            samples_rtt.remove(max(samples_rtt))
        else:
            break
def are_different(data, factor, metric, threshold=0.05):
    """Print pairwise significance tests of ``metric`` between factor levels.

    The rows of ``data["denormalized"]`` are grouped by ``factor``. The mean
    of every group is printed, each group is checked for normality at the 5%
    level, and every unordered pair of groups is compared once with Welch's
    t-test.

    :param data: mapping whose ``"denormalized"`` entry is a DataFrame
    :param factor: name of the grouping column
    :param metric: name of the column to compare
    :param threshold: significance level for the pairwise comparison
    """
    frame = data["denormalized"]
    values = frame[factor].unique()
    results = [frame.loc[frame[factor] == value][metric] for value in values]

    # Start from "parametric"; previously the flag was only assigned when a
    # group failed the check, so all-normal data raised UnboundLocalError.
    parametric = True
    for value, result in zip(values, results):
        print(value, result.mean())
        if mstats.normaltest(result)[1] < 0.05:
            parametric = False

    print()
    if parametric:
        print("Parametric test")
    else:
        print("NON Parametric test")
    print()

    groups = list(zip(values, results))
    for position, (value, result) in enumerate(groups):
        # Only later groups: every unordered pair is tested exactly once.
        for value2, result2 in groups[position + 1:]:
            # NOTE(review): Welch's t-test is used in both cases, as before;
            # no rank-based alternative is wired in for non-normal groups.
            z_stat, p_val = ttest_ind(result, result2, equal_var=False)
            if p_val < threshold:
                print("Statistically significant different results between %s and %s" % (value, value2))
            else:
                print("Statistically NON-significant different results between %s and %s" % (value, value2))
def plot_box_resids(fit_model, y_pred, subset=None):
    '''More than you ever wanted to know about your residuals

    Standardizes the residuals of ``fit_model`` and shows: a scatter plot
    against the prediction with a fitted line, box plots per turnout bucket,
    a Q-Q plot, four normality tests, a histogram against a normal curve and
    a histogram of the bucket sizes.

    :param fit_model: fitted model exposing a ``resid`` attribute
    :param y_pred: predicted values; exponentiated before bucketing when the
        minimum is below -1
    :param subset: optional fraction of the residuals to sample without
        replacement
    '''
    # Local import: replaces matplotlib.mlab.normpdf, which was removed.
    from scipy.stats import norm

    # Standardize with the standard deviation (dividing by the variance does
    # not give unit-variance residuals, which the tests below expect).
    s_resid = (fit_model.resid - np.mean(fit_model.resid)) /\
        np.sqrt(np.var(fit_model.resid))
    if subset:
        s_resid = np.random.choice(s_resid, replace=False,
                                   size=int(math.floor(len(s_resid) * subset)))

    df = pd.DataFrame(s_resid, columns=['resids'])
    temp_df = pd.DataFrame(y_pred, columns=['target'])
    df = df.join(temp_df)

    if min(y_pred) < -1:
        df['turnout_bucket'] = df['target']\
            .apply(lambda x: int(math.floor(10 * np.exp(x))))
        y = df['target'].apply(lambda x: np.exp(x))
    else:
        df['turnout_bucket'] = df['target']\
            .apply(lambda x: int(math.floor(10 * x)))
        y = df['target']
    posit = sorted(df['turnout_bucket'].unique())

    plt.scatter(y, s_resid, alpha=.2)
    slope, intercept = np.polyfit(y, s_resid, 1)
    plt.plot(y, np.poly1d((slope, intercept))(y))
    plt.title('Studentized Residuals vs Prediction')
    plt.xlabel('Predicted Value')
    plt.ylabel('Studentized Residual')
    print('Slope of best fit line: %s' % slope)
    plt.show()

    df[['resids', 'turnout_bucket']]\
        .boxplot(by='turnout_bucket', positions=posit, widths=.5)
    plt.title('Residuals versus Turnout')
    plt.xlabel('Turnout Bucket')
    plt.ylabel('Studentized Residuals')
    plt.suptitle('')
    plt.show()

    sm.qqplot(s_resid, line='s')
    plt.title('Q-Q Plot')
    plt.show()

    w, p_val = shapiro(s_resid)
    print('Shapiro-Wilk P_val is %s, larger the better' % p_val)
    k, p_val = normaltest(s_resid)
    print('D’Agostino and Pearson’s P_val is %s, larger the better' % p_val)
    k, p_val = kstest(s_resid, 'norm')
    print('Kolmogorov–Smirnov P_val is %s, larger the better' % p_val)
    A, critical, sig = anderson(s_resid)
    print('Anderson-Darling A2 is %s, smaller the better' % A)
    print(critical)
    print(sig)

    # ``density`` replaces the ``normed`` keyword removed from matplotlib.
    n, bins, patches = plt.hist(s_resid, 75, density=True)
    mu = np.mean(s_resid)
    sigma = np.std(s_resid)
    plt.plot(bins, norm.pdf(bins, mu, sigma))
    plt.title('Residuals versus a Normal Dist')
    plt.show()

    df['turnout_bucket'].hist(bins=posit, align='left', color='b')
    plt.title('Histogram of Turnout Bucket')
    plt.ylabel('Count')
    plt.xlim(-.5, -.5 + len(posit))
    temp = df[['resids', 'turnout_bucket']].groupby('turnout_bucket').count()
    temp.columns = ['Count']
    plt.show()
    print(temp)
# Finish styling both box plots in black with 1.5 pt lines.
plt.setp(r1['caps'], color='black', lw=1.5)
plt.setp(r1['medians'], color='black', lw=1.5)
plt.setp(r2['boxes'], color='black', lw=1.5)
plt.setp(r2['whiskers'], color='black', lw=1.5)
plt.setp(r2['caps'], color='black', lw=1.5)
plt.setp(r2['medians'], color='black', lw=1.5)
ax.set_ylabel('TOTAL EDDY AREA, IN METERS SQUARED')
# Thousands separators on the y axis.
ax.get_yaxis().set_major_formatter(tkr.FuncFormatter(lambda x, p: format(int(x), ',')))
plt.tight_layout()
plt.savefig(r"C:\workspace\Time_Series\Output\Joes_Figs\grouped_mc_area_boxplot.png", dpi=600)

from scipy.stats.mstats import normaltest, skewtest

# Normality and skewness of the long-term and the combined areas.
print('old ', normaltest(area_old))
print('combined ', normaltest(combined))
print('old ', skewtest(area_old))
print('combined ', skewtest(combined))

# Normal probability-plot data; no figure is drawn (plot=None).
a = probplot(area_old, dist='norm', plot=None)
b = probplot(combined, dist='norm', plot=None)

colors = {'r': 'red', 's': 'blue', 'u': 'green'}
markers = {'r': '*', 's': 'x', 'u': 'o'}

old_df = pd.DataFrame(area_old, columns=['Long Term Sites: N=12'])
old_df['Bar_Type'] = lt_bt
old_df = old_df.sort_values(by='Long Term Sites: N=12')
# First element of the first probplot result, assigned to the sorted rows.
old_df['quart'] = a[0][0]
def plot_box_resids(fit_model, y_pred, subset=None):
    '''Plot and print a battery of diagnostics for a model's residuals.

    The residuals are centred and scaled to unit standard deviation, then
    shown as: a scatter against the predictions with a least-squares line,
    a box plot per prediction bucket, a Q-Q plot, four normality tests
    (Shapiro-Wilk, D'Agostino-Pearson, Kolmogorov-Smirnov and
    Anderson-Darling), a histogram against a normal density, and a
    histogram plus count table of the buckets.

    :param fit_model: fitted model exposing its residuals as ``resid``
    :param y_pred: predictions, one per residual and in the same order.
        When the smallest value is below -1 they are exponentiated before
        plotting and bucketing (presumably log-scale predictions -- confirm
        with the caller)
    :param subset: optional fraction of the observations to sample at
        random, without replacement, before plotting; a falsy value keeps
        every observation
    :returns: None; all output goes to Matplotlib figures and stdout
    '''
    resid = np.asarray(fit_model.resid, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float).ravel()

    # Scale by the standard deviation, not the variance: the
    # Kolmogorov-Smirnov test below compares against a unit-variance normal.
    s_resid = (resid - np.mean(resid)) / np.std(resid)

    if subset:
        # Sample positions rather than values so that every retained
        # residual stays paired with its own prediction.
        n_keep = int(math.floor(len(s_resid) * subset))
        keep = np.random.choice(len(s_resid), replace=False, size=n_keep)
        s_resid = s_resid[keep]
        y_pred = y_pred[keep]

    df = pd.DataFrame({'resids': s_resid, 'target': y_pred})
    if y_pred.min() < -1:
        y = np.exp(df['target'])
    else:
        y = df['target']
    # Buckets are tenths of the (back-transformed) prediction.
    df['turnout_bucket'] = np.floor(10 * y).astype(int)
    posit = sorted(df['turnout_bucket'].unique())

    # Residuals against predictions, with a first-degree least-squares fit.
    plt.scatter(y, s_resid, alpha=.2)
    line_coeffs = np.polyfit(y, s_resid, 1)
    plt.plot(y, np.poly1d(line_coeffs)(y))
    plt.title('Studentized Residuals vs Prediction')
    plt.xlabel('Predicted Value')
    plt.ylabel('Studentized Residual')
    print('Slope of best fit line: %s' % line_coeffs[0])
    plt.show()

    # One box per bucket, drawn at the bucket's own x position.
    df[['resids', 'turnout_bucket']].boxplot(
        by='turnout_bucket', positions=posit, widths=.5)
    plt.title('Residuals versus Turnout')
    plt.xlabel('Turnout Bucket')
    plt.ylabel('Studentized Residuals')
    plt.suptitle('')  # drop the automatic "grouped by" super-title
    plt.show()

    sm.qqplot(s_resid, line='s')
    plt.title('Q-Q Plot')
    plt.show()

    # Normality tests; only the p-values (or the A2 statistic) are reported.
    _, p_val = shapiro(s_resid)
    print('Shapiro-Wilk P_val is %s, larger the better' % p_val)
    _, p_val = normaltest(s_resid)
    print('D’Agostino and Pearson’s P_val is %s, larger the better' % p_val)
    _, p_val = kstest(s_resid, 'norm')
    print('Kolmogorov–Smirnov P_val is %s, larger the better' % p_val)
    A, critical, sig = anderson(s_resid)
    print('Anderson-Darling A2 is %s, smaller the better' % A)
    print(critical)
    print(sig)

    # NOTE(review): `normed=` and `mlab.normpdf` are not available in recent
    # Matplotlib releases -- confirm the version this project pins.
    _, bins, _ = plt.hist(s_resid, 75, normed=1)
    mu = np.mean(s_resid)
    sigma = np.std(s_resid)
    plt.plot(bins, mlab.normpdf(bins, mu, sigma))
    plt.title('Residuals versus a Normal Dist')
    plt.show()

    df['turnout_bucket'].hist(bins=posit, align='left', color='b')
    plt.title('Histogram of Turnout Bucket')
    plt.ylabel('Count')
    plt.xlim(-.5, -.5 + len(posit))
    temp = df[['resids', 'turnout_bucket']].groupby('turnout_bucket').count()
    temp.columns = ['Count']
    plt.show()
    print(temp)
def test_normaltest_result_attributes(self):
    """Check that mstats.normaltest returns ``statistic`` and ``pvalue`` fields."""
    # The integers -2..3 repeated four times, then squared element-wise.
    sample = np.square(np.tile(np.arange(-2, 4), 4))
    outcome = mstats.normaltest(sample)
    expected_fields = ('statistic', 'pvalue')
    check_named_results(outcome, expected_fields, ma=True)
def test_normaltest_result_attributes(self):
    """Check that mstats.normaltest returns ``statistic`` and ``pvalue`` fields."""
    # The integers -2..3 repeated four times, then squared element-wise.
    sample = np.square(np.tile(np.arange(-2, 4), 4))
    outcome = mstats.normaltest(sample)
    expected_fields = ('statistic', 'pvalue')
    check_named_results(outcome, expected_fields, ma=True)
def _vec_sum_normality(scaled, col_names):
    """Return the per-row ``comp_vec_sum`` series of *scaled* and its normaltest result."""
    frame = pd.DataFrame(scaled, columns=col_names)
    frame['vec_sum'] = frame.apply(comp_vec_sum, axis=1)
    hist_data = frame['vec_sum']
    return hist_data, normaltest(hist_data)


def _show_normal_fit(hist_data, match, pvalue, fig_num, data_name):
    """Show the histogram and KDE of *hist_data* overlaid with a fitted normal pdf.

    *data_name* ('training' or 'test') is used in the legend, axis label
    and title; *match* and *pvalue* are the normaltest statistic and
    p-value reported in the title.
    """
    plt.figure(fig_num, figsize=(__PLOT_SIZE_X__, __PLOT_SIZE_Y__))
    plt.gcf().clear()
    hist_data.hist(normed=True)
    hist_data.plot(kind='kde', linewidth=2, color='r',
                   label='Distribution Of %s Data' % data_name.capitalize())

    # Evaluate the theoretical density over the x-range Matplotlib chose.
    # See: http://danielhnyk.cz/fitting-distribution-histogram-using-python/
    xt = plt.xticks()[0]
    lnspc = np.linspace(min(xt), max(xt), len(hist_data))
    m, s = stats.norm.fit(hist_data)  # mean and standard deviation
    plt.plot(lnspc, stats.norm.pdf(lnspc, m, s),
             label="Normal Distribution", color='k', linewidth=2)

    plt.xlabel("%s data feature vector distance/magnitude."
               % data_name.capitalize())
    plt.ylabel("Frequency.")
    title_str = ("Histogram and Distribution of %s data overlayed with"
                 " normal distribution. Degree of match = %.2f with"
                 " p-value = %.4E."
                 % (data_name, Decimal(match), Decimal(pvalue)))
    plt.title("\n".join(wrap(title_str, __MATPLOTLIP_TITLE_WIDTH__)))
    leg = plt.legend(loc='best', ncol=1, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.show()


def compare(dataset):
    """Report how closely the data in a CSV file follows a normal distribution.

    For each of ``__NUM_ITERATIONS__`` iterations the predictor columns
    (every column from the third onward) are scaled to [-1, 1], reduced
    to one value per row with ``comp_vec_sum``, tested with ``normaltest``
    and plotted against a fitted normal density.

    When ``__TRAINING_TEST_SPLIT__`` is not None, that fraction of the rows
    is sampled at random as the training set and the remaining rows form
    the test set, which is reported and plotted as well.

    :param dataset: path (or anything ``pandas.read_csv`` accepts) of the
        CSV file to analyse
    :returns: None; results are printed and shown as Matplotlib figures
    """
    df = pd.read_csv(dataset)
    use_split = __TRAINING_TEST_SPLIT__ is not None
    predictors = df.columns[2:]

    fig_ctr = 1
    for _ in range(__NUM_ITERATIONS__):
        if use_split:
            dfsvc_train = df.sample(frac=__TRAINING_TEST_SPLIT__)
            # The test set is every row that was not sampled for training.
            dfsvc_test = df.drop(dfsvc_train.index)
        else:
            dfsvc_train = df
        X = dfsvc_train[predictors]

        print("\n\n Scaling data set between [-1., 1.]")
        scaler = MinMaxScaler(feature_range=(-1., 1.))
        X_scaled = scaler.fit_transform(X)

        train_hist, train_hr = _vec_sum_normality(X_scaled, X.columns)
        print(" Data set match to normal dist: %.1f with p-value: %.4E"
              % (train_hr[0], Decimal(train_hr[1])))

        if use_split:
            # Reuse the scaling fitted on the training rows; refitting on
            # the test rows would put the two sets on different scales.
            X_test_scaled = scaler.transform(dfsvc_test[predictors])
            test_hist, test_hr = _vec_sum_normality(X_test_scaled, X.columns)
            print(" Test data set match to normal dist: %.1f with p-value: %.4E"
                  % (test_hr[0], Decimal(test_hr[1])))

        print("\n\nDisplaying histograms for data sets.")
        _show_normal_fit(train_hist, train_hr[0], train_hr[1],
                         fig_ctr, 'training')
        fig_ctr += 1
        if use_split:
            _show_normal_fit(test_hist, test_hr[0], test_hr[1],
                             fig_ctr, 'test')
            fig_ctr += 1

        # Blank line between iterations.
        print("")
# Rows of `table` whose "Measurement" equals the current descriptor, and the
# requested statistic from those rows as a plain list.
meas_table = table[table["Measurement"] == descriptor]
values_list = meas_table[stat_value].tolist()

# Keep the values (keyed by `tables`) and their maximum for visualisation.
data_d[tables] = values_list
max_vals.append(np.max(values_list))

# Flag the values using the normality-test p-value at the 5% level.
p_value = normaltest(values_list)[1]
looks_normal = bool(p_value > 0.05)
normtest = ("| Parametric distribution" if looks_normal
            else "| Non-parametric distribution")
normtest_list.append(looks_normal)
print(tables, normtest, p_value)
print("\n")

# Lists in `data_d` may differ in length; wrapping each in a Series lets
# pandas pad the shorter columns when building the frame.
dataframe = pd.DataFrame({k: pd.Series(v) for k, v in data_d.items()})
# plot acf and pacf of signals white_acf_pacf = plot_acf_pacf(signal=white_noise_signal, name='ACF_WN') blue_acf_pacf = plot_acf_pacf(signal=blue_noise_signal, name='ACF_BN') pink_acf_pacf = plot_acf_pacf(signal=pink_noise_signal, name='ACF_PN') # plot periodogram of signals hbo_specgram = plot_periodgram(signal=hbo_signal, name='FD_HBO', color='k') white_specgram = plot_periodgram(signal=extract_signal(file_name='white_noise.wav', num_frames=441000), name='FD_WN', color='k') blue_specgram = plot_periodgram(signal=extract_signal(file_name='blue_noise.wav', num_frames=441000), name='FD_BN', color='b') pink_specgram = plot_periodgram(signal=extract_signal(file_name='pink_noise.wav', num_frames=441000), name='FD_PN', color='m') plot_periodgram(plot_decomposition(), name='FD_SinFunc') # adf test hbo_adf = adfuller(hbo_signal) white_noise_adf = adfuller(white_noise_signal) blue_noise_adf = adfuller(blue_noise_signal) pink_noise_adf = adfuller(pink_noise_signal) # normality test hbo_norm = normaltest(hbo_signal) white_noise_norm = normaltest(white_noise_signal) blue_noise_norm = normaltest(blue_noise_signal) pink_noise_norm = normaltest(pink_noise_signal) # histogram plot plt.hist(white_noise_signal, bins=50) plt.hist(blue_noise_signal, bins=50) plt.hist(pink_noise_signal, bins=50)
print(
    " Deterining the degree of fit between training and test data to a normal distribution."
)
# Rebuild a labelled DataFrame from the scaled values, reusing X's column
# names.
col_names = X.columns
df_X_scaled = pd.DataFrame(X_scaled, columns=col_names)

# Compute the per-row vector sum in preparation to generate histograms.
# NOTE(review): this binds a second name to the same DataFrame, it does not
# copy it, so the 'vec_sum' column is added to df_X_scaled as well.
df_X_scaled_vecsum = df_X_scaled
df_X_scaled_vecsum['vec_sum'] = df_X_scaled_vecsum.apply(comp_vec_sum, axis=1)

# Extract the vector sum column from the data set
X_scaled_hist_data = df_X_scaled_vecsum['vec_sum']

# Run the normality test; element 0 is reported as the "match" and
# element 1 as its p-value.
X_scaled_hr = normaltest(X_scaled_hist_data)
X_scaled_hr_match = X_scaled_hr[0]
X_scaled_hr_match_pvalue = X_scaled_hr[1]
print(" Data set match to normal dist: %.1f with p-value: %.4E" % \
      (X_scaled_hr_match, Decimal(X_scaled_hr_match_pvalue)))

print("Displaying histograms for the data set.")
# Use figure number fig_ctr, advance the counter, and clear the figure
# before drawing.
fig = plt.figure(fig_ctr, figsize=(__PLOT_SIZE_X__, __PLOT_SIZE_Y__))
fig_ctr = 1 + fig_ctr
plt.gcf().clear()
# Normalised histogram with a kernel density estimate drawn over it.
X_scaled_hist_data.hist(normed=True)
X_scaled_hist_data.plot(kind = 'kde', linewidth = 2, \
                        color = 'r', label = 'Distribution Of The Data')
new_x = numpy.hstack((add_column, x_matrix)) #matrices multiplication step_one = numpy.dot (new_x.T, new_x) step_two = numpy.linalg.pinv(step_one) step_three = numpy.dot(step_two, new_x.T) coeffs = numpy.dot(step_three, y_matrix) errors = y_matrix - numpy.dot(new_x, coeffs) print coeffs #the model is too complicated (multidimentional) #are errors distributed normally? if yes, then the model is accurate import numpy as np import numpy.ma as ma from scipy.stats import mstats x = np.array(errors) z,pval = mstats.normaltest(x) #Tests whether a sample differs from a normal distribution. #This function tests the null hypothesis that a sample comes from a normal distribution print "Z-score:", z print "P-value:", pval if(pval < 0.055): print "Not normal distribution" if (pval >= 0.055): print "This seems to be a normal distribution! Our model is good"