def infer_ks_test_goodness(l1):
    """Run a one-sample KS test of *l1* against a normal fitted to *l1*.

    The reference normal uses the sample mean and standard deviation of
    *l1* as its parameters.  Prints ``'reject'`` when the p-value is below
    0.01 and ``'accept'`` otherwise, followed by the raw test result.
    Nothing is returned.
    """
    fitted_params = [average(l1), std(l1)]
    outcome = kstest(l1, 'norm', fitted_params)
    # outcome[1] is the p-value of the test.
    verdict = 'reject' if outcome[1] < 0.01 else 'accept'
    print(verdict)
    print(outcome)
def logistic(dataset, name):
    """KS-test column *name* of *dataset* against the standard logistic distribution.

    Args:
        dataset: Mapping of column names to pandas-like columns (the column
            must expose ``dtype`` and ``dropna()``).
        name: Column to test.

    Returns:
        ``(statistic, p_value)`` from ``stats.kstest`` when the column dtype
        is ``int64`` or ``float64`` and it holds at least one non-missing
        value; ``None`` otherwise.
    """
    column = dataset[name]
    if not (column.dtype == 'int64' or column.dtype == 'float64'):
        return None

    # dropna() returns a new object; the result has to be used, otherwise
    # missing values reach kstest and poison the statistic.
    x = np.array(column.dropna())
    if x.size == 0:
        return None

    z, p = stats.kstest(x, 'logistic')
    # The reported name now matches the distribution that is actually tested.
    if p < 0.055:
        print('It is Not a logistic distribution')
    else:
        print('It is a logistic distribution')
    return z, p
def typedis(dataset, name, dis):
    """KS-test column *name* of *dataset* against the distribution *dis*.

    *dis* is handed to ``stats.kstest`` unchanged; see
    http://docs.scipy.org/doc/scipy-0.14.0/reference/stats.html#module-scipy.stats
    for the available distribution names.

    Args:
        dataset: Mapping of column names to pandas-like columns (the column
            must expose ``dtype`` and ``dropna()``).
        name: Column to test.
        dis: Distribution identifier forwarded to ``stats.kstest``.

    Returns:
        ``(statistic, p_value)`` when the column dtype is ``int64`` or
        ``float64`` and it holds at least one non-missing value; ``None``
        otherwise.
    """
    column = dataset[name]
    if not (column.dtype == 'int64' or column.dtype == 'float64'):
        return None

    # dropna() returns a new object; the result has to be used, otherwise
    # missing values reach kstest and poison the statistic.
    x = np.array(column.dropna())
    if x.size == 0:
        return None

    z, p = stats.kstest(x, dis)
    # Single pre-formatted strings keep the historical output text while
    # being valid both as a print statement and as a print() call.
    if p < 0.055:
        print('It is Not as %s  distribution' % (dis,))
    else:
        print('It is a %s distribution' % (dis,))
    return z, p
def norm(dataset, name):
    """KS-test column *name* of *dataset* against the standard normal distribution.

    Args:
        dataset: Mapping of column names to pandas-like columns (the column
            must expose ``dtype`` and ``dropna()``).
        name: Column to test.

    Returns:
        ``(statistic, p_value)`` from ``stats.kstest`` when the column dtype
        is ``int64`` or ``float64`` and it holds at least one non-missing
        value; ``None`` otherwise.
    """
    column = dataset[name]
    if not (column.dtype == 'int64' or column.dtype == 'float64'):
        return None

    # dropna() returns a new object; the result has to be used, otherwise
    # missing values reach kstest and poison the statistic.
    x = np.array(column.dropna())
    if x.size == 0:
        return None

    z, p = stats.kstest(x, 'norm')
    if p < 0.055:
        print('It is Not a normal distribution')
    else:
        print('It is a normal distribution')
    return z, p
def welisberg(dataset, name):
    """KS-test column *name* of *dataset* against a fitted double Weibull distribution.

    ``scipy.stats.dweibull`` has a mandatory shape parameter, so the
    distribution cannot be tested without parameters: its shape, location
    and scale are estimated from the data with ``stats.dweibull.fit`` and
    then passed to ``stats.kstest``.

    Args:
        dataset: Mapping of column names to pandas-like columns (the column
            must expose ``dtype`` and ``dropna()``).
        name: Column to test.

    Returns:
        ``(statistic, p_value)`` when the column dtype is ``int64`` or
        ``float64`` and it holds at least one non-missing value; ``None``
        otherwise.
    """
    column = dataset[name]
    if not (column.dtype == 'int64' or column.dtype == 'float64'):
        return None

    # dropna() returns a new object; the result has to be used, otherwise
    # missing values reach kstest and poison the statistic.
    x = np.array(column.dropna())
    if x.size == 0:
        return None

    # kstest(x, 'dweibull') without args raises TypeError because the shape
    # parameter is missing, so supply fitted (c, loc, scale).
    # NOTE(review): 'dweibull' is the *double* Weibull; if the one-sided
    # Weibull was meant, 'weibull_min' is the matching name -- confirm.
    fitted = stats.dweibull.fit(x)
    z, p = stats.kstest(x, 'dweibull', args=fitted)
    if p < 0.055:
        print('It is Not a Weibull distribution')
    else:
        print('It is a weibull distribution')
    return z, p
def test_linear_studentt_parent_dist(self, graph):
    """
    Kolmogorov-Smirnov check of the first generated column against a
    Student-t CDF with 3 degrees of freedom; the p-value must be below 0.01.
    """
    np.random.seed(10)
    generated = generate_continuous_data(
        graph,
        distribution="student-t",
        noise_scale=1,
        n_samples=100000,
        seed=10,
    )
    first_column = generated[:, 0]
    ks_outcome = stats.kstest(first_column, "t", args=[3])
    # Index 1 of the KS result is the p-value.
    p_val = ks_outcome[1]
    assert p_val < 0.01
def freq(df, col, max1):
    """Find a decomposition frequency whose residuals pass a normality KS test.

    Every ``i`` in ``range(1, max1)`` is tried: ``df[col]`` is decomposed
    with ``seasonal_decompose(..., freq=i)``, NaN residuals are dropped and
    the remainder is KS-tested against the standard normal distribution.
    Frequencies for which a ``ValueError`` is raised are skipped.

    Args:
        df: Table-like object indexed by column name.
        col: Column of *df* to decompose.
        max1: Exclusive upper bound of the frequencies to try.

    Returns:
        The last tried frequency with a p-value of at least 0.055, or
        ``None`` when no frequency qualifies.
    """
    count = None
    for i in range(1, max1):
        try:
            # NOTE(review): newer statsmodels releases call this keyword
            # `period` -- confirm against the installed version.
            decomposed = seasonal_decompose(df[col].values, freq=i)
            # Work on a local array instead of assigning back onto the
            # result object, and index with the boolean mask directly: the
            # former list-wrapped mask (``resid[[mask]]``) is no longer
            # accepted by current NumPy.
            resid = np.asarray(decomposed.resid)
            resid = resid[~np.isnan(resid)]
            print(resid)
            _, p = stats.kstest(resid, 'norm')
            if p < 0.055:
                print('It is not the required freq')
            else:
                print('it is the required freq')
                count = i
        except ValueError:
            pass

    # Guard the empty range: `i` is unbound when no frequency was tried.
    if max1 > 1:
        # NOTE(review): this passes the last *tried* frequency, not the one
        # stored in `count`; kept as-is -- confirm which one is intended.
        decompose(df, col, i)
    return count
def _report(label, value):
    """Print one indented report line made of *label* followed by ``str(value)``."""
    print(" " * 8 + label + str(value))


def _paired_values(primary, secondary):
    """Return the value lists of both mappings for keys of *primary* found in both."""
    shared = [key for key in primary if key in secondary]
    return [primary[key] for key in shared], [secondary[key] for key in shared]


def main():
    """Print per-application and global statistics for ``finaldata.json``.

    Every top-level entry of the JSON file is unpacked into three mappings
    (call-graph score, issue score, class size).  For each application with
    more than three keys shared between issue and call-graph scores the
    function prints Spearman / Kendall correlations, KS normality tests and
    an OLS summary; other applications are reported as ``FAILURE``.  It
    then pools all applications and prints global correlations, two
    one-way ANOVAs and three OLS summaries.

    Raises:
        SystemExit: after printing ``Run analysis`` when the data file
            cannot be opened or parsed.
    """
    finaldatafile = "finaldata.json"
    try:
        with open(finaldatafile) as data_file:
            finalData = json.load(data_file)
    except (IOError, OSError, ValueError):
        # Missing/unreadable file or invalid JSON (JSON decode errors are
        # ValueErrors).  Anything else is a real bug and must surface.
        print("Run analysis")
        raise SystemExit

    for appliName in finalData:
        cgscore, issuescore, classSize = finalData[appliName]

        issueCallgraphValues, callGraphValues = _paired_values(issuescore, cgscore)
        issueSizeValues, classSizeValues = _paired_values(issuescore, classSize)
        modelKeys = [key for key in issuescore
                     if key in classSize and key in cgscore]
        j = len(callGraphValues)

        if j > 3:
            rho, rhoP = spearmanr(issueCallgraphValues, callGraphValues)
            tau, tauP = kendalltau(issueCallgraphValues, callGraphValues)
            # The issue-score KS test is reported in both sections; it is
            # the same computation, so run it once.
            issueD, issueP = kstest(list(issuescore.values()), "norm")
            cgD, cgP = kstest(list(cgscore.values()), "norm")
            rho2, rho2P = spearmanr(issueSizeValues, classSizeValues)
            tau2, tau2P = kendalltau(issueSizeValues, classSizeValues)
            sizeD, sizeP = kstest(list(classSize.values()), "norm")

            print(appliName)
            print("--- API Call <> Issue")
            _report("Spearman rho correlation coefficient = ", rho)
            _report("Spearman p-value = ", rhoP)
            _report("Kendall Tau = ", tau)
            _report("Kendall p-value = ", tauP)
            _report("KS Test D = ", issueD)
            _report("KS p-value = ", issueP)
            _report("KS Test D = ", cgD)
            _report("KS p-value = ", cgP)
            _report("dataset size =", j)
            print("--- Class Size <> Issue")
            _report("Spearman rho correlation coefficient = ", rho2)
            _report("Spearman p-value = ", rho2P)
            _report("Kendall Tau = ", tau2)
            _report("Kendall p-value = ", tau2P)
            _report("KS Test D = ", issueD)
            _report("KS p-value = ", issueP)
            _report("KS Test D = ", sizeD)
            _report("KS p-value = ", sizeP)

            y = [issuescore[key] for key in modelKeys]
            X = np.array([[cgscore[key] for key in modelKeys],
                          [classSize[key] for key in modelKeys]]).transpose()
            results = sm.OLS(y, X).fit()
            print(results.summary(yname="issues",
                                  xname=("APIcalls", "ClassSize")))
        else:
            print("FAILURE : " + appliName)

    # Banner between the per-application reports and the global section
    # (the collapsed source is read as placing it after the loop).
    print("|" * 80)
    print("-" * 80)
    print("-" * 80)
    print("|" * 80)

    issueForGlobalModel = []
    callGraphForGlobalModel = []
    classSizeForGlobalModel = []
    issueGlobalCallgraphValues = []
    callGlobalGraphValues = []
    noIssueGlobalCallgraphValues = []
    issueGlobalSizeValues = []
    classGlobalSizeValues = []
    anova1issue = []
    anova2issue = []

    for appliName in finalData:
        cgscore, issuescore, classSize = finalData[appliName]

        for key in issuescore:
            if key in classSize and key in cgscore:
                issueForGlobalModel.append(issuescore[key])
                callGraphForGlobalModel.append(cgscore[key])
                # Fixed: the class-size regressor used to be filled with the
                # issue score, i.e. a copy of the dependent variable.
                classSizeForGlobalModel.append(classSize[key])

        issues, calls = _paired_values(issuescore, cgscore)
        issueGlobalCallgraphValues.extend(issues)
        callGlobalGraphValues.extend(calls)
        noIssueGlobalCallgraphValues.extend(
            issuescore[key] for key in issuescore if key not in cgscore)

        anova1issue.extend(cgscore[key] for key in cgscore if key in issuescore)
        anova2issue.extend(cgscore[key] for key in cgscore if key not in issuescore)

        issues, sizes = _paired_values(issuescore, classSize)
        issueGlobalSizeValues.extend(issues)
        classGlobalSizeValues.extend(sizes)

    rhoG, rhoGP = spearmanr(issueGlobalCallgraphValues, callGlobalGraphValues)
    tauG, tauGP = kendalltau(issueGlobalCallgraphValues, callGlobalGraphValues)
    rhoG2, rhoG2P = spearmanr(issueGlobalSizeValues, classGlobalSizeValues)
    tauG2, tauG2P = kendalltau(issueGlobalSizeValues, classGlobalSizeValues)
    fvalueanova1, pvalueanova1 = f_oneway(issueGlobalCallgraphValues,
                                          noIssueGlobalCallgraphValues)
    fvalueanova2, pvalueanova2 = f_oneway(anova1issue, anova2issue)

    print(len(noIssueGlobalCallgraphValues))
    print("--- Correlation : API Call <> Issue")
    _report("Spearman rho correlation coefficient = ", rhoG)
    _report("Spearman p-value = ", rhoGP)
    _report("Kendall Tau = ", tauG)
    _report("Kendall p-value = ", tauGP)
    _report("ANOVA F-value = ", fvalueanova1)
    _report("ANOVA p-value = ", pvalueanova1)
    _report("ANOVA F-value = ", fvalueanova2)
    _report("ANOVA p-value = ", pvalueanova2)
    print("--- Correlation : Class Size <> Issue")
    _report("Spearman rho correlation coefficient = ", rhoG2)
    _report("Spearman p-value = ", rhoG2P)
    _report("Kendall Tau = ", tauG2)
    _report("Kendall p-value = ", tauG2P)
    print("_" * 80)
    print("_" * 80)

    print("-- GLOBAL OLS --")
    y = issueForGlobalModel
    X = np.array([callGraphForGlobalModel, classSizeForGlobalModel]).transpose()
    X = sm.add_constant(X, prepend=False)
    results = sm.OLS(y, X).fit()
    print(results.summary(yname="issues",
                          xname=("APIcalls", "ClassSize", "const")))

    print("API CALLS only")
    X = sm.add_constant(callGraphForGlobalModel, prepend=False)
    results = sm.OLS(y, X).fit()
    print(results.summary(yname="issues", xname=["APIcalls", "const"]))

    print("Size only")
    X = sm.add_constant(classSizeForGlobalModel, prepend=False)
    results = sm.OLS(y, X).fit()
    print(results.summary(yname="issues", xname=["ClassSize", "const"]))
def _ks_tests_against_uniform(uniformSamples, numDims, plot, titlePrefix,
                              numPlotPoints):
    """KS-test the first *numDims* columns of *uniformSamples* against ``Uniform(0, 1).cdf``.

    Returns a list with one ``(statistic, pvalue)`` tuple per dimension.
    When *plot* is true, a cumulative histogram and the reference CDF
    (evaluated at *numPlotPoints* points) are drawn per dimension.
    """
    kstests = [None] * numDims
    for idim in range(numDims):
        samples1d = uniformSamples[:, idim]
        res_test = kstest(samples1d, Uniform(0, 1).cdf)
        kstests[idim] = res_test.statistic, res_test.pvalue

        if plot:
            plt.figure()
            # `normed` was removed from matplotlib's hist(); `density` is
            # its replacement.
            plt.hist(samples1d, cumulative=True, density=True)
            xs = np.linspace(0, 1, numPlotPoints)
            plt.plot(xs, [Uniform(0, 1).cdf(xi) for xi in xs])
            plt.title("%s: %i, %s" % (titlePrefix, idim, kstests[idim]))
    return kstests


def run_densityEstimation(
        functionName,
        method,
        kfold=20,
        numDims=2,
        numSamples=1000,
        candidates="join",
        bandwidthOptimizationType=BandwidthOptimizationType_SILVERMANSRULE,
        out=True,
        plot=False,
        tikz=False):
    """Estimate a density *kfold* times and record validation statistics.

    The data set is loaded with ``load_data_set`` and split once into a
    learning part and a validation part (``splitPercentage=0.7``).  In every
    fold the learning part is split again, a density is estimated with the
    estimator selected by *method* (a name containing ``"sgde"``, ``"kde"``
    or ``"nataf"``), and the cross entropy on the validation part, random
    draws and per-dimension KS tests of ``dist.cdf(validationSamples)``
    against ``Uniform(0, 1)`` are stored for both ``shuffle=True`` and
    ``shuffle=False``.

    Args:
        functionName: Data-set identifier; also used in output file names.
        method: Estimator name; ``"sgde_zero"`` selects ``"zero"``
            interpolation, everything else ``"boundaries"``.
        kfold: Number of folds.
        numDims: Number of dimensions to load and to KS-test.
        numSamples: Number of samples to load and to draw per fold.
        candidates: Forwarded to ``estimateSGDEDensity``.
        bandwidthOptimizationType: Forwarded to ``estimateKDEDensity``.
        out: When true, results are written below ``data/<method>/``.
        plot: When true, diagnostic plots are shown.
        tikz: Unused by this function.

    Raises:
        AttributeError: if *method* matches none of the known estimators.
    """
    interpolation = "zero" if method == "sgde_zero" else "boundaries"

    samples, bounds, natafType = load_data_set(functionName, numSamples,
                                               numDims)

    # do kfold cross validation
    crossEntropyValidation = np.zeros((kfold, 2))
    learnSamples, validationSamples = splitset(samples, splitPercentage=0.7)

    stats = {}
    for i in range(kfold):
        print("=" * 100)
        print("run (%s)= %i/%i" % (method, i + 1, kfold))
        print("=" * 100)
        print("valid: %i x %i (mean=%g, var=%g)" %
              (validationSamples.shape[0], validationSamples.shape[1],
               np.mean(validationSamples), np.var(validationSamples)))

        np.random.seed(i * 123456 + i % 2)
        trainSamples, testSamples = splitset(
            learnSamples, splitPercentage=1. - 1. / kfold)

        if "sgde" in method:
            dist, stats[i] = estimateSGDEDensity(
                functionName, trainSamples, testSamples, bounds=bounds,
                iteration=i, plot=plot, label=method, out=out,
                candidates=candidates, interpolation=interpolation)
        elif "kde" in method:
            dist, stats[i] = estimateKDEDensity(
                functionName, trainSamples, testSamples, iteration=i,
                plot=plot, label=method, out=out,
                bandwidthOptimizationTypeStr=bandwidthOptimizationType)
        elif "nataf" in method:
            # estimate nataf density
            dist, stats[i] = estimateNatafDensity(
                functionName, natafType, testSamples, iteration=i,
                bounds=bounds, plot=plot, label=method, out=out)
        else:
            raise AttributeError("unknown config '%s'" % method)

        # evaluate the distribution according to the validation set
        # (computed once and stored in both places)
        validationCrossEntropy = dist.crossEntropy(validationSamples)
        crossEntropyValidation[i, 0] = i
        crossEntropyValidation[i, 1] = validationCrossEntropy
        stats[i]["crossEntropyValidation"] = validationCrossEntropy
        stats[i]["validationSamples"] = validationSamples
        stats[i]["samples"] = {"shuffled": {}, "not_shuffled": {}}

        shuffled = stats[i]["samples"]["shuffled"]
        shuffled["rvs"] = dist.rvs(numSamples, shuffle=True)
        shuffled["uniform_validation"] = dist.cdf(validationSamples,
                                                  shuffle=True)
        kstests = _ks_tests_against_uniform(
            shuffled["uniform_validation"], numDims, plot, "shuffled", 10)
        print("-" * 80)
        print("shuffled    ", kstests, np.min(kstests), np.max(kstests))
        if plot:
            plt.show()
        shuffled["kstests"] = kstests

        notShuffled = stats[i]["samples"]["not_shuffled"]
        notShuffled["rvs"] = dist.rvs(numSamples, shuffle=False)
        notShuffled["uniform_validation"] = dist.cdf(validationSamples,
                                                     shuffle=False)
        kstests = _ks_tests_against_uniform(
            notShuffled["uniform_validation"], numDims, plot, "not shuffled",
            1000)
        print("not shuffled", kstests, np.min(kstests), np.max(kstests))
        if plot:
            plt.show()
        notShuffled["kstests"] = kstests

        print("CV valid = %g" % crossEntropyValidation[i, 1])

        # write results to file after every fold so that partial results
        # survive an aborted run
        if out:
            out_crossEntropy = os.path.join(
                "data", method,
                "%s.%s.validation.cross_entropies.csv" % (method,
                                                          functionName))
            # `[:i + 1]` includes the fold that just finished; `[:i]`
            # silently dropped the most recent row.
            np.savetxt(out_crossEntropy, crossEntropyValidation[:i + 1, :])

            # save stats to pickle -- pickle needs a binary file handle, and
            # the context manager closes it even if dumping fails
            out_stats = os.path.join(
                "data", method,
                "%s.%s.best.stats.pkl" % (method, functionName))
            with open(out_stats, "wb") as fd:
                pkl.dump(stats, fd)
# Generate sample die-roll data: 100 rolls of a 12-sided die.
rolls = randint(1, 13, 100)
print(rolls)
print(mean(rolls))

# Demonstration of the central limit theorem.
# Seed the random number generator.
seed(1)
# Calculate the mean of 100 dice rolls, 1000 times.
means = [mean(randint(1, 13, 100)) for _ in range(1000)]
# Plot the distribution of the sample means.
# NOTE(review): the x axis of this histogram carries the sample means and
# the y axis the bin counts, so the two axis labels below look swapped or
# mislabeled -- confirm before changing them.
pyplot.hist(means)
plt.xlabel("Frequency")
plt.ylabel("Probability Density")
plt.title("Histogram plot for 100 rolls 12-sided die")
pyplot.show()

# Z-score KS test.  The result used to be computed and thrown away; print
# it so the script actually reports the test.
print(stats.kstest(stats.zscore(means), "norm"))

# Shapiro-Wilk normality test.
from scipy.stats import shapiro
stat, p = shapiro(means)
print('Statistics={}, p={}'.format(stat, p))
alpha = 0.05
if p > alpha:
    print('Sample looks Normal so we do not reject H0')
else:
    print('Sample does not look Normal so we reject H0')
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import numpy as np
import scipy.optimize
from astropy.modeling import models, fitting
from scipy.stats import norm, stats

plt.clf()
f, (ax1, ax2, ax3, ax4) = plt.subplots(4, sharex=True, sharey=True)

## load data
fluxes = np.loadtxt("fluxes.dat")

# - make statistics p-value test
ptest = scipy.stats.mstats.normaltest(fluxes)
kstest = stats.kstest(fluxes, 'norm')
print(ptest)
print(kstest)

## Following http://stackoverflow.com/questions/7805552/fitting-a-histogram-with-python
bins = np.linspace(0, 1, 40)
# `normed` was removed from matplotlib's hist(); `density=True` is the
# replacement that normalizes the histogram to unit area.
n, bins, patches = ax1.hist(fluxes, bins, density=True, facecolor='green',
                            alpha=0.75)
(mu, sigma) = norm.fit(fluxes)
# mlab.normpdf was removed from matplotlib; scipy's norm.pdf evaluates the
# same normal density for the given mean and standard deviation.
y = norm.pdf(bins, mu, sigma)
l = ax1.plot(bins, y, 'r--', linewidth=2, label='mlab.normpdf')
warnings.simplefilter('ignore')

# Fit every candidate distribution to the travel times, overlay its scaled
# PDF on the current plot and record its KS statistic and p-value.
for dist_name in dist_names:
    dist = getattr(scipy.stats, dist_name)
    param = dist.fit(df['mean_travel_time'])
    # The fitted tuple ends with (loc, scale); anything before is shape.
    pdf_fitted = dist.pdf(
        x, *param[:-2], loc=param[-2], scale=param[-1]
    ) * size
    plt.plot(pdf_fitted, label=dist_name)
    plt.xlabel("Trip Duration (Minutes)")
    plt.ylabel("Frequency")
    plt.xlim(0, 50)
    plt.ylim(0, 8000)
    params[dist_name] = param

    # Kolmogorov-Smirnov test of the data against the fitted distribution
    D, p = stats.kstest(df['mean_travel_time'], dist_name, args=param)
    print("p value for %s = %s" % (dist_name, str(p)))
    print("D value for %s = %s\n" % (dist_name, str(D)))
    dist_results.append((dist_name, p))
    dist_resultsD.append((dist_name, D))

plt.legend(loc='upper right')
plt.show()

# Best fit: largest p-value; when even that is below 0.001, fall back to
# the smallest D statistic.
best_dist, best_p = max(dist_results, key=lambda entry: entry[1])
if best_p < 0.001:
    best_dist, best_D = min(dist_resultsD, key=lambda entry: entry[1])

# report the name of the best fit
print("Best fitting distribution: %s" % str(best_dist))
def _report(label, value):
    """Print one indented report line made of *label* followed by ``str(value)``."""
    print(" " * 8 + label + str(value))


def _paired_values(primary, secondary):
    """Return the value lists of both mappings for keys of *primary* found in both."""
    shared = [key for key in primary if key in secondary]
    return [primary[key] for key in shared], [secondary[key] for key in shared]


def main():
    """Print per-application and global statistics for ``finaldata.json``.

    Every top-level entry of the JSON file is unpacked into three mappings
    (call-graph score, issue score, class size).  For each application with
    more than three keys shared between issue and call-graph scores the
    function prints Spearman / Kendall correlations, KS normality tests and
    an OLS summary; other applications are reported as ``FAILURE``.  It
    then pools all applications and prints global correlations, two
    one-way ANOVAs and three OLS summaries.

    Raises:
        SystemExit: after printing ``Run analysis`` when the data file
            cannot be opened or parsed.
    """
    finaldatafile = "finaldata.json"
    try:
        with open(finaldatafile) as data_file:
            finalData = json.load(data_file)
    except (IOError, OSError, ValueError):
        # Missing/unreadable file or invalid JSON (JSON decode errors are
        # ValueErrors).  Anything else is a real bug and must surface.
        print("Run analysis")
        raise SystemExit

    for appliName in finalData:
        cgscore, issuescore, classSize = finalData[appliName]

        issueCallgraphValues, callGraphValues = _paired_values(issuescore, cgscore)
        issueSizeValues, classSizeValues = _paired_values(issuescore, classSize)
        modelKeys = [key for key in issuescore
                     if key in classSize and key in cgscore]
        j = len(callGraphValues)

        if j > 3:
            rho, rhoP = spearmanr(issueCallgraphValues, callGraphValues)
            tau, tauP = kendalltau(issueCallgraphValues, callGraphValues)
            # The issue-score KS test is reported in both sections; it is
            # the same computation, so run it once.
            issueD, issueP = kstest(list(issuescore.values()), "norm")
            cgD, cgP = kstest(list(cgscore.values()), "norm")
            rho2, rho2P = spearmanr(issueSizeValues, classSizeValues)
            tau2, tau2P = kendalltau(issueSizeValues, classSizeValues)
            sizeD, sizeP = kstest(list(classSize.values()), "norm")

            print(appliName)
            print("--- API Call <> Issue")
            _report("Spearman rho correlation coefficient = ", rho)
            _report("Spearman p-value = ", rhoP)
            _report("Kendall Tau = ", tau)
            _report("Kendall p-value = ", tauP)
            _report("KS Test D = ", issueD)
            _report("KS p-value = ", issueP)
            _report("KS Test D = ", cgD)
            _report("KS p-value = ", cgP)
            _report("dataset size =", j)
            print("--- Class Size <> Issue")
            _report("Spearman rho correlation coefficient = ", rho2)
            _report("Spearman p-value = ", rho2P)
            _report("Kendall Tau = ", tau2)
            _report("Kendall p-value = ", tau2P)
            _report("KS Test D = ", issueD)
            _report("KS p-value = ", issueP)
            _report("KS Test D = ", sizeD)
            _report("KS p-value = ", sizeP)

            y = [issuescore[key] for key in modelKeys]
            X = np.array([[cgscore[key] for key in modelKeys],
                          [classSize[key] for key in modelKeys]]).transpose()
            results = sm.OLS(y, X).fit()
            print(results.summary(yname="issues",
                                  xname=("APIcalls", "ClassSize")))
        else:
            print("FAILURE : " + appliName)

    # Banner between the per-application reports and the global section
    # (the collapsed source is read as placing it after the loop).
    print("|" * 80)
    print("-" * 80)
    print("-" * 80)
    print("|" * 80)

    issueForGlobalModel = []
    callGraphForGlobalModel = []
    classSizeForGlobalModel = []
    issueGlobalCallgraphValues = []
    callGlobalGraphValues = []
    noIssueGlobalCallgraphValues = []
    issueGlobalSizeValues = []
    classGlobalSizeValues = []
    anova1issue = []
    anova2issue = []

    for appliName in finalData:
        cgscore, issuescore, classSize = finalData[appliName]

        for key in issuescore:
            if key in classSize and key in cgscore:
                issueForGlobalModel.append(issuescore[key])
                callGraphForGlobalModel.append(cgscore[key])
                # Fixed: the class-size regressor used to be filled with the
                # issue score, i.e. a copy of the dependent variable.
                classSizeForGlobalModel.append(classSize[key])

        issues, calls = _paired_values(issuescore, cgscore)
        issueGlobalCallgraphValues.extend(issues)
        callGlobalGraphValues.extend(calls)
        noIssueGlobalCallgraphValues.extend(
            issuescore[key] for key in issuescore if key not in cgscore)

        anova1issue.extend(cgscore[key] for key in cgscore if key in issuescore)
        anova2issue.extend(cgscore[key] for key in cgscore if key not in issuescore)

        issues, sizes = _paired_values(issuescore, classSize)
        issueGlobalSizeValues.extend(issues)
        classGlobalSizeValues.extend(sizes)

    rhoG, rhoGP = spearmanr(issueGlobalCallgraphValues, callGlobalGraphValues)
    tauG, tauGP = kendalltau(issueGlobalCallgraphValues, callGlobalGraphValues)
    rhoG2, rhoG2P = spearmanr(issueGlobalSizeValues, classGlobalSizeValues)
    tauG2, tauG2P = kendalltau(issueGlobalSizeValues, classGlobalSizeValues)
    fvalueanova1, pvalueanova1 = f_oneway(issueGlobalCallgraphValues,
                                          noIssueGlobalCallgraphValues)
    fvalueanova2, pvalueanova2 = f_oneway(anova1issue, anova2issue)

    print(len(noIssueGlobalCallgraphValues))
    print("--- Correlation : API Call <> Issue")
    _report("Spearman rho correlation coefficient = ", rhoG)
    _report("Spearman p-value = ", rhoGP)
    _report("Kendall Tau = ", tauG)
    _report("Kendall p-value = ", tauGP)
    _report("ANOVA F-value = ", fvalueanova1)
    _report("ANOVA p-value = ", pvalueanova1)
    _report("ANOVA F-value = ", fvalueanova2)
    _report("ANOVA p-value = ", pvalueanova2)
    print("--- Correlation : Class Size <> Issue")
    _report("Spearman rho correlation coefficient = ", rhoG2)
    _report("Spearman p-value = ", rhoG2P)
    _report("Kendall Tau = ", tauG2)
    _report("Kendall p-value = ", tauG2P)
    print("_" * 80)
    print("_" * 80)

    print("-- GLOBAL OLS --")
    y = issueForGlobalModel
    X = np.array([callGraphForGlobalModel, classSizeForGlobalModel]).transpose()
    X = sm.add_constant(X, prepend=False)
    results = sm.OLS(y, X).fit()
    print(results.summary(yname="issues",
                          xname=("APIcalls", "ClassSize", "const")))

    print("API CALLS only")
    X = sm.add_constant(callGraphForGlobalModel, prepend=False)
    results = sm.OLS(y, X).fit()
    print(results.summary(yname="issues", xname=["APIcalls", "const"]))

    print("Size only")
    X = sm.add_constant(classSizeForGlobalModel, prepend=False)
    results = sm.OLS(y, X).fit()
    print(results.summary(yname="issues", xname=["ClassSize", "const"]))
def pvalue(self):
    """Return the KS p-value of the samples against the standard uniform.

    ``self.samples`` is first rescaled with ``self.bounds`` (lower, upper)
    so that the bounds map onto 0 and 1.
    """
    lower, upper = self.bounds
    span = upper - lower
    rescaled = (self.samples - lower) / span
    # Index 1 of the KS result is the p-value; the statistic is not needed.
    return kstest(rescaled, 'uniform')[1]
# 100 draws from a normal distribution with the configured mu / sigma.
x = np.random.normal(mu, sigma, 100)
print(mean(x))
pyplot.hist(x)
pyplot.show()

# Means of 1000 such samples.
norm = [np.mean(np.random.normal(mu, sigma, 100)) for _i in range(1000)]
print(norm)
# histogram plot
# NOTE(review): the x axis of these histograms carries the sampled values
# and the y axis the bin counts, so the axis labels look swapped or
# mislabeled -- confirm before changing them.
pyplot.hist(norm)
plt.xlabel("Frequency")
plt.ylabel("Probability Density")
plt.title ("Histogram plot for 100 samples from Random Normal Distribution")
pyplot.show()

# Z-score KS test.  The result used to be computed and thrown away; print
# it so the script actually reports the test.
print(stats.kstest(stats.zscore(norm), "norm"))

# poisson
seed (2)
t = np.random.poisson(lam=3, size=(100))
print(mean(t))
pyplot.hist(t)
plt.xlabel("Frequency")
plt.ylabel("Probability Density")
plt.title ("Histogram plot for Random Poisson Distribution of 100 samples (lambda=3)")
pyplot.show()

# Means of 1000 Poisson samples of size 100.
s = [np.mean(np.random.poisson(3, 100)) for _i in range(1000)]
pyplot.hist(s)
plt.xlabel("Frequency")
import numpy as np
from numpy import var, std
from scipy.stats import stats

from statistic.check.util import s_2, sigma_2

__author__ = 'zzt'

if __name__ == '__main__':
    l = [420, 500, 920, 1380, 1510, 1650, 1760, 2100, 2300, 2350]
    # Distribution parameters are passed positionally as (loc, scale).  A
    # single value of 1500 was therefore taken as the location (shifting
    # the support to start at 1500) instead of the scale; testing against
    # an exponential with mean 1500 needs loc=0, scale=1500.
    print(stats.kstest(l, 'expon', (0, 1500.0)))

    x = np.linspace(-15, 15, 9)
    print(std(x)**2)
    print(var(x))
    print(s_2(x))
    print(sigma_2(x))
    # For 'norm' the two values are (loc, scale), i.e. mean 0 and standard
    # deviation 9.
    print(stats.kstest(x, 'norm', [0, 9]))
def def_kstest(rvs1, cdf1, args1=(), alternative1='two-sided'):
    """Forward the arguments to ``kstest`` and return its result unchanged.

    *rvs1* and *cdf1* are the sample and reference distribution, *args1*
    the distribution parameters and *alternative1* the alternative
    hypothesis, each passed through to the parameter of ``kstest`` with the
    same name minus the trailing ``1``.
    """
    return kstest(rvs1, cdf1, args=args1, alternative=alternative1)
params = {}

# Fit every candidate distribution to the counts with warnings silenced,
# overlay its scaled PDF on the current plot and record its KS p-value.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    for dist_name in dist_names:
        dist = getattr(scipy.stats, dist_name)
        param = dist.fit(df['count'])
        # The fitted tuple ends with (loc, scale); anything before is shape.
        pdf_fitted = dist.pdf(
            x, *param[:-2], loc=param[-2], scale=param[-1]
        ) * size
        plt.plot(pdf_fitted, label=dist_name)
        plt.xlim(0, 50)
        plt.ylim(0, 500)
        params[dist_name] = param

        # Kolmogorov-Smirnov test of the data against the fitted distribution
        D, p = stats.kstest(df['count'], dist_name, args=param)
        print("p value for %s = %s" % (dist_name, str(p)))
        dist_results.append((dist_name, p))

    plt.legend(loc='upper right')
    plt.show()

# The best fit is the candidate with the largest p-value.
best_dist, best_p = max(dist_results, key=lambda entry: entry[1])

# report the best fit, its p value and its fitted parameters
print("Best fitting distribution: %s" % str(best_dist))
print("Best p value: %s" % str(best_p))
print("Parameters for the best fit: %s" % str(params[best_dist]))