def __init__(self, filename, sep=",", skip=0, index=True, header=True):
    if isinstance(filename, str):
        allm = gendata(filename, sep=sep, skip=skip, index=index, header=header)
        if index:
            self.date = allm[0]
            self.data = allm[1]
            self.header = allm[2]
        else:
            self.data = allm[0]
            self.header = allm[1]
            self.date = "none"
        self.data = self.data.transpose()
    else:
        self.data = filename
        self.date = index
        self.header = header
    self.N, self.T = self.data.shape
    self.mean = [sps.tmean(i) for i in self.data]
    if self.N > 1:
        self.variance = [sps.tvar(i) for i in self.data]
    else:
        self.variance = [sps.tvar(self.data)]
    self.mean = np.array(self.mean)
    self.variance = np.array(self.variance)
    self.covariance = np.zeros((self.N, self.N))
    self.correlation = np.zeros((self.N, self.N))
    self.skewness = 0
    self.kurtosis = 0
    self.dmean = (self.data.transpose() - self.mean).transpose()
    self.did_covar = False
    self.JB = 0
    self.JBpvalue = 0
def query4(self, length=8):
    global data1
    data1 = pandas.read_sql_query(query['4a'], cnx)
    pysql = lambda q: pandasql.sqldf(q, globals())
    data1_rep = pysql('select p_id as "Patient ID", exp as "Expression Val" from data1')
    global data2
    data2 = pandas.read_sql_query(query['4b'], cnx)
    data2_rep = pysql('select p_id as "Patient ID", exp as "Expression Val" from data2')
    a = data1['exp'].values
    b = data2['exp'].values
    print(stats.tmean(a))
    print(stats.tmean(b))
    print(stats.tvar(a))
    print(stats.tvar(b))
    return """<html>
    <form method="get" action="index">
    <button type="submit">Return</button>
    </form>
    <form method="post" action="processQuery4">
    Custom Query on Result: <input type="text" name="qu"><br>
    <input type="submit">
    </form>
    <h2>T-statistics for Exp Values:</h2>""" + str(stats.ttest_ind(a, b, equal_var=True)[0]) + """
    <h1>Exp values for patients with ALL<h3>(Rows-""" + str(len(data1.index)) + """)</h3></h1>""" + data1_rep.to_html(index=False) + """
    <h1>Exp values for patients without ALL<h3>(Rows-""" + str(len(data2.index)) + """)</h3></h1>""" + data2_rep.to_html(index=False) + """
    </html>"""
def test_tvar(self):
    for n in self.get_n():
        x, y, xm, ym = self.generate_xy_sample(n)
        assert_almost_equal(stats.tvar(x), stats.mstats.tvar(xm), decimal=12)
        assert_almost_equal(stats.tvar(y), stats.mstats.tvar(ym), decimal=12)
def get_12ECG_features(data, header_data):
    tmp_hea = header_data[0].split(' ')
    ptID = tmp_hea[0]
    num_leads = int(tmp_hea[1])
    sample_Fs = int(tmp_hea[2])
    gain_lead = np.zeros(num_leads)
    for ii in range(num_leads):
        tmp_hea = header_data[ii + 1].split(' ')
        gain_lead[ii] = int(tmp_hea[2].split('/')[0])

    # For testing, we use the mean age of 57 if the age is NaN.
    # This value will change as more data is released.
    for iline in header_data:
        if iline.startswith('#Age'):
            tmp_age = iline.split(': ')[1].strip()
            age = int(tmp_age if tmp_age != 'NaN' else 57)
        elif iline.startswith('#Sex'):
            tmp_sex = iline.split(': ')[1]
            if tmp_sex.strip() == 'Female':
                sex = 1
            else:
                sex = 0
        # elif iline.startswith('#Dx'):
        #     label = iline.split(': ')[1].split(',')[0]

    # We are only using data from lead I.
    peaks, idx = detect_peaks(data[0], sample_Fs, gain_lead[0])

    # mean
    mean_RR = np.mean(idx / sample_Fs * 1000)
    mean_Peaks = np.mean(peaks * gain_lead[0])
    # median
    median_RR = np.median(idx / sample_Fs * 1000)
    median_Peaks = np.median(peaks * gain_lead[0])
    # standard deviation
    std_RR = np.std(idx / sample_Fs * 1000)
    std_Peaks = np.std(peaks * gain_lead[0])
    # variance
    var_RR = stats.tvar(idx / sample_Fs * 1000)
    var_Peaks = stats.tvar(peaks * gain_lead[0])
    # skewness
    skew_RR = stats.skew(idx / sample_Fs * 1000)
    skew_Peaks = stats.skew(peaks * gain_lead[0])
    # kurtosis
    kurt_RR = stats.kurtosis(idx / sample_Fs * 1000)
    kurt_Peaks = stats.kurtosis(peaks * gain_lead[0])

    features = np.hstack([
        age, sex, mean_RR, mean_Peaks, median_RR, median_Peaks, std_RR,
        std_Peaks, var_RR, var_Peaks, skew_RR, skew_Peaks, kurt_RR, kurt_Peaks
    ])
    return features
def query4(self):
    global data1
    data1 = pandas.read_sql_query(query['4a'], cnx)
    global data2
    data2 = pandas.read_sql_query(query['4b'], cnx)
    a = data1['Expression Val'].values
    b = data2['Expression Val'].values
    print(stats.tmean(a))
    print(stats.tmean(b))
    print(stats.tvar(a))
    print(stats.tvar(b))
    tt = stats.ttest_ind(a, b, equal_var=True)
    return """<html>
    <form method="get" action="index">
    <button type="submit">Return</button>
    </form>
    <form method="post" action="processQuery4">
    Custom Query on Result: <input type="text" name="qu"><br>
    <input type="submit">
    </form>
    <h2>T-statistics for Exp Values:</h2>""" + str(tt[0]) + """
    <h2>Corresponding p-value:</h2>""" + str(tt[1]) + """
    <h1>Exp values for patients with ALL<h3>(Rows-""" + str(len(data1.index)) + """)</h3></h1>""" + data1.to_html(index=False) + """
    <h1>Exp values for patients without ALL<h3>(Rows-""" + str(len(data2.index)) + """)</h3></h1>""" + data2.to_html(index=False) + """
    </html>"""
def tableauMAJ(u, v, w, x, tableau_final):
    sigma1 = np.sqrt(stats.tvar(data["GasCum360"]))
    sigma2 = np.sqrt(stats.tvar(data["OilCum360"]))
    tableau_final["Gas360_SUP"] = tableau_final["GasCum360"] + u * sigma1
    tableau_final["Gas360_INF"] = tableau_final["GasCum360"] - v * sigma1
    tableau_final["Oil360_SUP"] = tableau_final["OilCum360"] + w * sigma2
    tableau_final["Oil360_INF"] = tableau_final["OilCum360"] - x * sigma2
    return tableau_final
def chi_2(sample, alf=0.05):
    # Chi-squared confidence interval for the population variance. `alf` is
    # the significance level; the original default of 0.95 produced a 5%
    # interval rather than a 95% one. The parameter was also renamed from
    # `list`, which shadowed the builtin.
    n = len(sample)
    s2 = stats.tvar(sample)
    upper = ((n - 1) * s2) / stats.chi2.ppf(alf / 2, n - 1)
    lower = ((n - 1) * s2) / stats.chi2.ppf(1 - (alf / 2), n - 1)
    return (lower, upper)
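# Usage sketch for chi_2 above (not from the original source; assumes the
# module imports `scipy.stats as stats`, and the data below is made up):
import numpy as np

rng = np.random.default_rng(0)
sample = rng.normal(loc=0.0, scale=2.0, size=50)  # true variance is 4
low, high = chi_2(sample, alf=0.05)
print("95% CI for the variance: [{:.3f}, {:.3f}]".format(low, high))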
def get_file_features(data, header_data):
    age = header_data[0]
    sex = header_data[1]
    sample_Fs = header_data[2]
    num_leads = len(header_data) - 3
    gain_lead = np.zeros(num_leads)
    for ii in range(num_leads):
        gain_lead[ii] = header_data[3 + ii]

    for i in range(len(data)):
        peaks, idx = detect_peaks(data[i], sample_Fs, gain_lead[i])
        # mean
        mean_RR = np.mean(idx / sample_Fs * 1000)
        mean_Peaks = np.mean(peaks * gain_lead[i])
        # median (the RR median is hard-coded to 0 in this variant)
        median_RR = 0
        median_Peaks = np.median(peaks * gain_lead[i])
        # standard deviation
        std_RR = np.std(idx / sample_Fs * 1000)
        std_Peaks = np.std(peaks * gain_lead[i])
        # variance
        var_RR = stats.tvar(idx / sample_Fs * 1000)
        var_Peaks = stats.tvar(peaks * gain_lead[i])
        # skewness
        skew_RR = stats.skew(idx / sample_Fs * 1000)
        skew_Peaks = stats.skew(peaks * gain_lead[i])
        # kurtosis
        kurt_RR = stats.kurtosis(idx / sample_Fs * 1000)
        kurt_Peaks = stats.kurtosis(peaks * gain_lead[i])

        curfeatures = np.vstack([
            mean_RR, mean_Peaks, median_RR, median_Peaks, std_RR, std_Peaks,
            var_RR, var_Peaks, skew_RR, skew_Peaks, kurt_RR, kurt_Peaks
        ])
        # replace NaNs with zeros
        for j in range(len(curfeatures)):
            if np.isnan(curfeatures[j]):
                curfeatures[j] = 0

        if i == 0:
            lead_features_tmp = curfeatures
        else:
            lead_features_tmp = np.row_stack((lead_features_tmp, curfeatures))
    return lead_features_tmp
def main():
    f27_scan = open('sim_scan27.txt', 'r')
    f27_table = open('sim_table27.txt', 'r')
    f35932_scan = open('sim_scan35932.txt', 'r')
    f35932_table = open('sim_table35932.txt', 'r')
    ntests = 10
    scan27 = [0 for i in range(ntests)]
    table27 = [0 for i in range(ntests)]
    scan35932 = [0 for i in range(ntests)]
    table35932 = [0 for i in range(ntests)]
    files = [f27_scan, f27_table, f35932_scan, f35932_table]
    arrs = [scan27, table27, scan35932, table35932]
    for i in range(ntests):
        for j in range(4):
            line = files[j].readline().rstrip('\n')
            arrs[j][i] = float(line)
    for j in range(4):
        files[j].close()

    _, p27 = stats.ttest_ind(scan27, table27, equal_var=False)
    mean27scan = stats.tmean(scan27)
    mean27table = stats.tmean(table27)
    var27scan = stats.tvar(scan27)
    var27table = stats.tvar(table27)

    _, p35932 = stats.ttest_ind(scan35932, table35932, equal_var=False)
    mean35932scan = stats.tmean(scan35932)
    mean35932table = stats.tmean(table35932)
    var35932scan = stats.tvar(scan35932)
    var35932table = stats.tvar(table35932)

    f = open('sim_results_compare_scan_table.txt', 'w')
    f.write('27\n')
    f.write('scan mean: ' + str(mean27scan) + '\n')
    f.write('scan var: ' + str(var27scan) + '\n')
    f.write('table mean: ' + str(mean27table) + '\n')
    f.write('table var: ' + str(var27table) + '\n')
    f.write('p-value: ' + str(p27) + '\n\n')
    f.write('35932\n')
    f.write('scan mean: ' + str(mean35932scan) + '\n')
    f.write('scan var: ' + str(var35932scan) + '\n')
    f.write('table mean: ' + str(mean35932table) + '\n')
    f.write('table var: ' + str(var35932table) + '\n')
    f.write('p-value: ' + str(p35932) + '\n')
    f.close()
def process_CustomTstat(self, disease1, disease2, go):
    print(disease2)
    q1 = ("SELECT s.p_id,maf.exp from GOAnnotation ga inner join "
          "(probe pb,Diagnosis dg,disease ds, microarray_fact maf, sample s) "
          "on (ga.UID = pb.UID and pb.pb_id=maf.pb_id and dg.ds_id= ds.ds_id "
          "and dg.p_id=s.p_id and maf.s_id= s.s_id) "
          "where ga.go_id =\"" + go + "\" and ds.`name` =\"" + disease1 + "\"")
    q1_not = ("SELECT s.p_id,maf.exp from GOAnnotation ga inner join "
              "(probe pb,Diagnosis dg,disease ds, microarray_fact maf, sample s) "
              "on (ga.UID = pb.UID and pb.pb_id=maf.pb_id and dg.ds_id= ds.ds_id "
              "and dg.p_id=s.p_id and maf.s_id= s.s_id) "
              "where ga.go_id =\"" + go + "\" and ds.`name` !=\"" + disease1 + "\"")
    data1 = pandas.read_sql_query(q1, cnx)
    data1_not = pandas.read_sql_query(q1_not, cnx)
    a = data1['exp'].values
    b = data1_not['exp'].values
    print(stats.tmean(a))
    print(stats.tmean(b))
    print(stats.tvar(a))
    print(stats.tvar(b))
    if disease1 == disease2:
        tt = stats.ttest_ind(a, b, equal_var=True)
        return """<html>
        <form method="get" action="index">
        <button type="submit">Return</button>
        </form>
        <h2>T-statistics for Exp Values:</h2>""" + str(tt[0]) + """
        <h2>Corresponding p-value:</h2>""" + str(tt[1]) + """
        <h1>Exp values for patients with """ + disease1 + """<h3>(Rows-""" + str(len(data1.index)) + """)</h3></h1>""" + data1.to_html(index=False) + """
        <h1>Exp values for patients without """ + disease1 + """<h3>(Rows-""" + str(len(data1_not.index)) + """)</h3></h1>""" + data1_not.to_html(index=False) + """
        </html>"""
    else:
        q2 = ("SELECT s.p_id,maf.exp from GOAnnotation ga inner join "
              "(probe pb,Diagnosis dg,disease ds, microarray_fact maf, sample s) "
              "on (ga.UID = pb.UID and pb.pb_id=maf.pb_id and dg.ds_id= ds.ds_id "
              "and dg.p_id=s.p_id and maf.s_id= s.s_id) "
              "where ga.go_id =\"" + go + "\" and ds.`name` =\"" + disease2 + "\"")
        data2 = pandas.read_sql_query(q2, cnx)
        b = data2['exp'].values
        print(stats.tmean(a))
        print(stats.tmean(b))
        print(stats.tvar(a))
        print(stats.tvar(b))
        tt = stats.ttest_ind(a, b, equal_var=True)
        return """<html>
        <form method="get" action="index">
        <button type="submit">Return</button>
        </form>
        <h2>T-statistics for Exp Values:</h2>""" + str(tt[0]) + """
        <h2>Corresponding p-value:</h2>""" + str(tt[1]) + """
        <h1>Exp values for patients with """ + disease1 + """<h3>(Rows-""" + str(len(data1.index)) + """)</h3></h1>""" + data1.to_html(index=False) + """
        <h1>Exp values for patients with """ + disease2 + """<h3>(Rows-""" + str(len(data2.index)) + """)</h3></h1>""" + data2.to_html(index=False) + """
        </html>"""
def transform_input_shapes(matrix_as_array):
    bit = BasicImageTransformations(matrix_as_array)
    matrices_inverted = np.invert(bit.matrix_skeletonized)
    components = measure.label(matrices_inverted, connectivity=1, return_num=True)[1]
    image_pixels = convert_to_pixels_list(bit.matrix_skeletonized)
    image_pixels_as_pair_of_lists = transform_pixels_to_pair_of_lists(image_pixels)
    pixels_x = extract_dimension(image_pixels, 1)
    pixels_y = extract_dimension(image_pixels, 0)
    min_x = min(pixels_x)
    max_x = max(pixels_x)
    min_y = min(pixels_y)
    max_y = max(pixels_y)
    mean_x = stats.tmean(pixels_x)
    mean_y = stats.tmean(pixels_y)
    variance_x = stats.tvar(pixels_x)
    variance_y = stats.tvar(pixels_y)
    correlation = stats.pearsonr(image_pixels_as_pair_of_lists[0],
                                 image_pixels_as_pair_of_lists[1])[0]
    xxy = transform_pixels_with_function(
        image_pixels, lambda pixel: pixel[0] * pixel[1] * pixel[1])
    xyy = transform_pixels_with_function(
        image_pixels, lambda pixel: pixel[0] * pixel[0] * pixel[1])
    mean_xxy = stats.tmean(xxy)
    mean_xyy = stats.tmean(xyy)
    xy_tr2 = transform_pixels_with_function(
        image_pixels,
        lambda pixel: pixel[0] * pixel[1] * np.sin(pixel[0] / 2.0) * np.sin(pixel[1] / 2.0))
    xy_tr4 = transform_pixels_with_function(
        image_pixels,
        lambda pixel: pixel[0] * pixel[1] * np.sin(pixel[0] / 4.0) * np.sin(pixel[1] / 4.0))
    mean_xy_tr2 = stats.tmean(xy_tr2)
    mean_xy_tr4 = stats.tmean(xy_tr4)
    print(mean_xy_tr2)
    print(mean_xy_tr4)
    edges = feature.canny(bit.matrix_float)
    edges_for_y = edges.sum(axis=1)
    edges_for_x = edges.sum(axis=0)
    avg_edges_for_y = average_of_non_zero_elements(edges_for_y)
    avg_edges_for_x = average_of_non_zero_elements(edges_for_x)
    skew_x = stats.skew(pixels_x)
    skew_y = stats.skew(pixels_y)
    features = np.array([
        components, min_x, max_x, min_y, max_y, mean_x, mean_y, variance_x,
        variance_y, correlation, mean_xxy, mean_xyy, mean_xy_tr2, mean_xy_tr4,
        avg_edges_for_x, avg_edges_for_y, skew_x, skew_y
    ])
    return features
def calc(data_X, data_Y):
    """Calculate the mean, variance, correlation and linear regression."""
    print "x mean: ", np.mean(data_X)    # mean of X
    print "x variance: ", tvar(data_X)   # sample variance of X
    print "y mean: ", np.mean(data_Y)    # mean of Y
    print "y variance: ", tvar(data_Y)   # sample variance of Y
    print "x and y correlation coefficient: ", pearsonr(np.array(data_X), np.array(data_Y))[0]
    # Fit the linear regression model
    regr = linear_model.LinearRegression()
    regr.fit(np.array(data_X).reshape(-1, 1), np.array(data_Y).reshape(-1, 1))
    a, b = regr.coef_, regr.intercept_
    print("linear regression: y=%.2fx+%.2f" % (a, b))
def Quest4():
    global query
    global cnx
    data = pandas.read_sql_query(query['4a'], cnx)
    data2 = pandas.read_sql_query(query['4b'], cnx)
    a = data['exp'].values
    b = data2['exp'].values
    print(stats.tmean(a))
    print(stats.tmean(b))
    print(stats.tvar(a))
    print(stats.tvar(b))
    print(stats.ttest_ind(a, b, equal_var=True))
    return
def find_stats(df, dist_type, probNum=None):
    means = []
    sample_num = len(df.columns)  # a.k.a. cases
    sample_size = len(df)         # a.k.a. samples per case
    dist_type = (dist_type + ' Distribution - ' + str(sample_num) +
                 ' Cases that sample ' + str(sample_size) + ' numbers')
    print('number of cases:\t', sample_num, '\nsamples per case:\t', sample_size)
    print('\n')
    if probNum:
        temp = str(probNum * 100) + '%'
        print('Probability of Binomial distribution:\t', temp)

    # Loop through each sample and find its respective mean
    for i in range(sample_num):
        mean = round(df.iloc[0:, i].mean(), 3)  # mean of the generated random values
        means.append(mean)

    if sample_num == 1:
        # There is one sample, and we need more than one value to calculate std_dev.
        # std_dev = round(df.iloc[0:, 0].std(), 3)  # old way for just 1 sample
        mean_of_means = means[0]
        std_dev = round(df.iloc[0:, 0].mean().std(), 3)
        variance = round(df.iloc[0:, 0].mean().var(), 3)
        skewness = round(df.iloc[0:, 0].skew(), 3)
        kurtosis = round(df.iloc[0:, 0].kurtosis(), 3)

        # Turn the list of means into a numpy array
        numpy_means = np.array(means)
        fig = plt.subplot()
        fig.hist(numpy_means, bins=50, range=[0, 1], histtype='bar')
        fig.set_xlabel('Mean (Value)')
        fig.set_ylabel('Value Frequency')
        fig.set_title(dist_type)
        plt.show()
        return means, mean_of_means, variance, std_dev, skewness, kurtosis
    else:
        # Turn the list of means into a numpy array
        numpy_means = np.array(means)
        fig = plt.subplot()
        fig.hist(numpy_means, bins=50, range=[0, 1], histtype='bar')
        fig.set_xlabel('Mean (Value)')
        fig.set_ylabel('Value Frequency')
        fig.set_title(dist_type)
        plt.show()
        mean_of_means = round(numpy_means.mean(), 3)
        variance = round(stats.tvar(means), 3)
        std_dev = round(stdev(means), 3)
        skewness = round(stats.skew(means), 3)
        kurtosis = round(stats.kurtosis(means), 3)
        return means, mean_of_means, variance, std_dev, skewness, kurtosis
def zero_var(var_list, df, threshold):
    # Iterate over a copy: removing items from a list while iterating over
    # the same list skips elements.
    for col in list(var_list):
        if (stats.tvar(df[col]) == threshold) or (np.percentile(df[col], 90) == 0.00):
            var_list.remove(col)
    return var_list
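# Quick check of zero_var above (hypothetical column names and values, and it
# assumes the module imports numpy as np and scipy.stats as stats):
import pandas as pd

df = pd.DataFrame({
    "constant": [5.0, 5.0, 5.0, 5.0],  # zero sample variance -> dropped
    "varying": [1.0, 2.0, 3.0, 4.0],   # kept
})
print(zero_var(["constant", "varying"], df, threshold=0.0))  # ['varying']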
def GFPFeatureCreation(tempG):
    print("Starting Feature Creation")
    # Create a vertex x feature matrix: loop through all the vertices and
    # collect each vertex attribute into its own list.
    featuresCollection = [[], [], [], [], [], []]
    f = []
    for v in tempG.vertices():
        featuresCollection[0].append(tempG.vp.dp[v])
        featuresCollection[1].append(tempG.vp.lc[v])
        featuresCollection[2].append(tempG.vp.tHN[v])
        featuresCollection[3].append(tempG.vp.nCCP[v])
        featuresCollection[4].append(tempG.vp.pR[v])
        featuresCollection[5].append(tempG.vp.eV[v])
    for i in range(6):
        median = numpy.median(featuresCollection[i])
        mean = numpy.mean(featuresCollection[i])
        stdev = numpy.std(featuresCollection[i])
        skewness = stats.skew(featuresCollection[i])
        kurtosis = stats.kurtosis(featuresCollection[i])
        variance = stats.tvar(featuresCollection[i])
        maxVal = stats.tmax(featuresCollection[i])
        minVal = stats.tmin(featuresCollection[i])
        f += [median, mean, stdev, skewness, kurtosis, variance, maxVal, minVal]
    return f
def GFPFeatureCreation(tempG):
    print("Starting Feature Creation")
    # Create a vertex x feature matrix: collect each vertex property-map
    # array (with NaNs zeroed) into its own entry.
    featuresCollection = []
    f = []
    featuresCollection.append(np.array(np.nan_to_num(tempG.vp.dp.a)))
    featuresCollection.append(np.array(np.nan_to_num(tempG.vp.lc.a)))
    featuresCollection.append(np.array(np.nan_to_num(tempG.vp.tHN.a)))
    featuresCollection.append(np.array(np.nan_to_num(tempG.vp.nCCP.a)))
    featuresCollection.append(np.array(np.nan_to_num(tempG.vp.pR.a)))
    featuresCollection.append(np.array(np.nan_to_num(tempG.vp.eV.a)))
    for i in range(6):
        median = np.median(featuresCollection[i])
        mean = np.mean(featuresCollection[i])
        stdev = np.std(featuresCollection[i])
        skewness = stats.skew(featuresCollection[i])
        kurtosis = stats.kurtosis(featuresCollection[i])
        variance = stats.tvar(featuresCollection[i])
        maxVal = stats.tmax(featuresCollection[i])
        minVal = stats.tmin(featuresCollection[i])
        f += [median, mean, stdev, skewness, kurtosis, variance, maxVal, minVal]
    return f
def ss_within(cls, *args):
    """
    Get the sum of squared deviations of each value from its group mean.
    """
    try:
        return sum((len(a) - 1) * stats.tvar(a) for a in args)
    except TypeError:
        # A bare `except:` here would also swallow unrelated errors.
        raise TypeError('Expected only lists or tuples')
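# Sketch of what ss_within computes: the one-way ANOVA within-group ("error")
# sum of squares, i.e. the sum over groups of (n_i - 1) * sample variance.
# The example groups below are made up.
from scipy import stats

g1 = [1.0, 2.0, 3.0]
g2 = [2.0, 4.0, 6.0]
ss_w = sum((len(g) - 1) * stats.tvar(g) for g in (g1, g2))
print(ss_w)  # 2*1.0 + 2*4.0 = 10.0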
def _badPixMap(self, clip=30, filename='badpix.dmp'):
    median = np.median(self.image)
    var = tvar(self.image, (-100, 100))
    self.badpix = ma.masked_greater(self.image - median, clip * np.sqrt(var))
    if filename is not None:
        self.badpix.dump(filename)
def overlap_variance(resList, anchors, rad, world_size, excludeDesert=True):
    niches = niche_analysis(resList, anchors, rad, world_size, excludeDesert)
    vals = []
    for key in niches.keys():
        vals.append(len(key) * (niches[key] / float(world_size * world_size)))
    return stats.tvar(vals)
def ownCorrelationMeasure(self, X, Y):
    # Group X-values into categories with their respective set of Y-values
    groups = {}
    for i in range(len(X)):
        key = X[i]
        value = Y[i]
        if key in groups:
            groups[key] += [value]
        else:
            groups[key] = [value]

    # Calculate a normal distribution (mean, variance) for every X-value
    normal_distributions = {}
    # normal_distributions_old = {}
    for x in groups.keys():
        # normal_distributions_old[x] = stats.norm.fit(groups[x])
        if len(groups[x]) > 1:
            normal_distributions[x] = (stats.tmean(groups[x]), stats.tvar(groups[x]))
        else:
            normal_distributions[x] = (groups[x][0], 0)

    # Calculate the correlation measure
    max_dist = max(normal_distributions.values())
    min_dist = min(normal_distributions.values())
    correlation = max_dist[0] / min_dist[0]  # ratio between the max and min means
    return [correlation, normal_distributions]
def test_calculate_variance(self):
    sample = []
    for i in range(0, 100):
        sample.append(random())
    var = self.stat.calculate_variance(sample, self.stat.calculate_mean(sample))
    control = tvar(sample)
    self.assertAlmostEqual(var, control)
def print_and_plot_results(count, results, verbose, plot_file_name):
    print("RPS calculated as 95% confidence interval")

    rps_mean_ar = []
    low_ar = []
    high_ar = []
    test_name_ar = []
    for test_name in sorted(results):
        data = results[test_name]
        rps = count / array(data)
        rps_mean = tmean(rps)
        rps_var = tvar(rps)
        low, high = norm.interval(0.95, loc=rps_mean, scale=rps_var**0.5)
        times = array(data) * 1000000 / count
        times_mean = tmean(times)
        times_stdev = tstd(times)
        print('Results for', test_name)
        print('RPS: {:d}: [{:d}, {:d}],\tmean: {:.3f} μs,'
              '\tstandard deviation {:.3f} μs'
              .format(int(rps_mean), int(low), int(high), times_mean, times_stdev))
        test_name_ar.append(test_name)
        rps_mean_ar.append(rps_mean)
        low_ar.append(low)
        high_ar.append(high)
        if verbose:
            print(' from', times)
        print()

    if plot_file_name is not None:
        import matplotlib.pyplot as plt
        from matplotlib import cm
        fig = plt.figure()
        ax = fig.add_subplot(111)
        L = len(rps_mean_ar)
        color = [cm.autumn(float(c) / (L - 1)) for c in arange(L)]
        bars = ax.bar(arange(L), rps_mean_ar, color=color,
                      yerr=(low_ar, high_ar), ecolor='k')
        # order of legend is reversed for visual appeal
        ax.legend(reversed(bars), reversed(test_name_ar), loc='upper left')
        ax.get_xaxis().set_visible(False)
        plt.ylabel('Requests per Second', fontsize=16)
        print(plot_file_name)
        plt.savefig(plot_file_name, dpi=96)
        print("Plot is saved to {}".format(plot_file_name))
        if verbose:
            plt.show()
def sasha_chisquared(expec, observ):
    tempnum = (expec - observ)**2
    thevar = stats.tvar(observ)
    MeanSquaredError = np.sum(tempnum) / (len(expec) - 2)
    RootMeanSquaredError = np.sqrt(MeanSquaredError)
    tempnum /= thevar
    tempdenom = len(expec) - 2
    tempreturn = np.sum(tempnum) / tempdenom
    return [tempreturn, RootMeanSquaredError]
def sasha_chisquared(expec, observ, degree_of_freedom=2):
    tempnum = (expec - observ)**2
    thevar = stats.tvar(observ)
    # Note: the MSE denominator is still hard-coded to 2 degrees of freedom,
    # while the chi-squared denominator below uses the parameter.
    MeanSquaredError = np.sum(tempnum) / (len(expec) - 2)
    RootMeanSquaredError = np.sqrt(MeanSquaredError)
    tempnum /= thevar
    tempdenom = len(expec) - degree_of_freedom
    tempreturn = np.sum(tempnum) / tempdenom
    return [tempreturn, RootMeanSquaredError]
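# Usage sketch for sasha_chisquared above (made-up fit-vs-data example, and it
# assumes the module imports numpy as np and scipy.stats as stats). The
# statistic is a reduced chi-square with the sample variance of the
# observations standing in for per-point measurement errors.
import numpy as np

x = np.linspace(0.0, 9.0, 10)
noise = np.array([0.1, -0.2, 0.15, 0.0, -0.1, 0.2, -0.15, 0.05, -0.05, 0.1])
observ = 2.0 * x + noise
expec = 2.0 * x
red_chi2, rmse = sasha_chisquared(expec, observ, degree_of_freedom=2)
print(red_chi2, rmse)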
def sasha_slope_error(expec, observ, x_observ):
    tempnum = (expec - observ)**2
    tempnum = np.sum(tempnum)
    tempnum /= (len(expec) - 2)
    tempnum = np.sqrt(tempnum)
    thevar = stats.tvar(x_observ)
    thestdev = np.sqrt(thevar)
    tempnum /= thestdev
    return tempnum
def plot(self, jobid, job_data=None):
    if not self.setup(jobid, job_data=job_data):
        return
    ts = self.ts
    host_cpi = {}
    host_names = sorted(ts.data[0].keys())
    for v in host_names:
        ncores = len(ts.data[0][v])
        num = 0
        den = 0
        for k in range(ncores):
            ratio = nan_to_num(diff(ts.data[0][v][k]) / diff(ts.data[1][v][k]))
            try:
                cpi = vstack((cpi, ratio))
            except:
                cpi = array([ratio])
            num += diff(ts.data[0][v][k])
            den += diff(ts.data[1][v][k])
        host_cpi[v] = tmean(nan_to_num(num / den))

    mean_cpi = tmean(host_cpi.values())
    if len(host_cpi.values()) > 1:
        var_cpi = tvar(host_cpi.values())
    else:
        var_cpi = 0.0

    self.fig = Figure(figsize=(10, 12), dpi=110)
    self.ax = self.fig.add_subplot(1, 1, 1)

    ycore = arange(cpi.shape[0] + 1)
    time = ts.t / 3600.
    yhost = arange(len(host_cpi.keys()) + 1) * ncores + ncores

    fontsize = 8
    set_printoptions(precision=4)
    if len(yhost) > 80:
        fontsize /= 0.5 * log(len(yhost))

    self.ax.set_ylim(bottom=ycore.min(), top=ycore.max())
    self.ax.set_yticks(yhost[0:-1] - ncores / 2.)
    self.ax.set_yticklabels(
        [key + '(' + "{0:.2f}".format(host_cpi[key]) + ')' for key in host_names],
        fontsize=fontsize)
    self.ax.set_xlim(left=time.min(), right=time.max())

    pcm = self.ax.pcolor(time, ycore, cpi, vmin=0.0, vmax=5.0)
    pcm.cmap = cm.get_cmap('jet_r')

    try:
        self.ax.set_title(
            self.k2[ts.pmc_type][0] + '/' + self.k2[ts.pmc_type][1] + '\n' +
            r'Mean(Std)=' + '{0:.2f}'.format(mean_cpi) +
            r'({0:.2f})'.format(sqrt(var_cpi)))
    except:
        self.ax.set_title(
            self.k2[0] + '/' + self.k2[1] + '\n' +
            r'$\bar{Mean}$=' + '{0:.2f}'.format(mean_cpi) +
            r'$\pm$' + '{0:.2f}'.format(sqrt(var_cpi)))

    self.fig.colorbar(pcm)
    self.ax.set_xlabel('Time (hrs)')
    self.output('heatmap')
def more_constraint_stats(constraint_dist):
    nan_count = sum(math.isnan(x) for x in constraint_dist)
    one_count = constraint_dist.count(1)
    half_count = constraint_dist.count(0.5)
    filtered = [x for x in constraint_dist
                if (not math.isnan(x)) and x != 0.5 and x != 1]
    return (nan_count, one_count, half_count, stats.tmean(filtered),
            stats.tvar(filtered), stats.skew(filtered), stats.kurtosis(filtered))
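# Usage sketch for more_constraint_stats above (made-up distribution with the
# NaN, 1 and 0.5 values that the function counts and filters out; assumes the
# module imports math and scipy.stats as stats):
dist = [float('nan'), 1, 1, 0.5, 0.2, 0.3, 0.4, 0.25]
nan_n, one_n, half_n, mean, var, skw, kurt = more_constraint_stats(dist)
print(nan_n, one_n, half_n)  # 1 2 1
print(mean, var)             # stats over [0.2, 0.3, 0.4, 0.25]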
def confidence_interval(errors):
    # tvar is the sample variance
    from scipy.stats import norm, tvar
    import math

    mu = sum(errors) / float(len(errors))
    var = tvar(errors)
    std_dev = math.sqrt(var)
    std_error = std_dev / math.sqrt(len(errors))
    span_95 = norm.interval(0.95, loc=mu, scale=std_error)
    return span_95
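# Usage sketch for confidence_interval above (made-up residuals; a 95% normal
# interval for the mean error based on the standard error of the mean):
errors = [0.1, -0.2, 0.05, 0.3, -0.1, 0.15, -0.05, 0.2]
low, high = confidence_interval(errors)
print("mean error in [{:.4f}, {:.4f}] with 95% confidence".format(low, high))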
def var_truncNormal(a, b, mu, sigma, data, mod=3000.0):
    # Estimate sigma from the variance of the data restricted to
    # (mu - mod, mu + mod), correcting for truncation at a and b.
    x1 = (a - mu) / sigma * stats.norm.pdf(a, mu, sigma)
    x2 = (b - mu) / sigma * stats.norm.pdf(b, mu, sigma)
    cx = stats.norm.cdf(b, mu, sigma) - stats.norm.cdf(a, mu, sigma)
    yhat = stats.tvar(data, limits=[mu - mod, mu + mod], inclusive=(False, False))
    sigma2 = yhat / (1 + (x1 - x2) / cx - ((x1 - x2) / cx)**2)
    sigma = scipy.sqrt(sigma2)
    return sigma
def test_calculate_incremental_variance(self):
    control_sample = []
    sample = []
    var = 0
    for i in range(0, 20):
        for _ in range(0, 100):
            elem = random()
            sample.append(elem)
            control_sample.append(elem)
        var = self.stat.calculate_incremental_variance(sample)
        sample.clear()
    control = tvar(control_sample)
    self.assertAlmostEqual(var, control)
def learn(tableau_final, metric_seuil, time_max, learning_rate):
    sigma1 = np.sqrt(stats.tvar(data["GasCum360"]))
    sigma2 = np.sqrt(stats.tvar(data["OilCum360"]))
    t0 = time.time()
    u, v, x, y = 0.5, 0.5, 0.5, 0.5
    tableau_final["Gas360_SUP"] = tableau_final["GasCum360"] + u * sigma1
    tableau_final["Gas360_INF"] = tableau_final["GasCum360"] - v * sigma1
    tableau_final["Oil360_SUP"] = tableau_final["OilCum360"] + x * sigma2
    tableau_final["Oil360_INF"] = tableau_final["OilCum360"] - y * sigma2
    X, Y = [], []
    while metric(tableau_final) > metric_seuil:
        if time.time() - t0 > time_max:
            break
        u += learning_rate
        v += learning_rate
        x += learning_rate
        y += learning_rate
        tableau_final["Gas360_SUP"] = tableau_final["GasCum360"] + u * sigma1
        tableau_final["Gas360_INF"] = tableau_final["GasCum360"] - v * sigma1
        tableau_final["Oil360_SUP"] = tableau_final["OilCum360"] + x * sigma2
        tableau_final["Oil360_INF"] = tableau_final["OilCum360"] - y * sigma2
        m = metric(tableau_final)
        X.append(u)
        Y.append(m)
        print(u)
        print(m)
        if u > 1:
            break
    plt.plot(X, Y)
    plt.show()
    minY = min(Y)
    minX = 0
    for j, value in enumerate(Y):
        if value == minY:
            minX = X[j]
    print([minY, minX])
    tableau_final["Gas360_SUP"] = tableau_final["GasCum360"] + minX * sigma1
    tableau_final["Gas360_INF"] = tableau_final["GasCum360"] - minX * sigma1
    tableau_final["Oil360_SUP"] = tableau_final["OilCum360"] + minX * sigma2
    tableau_final["Oil360_INF"] = tableau_final["OilCum360"] - minX * sigma2
    return (minY, minX)
def random_sample(frame: pd.DataFrame, samples_total=300, sample_size=30, var=False):
    sample_stats = []
    for a_sample in range(samples_total):
        # pick sample_size random rows and collect them in the samples list
        samples = []
        for elt in range(sample_size):
            samples.append(frame.iloc[floor(frame.shape[0] * random.random())])
        if var:
            sample_stats.append(stats.tvar(samples))
        else:
            sample_stats.append(stats.tmean(samples))
    return sample_stats
def getLineScoreStats(df, lineScoreCol, histScoreCol, binNumber=50):
    '''Return a DataFrame of line score stats for each bin.
    The relevant one is probably the mean.'''
    D = {}
    binnedScores = binLineScore(df, lineScoreCol, histScoreCol, binNumber)
    for bin in binnedScores:
        L = binnedScores[bin]
        if len(L) <= 1:
            # A single score has no spread; store it with zero var/stdev.
            D[bin] = {"mean": L[0], "var": 0, "stanDev.": 0}
            continue
        mean = stats.tmean(L)
        var = stats.tvar(L)
        stanD = stats.tstd(L)
        D[bin] = {"mean": mean, "var": var, "stanDev.": stanD}
    return pd.DataFrame(D).T
def calculateStats(data):
    """
    Calculate statistics on a numeric array and return them in a dictionary
    @ In, data, list or numpy.array, the data
    @ Out, ret, dict, the dictionary containing the stats
    """
    ret = {}
    ret["mean"] = np.mean(data)
    ret["variance"] = np.var(data)
    ret["sampleVariance"] = stats.tvar(data)
    ret["stdev"] = stats.tstd(data)
    ret["skewness"] = stats.skew(data)
    ret["kurtosis"] = stats.kurtosis(data)
    return ret
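# Usage sketch for calculateStats above, illustrating that "variance" is the
# population variance (ddof=0) while "sampleVariance" (scipy.stats.tvar) uses
# the ddof=1 normalization. Data values are made up; assumes the module's
# np/scipy.stats imports.
data = [2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]
s = calculateStats(data)
print(s["variance"])        # 4.0
print(s["sampleVariance"])  # ~4.571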
def calc_channel_hist(img):
    chans = cv2.split(img)
    # print(chans)
    mean = []
    kurtosis = []
    variance = []
    skew = []
    for chan in chans:
        # Calculate the histogram
        histo = cv2.calcHist([chan], [0], None, [100], [0, 256])
        # Normalize the histogram
        hist_length = sum(histo)
        hist = [float(h) / hist_length for h in histo]
        # print(ss.describe(hist))
        skew.append(sp.skew(hist)[0])
        kurtosis.append(sp.kurtosis(hist)[0])
        mean.append(sp.tmean(hist))
        variance.append(sp.tvar(hist))
    return mean, variance, kurtosis, skew
def plotPDF(self):
    print "plot the PDF stuffs"
    figure(1)
    print shape(self.gather)
    for each in self.gather:
        print len(each)
    smoothness = 75
    kde = []
    distSpace = []
    p = []
    txt = []
    for i in range(3):
        kde.append(list())
        distSpace.append(list())
        p.append(list())
        txt.append(list())
    lbl = [5, 10, 15]
    distxx = [0.25, 0.50, .75]
    for kd, dS, gat, pl, lb in zip(kde, distSpace, self.gather, p, lbl):
        kd = gaussian_kde(gat)
        dS = linspace(min(gat), max(gat), smoothness)
        pl = plt.plot(dS, kd(dS), label="%s units from source" % lb)
    title("Probability density function of plume down stream from source")
    mean = 12
    variance = 1
    sigma = np.sqrt(variance)
    x = linspace(9, 15, 100)
    plt.plot(x, mlab.normpdf(x, mean, sigma), label="normal distribution")
    for lb, t, gat, dis in zip(lbl, txt, self.gather, distxx):
        t = ("%s units from source:\nskew: %4.4f\nvariance: %4.4f"
             % (lb, ss.skew(gat), ss.tvar(gat)))
        xloc = xlim()[0] + 0.15 * diff(xlim())
        yloc = ylim()[0] + dis * diff(ylim())
        text(xloc, yloc, t)
    plt.legend()
    plt.show()
def daubtran(self, event):
    # Daubechies D4 wavelet coefficients
    h0 = (1 + m.sqrt(3)) / (4 * m.sqrt(2))
    h1 = (3 + m.sqrt(3)) / (4 * m.sqrt(2))
    h2 = (3 - m.sqrt(3)) / (4 * m.sqrt(2))
    h3 = (1 - m.sqrt(3)) / (4 * m.sqrt(2))
    g0 = h3
    g1 = -h2
    g2 = h1
    g3 = -h0
    a = self.current_signal_val
    n = len(self.current_signal_val)
    print self.current_plot1_txt
    if n >= 4:
        half = n >> 1
        tmp = [0] * n
        i = 0
        j = 0
        while j < n - 3:
            tmp[i] = a[j]*h0 + a[j+1]*h1 + a[j+2]*h2 + a[j+3]*h3
            tmp[i+half] = a[j]*g0 + a[j+1]*g1 + a[j+2]*g2 + a[j+3]*g3
            j += 2
            i += 1
        # wrap around for the last pair of coefficients
        tmp[i] = a[n-2]*h0 + a[n-1]*h1 + a[0]*h2 + a[1]*h3
        tmp[i+half] = a[n-2]*g0 + a[n-1]*g1 + a[0]*g2 + a[1]*g3
        self.current_daub_value = tmp
        self.draw_plot2(self.current_daub_value, "daubechies plot")
        self.current_mean_value = np.mean(self.current_daub_value)
        self.current_median_value = np.median(self.current_daub_value)
        self.current_mode_value = int(st.mode(self.current_daub_value)[0])
        self.current_kurtosis_value = st.kurtosis(self.current_daub_value)
        self.current_skew_value = st.skew(self.current_daub_value)
        self.current_variance_value = st.tvar(self.current_daub_value)
        self.st3.SetLabel("mean: " + str(self.current_mean_value))
        self.st4.SetLabel("median: " + str(self.current_median_value))
        self.st5.SetLabel("mode: " + str(self.current_mode_value))
        self.st6.SetLabel("kurtosis: " + str(self.current_kurtosis_value))
        self.st7.SetLabel("skew: " + str(self.current_skew_value))
        self.st8.SetLabel("variance: " + str(self.current_variance_value))
def info(var_list, df):
    info_var = dict()
    mean = []
    Var = []
    mode = []
    range_ = []
    for v in var_list:
        mean.append(df[v].mean())
        Var.append(stats.tvar(df[v]))
        mode.append(stats.mode(df[v]))
        range_.append([df[v].min(), df[v].max()])
    info_var['Mean'] = mean
    info_var['Var'] = Var
    info_var['Mode'] = mode
    info_var['Range'] = range_
    return pd.DataFrame(info_var, index=var_list)
def autocorr(x, k, SE=False):
    c0 = stats.tvar(x)   # sample variance
    mu = stats.tmean(x)  # sample mean
    r_arr = [1]
    T = float(len(x))
    for j in range(1, k + 1):
        T1 = int(T - j)
        cj = 0.0
        for i in range(T1):
            cj += (x[i] - mu) * (x[i + j] - mu)
        cj = cj / T
        rj = cj / c0
        r_arr.append(rj)
    SEk = autocorrSE(r_arr, k, T)
    tval = r_arr[-1] / SEk
    pval = 1 - stats.norm.cdf(tval)
    if SE:
        return r_arr, SEk
    else:
        return r_arr, pval
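# Standalone sketch of the same estimator as autocorr above (autocorrSE is
# defined elsewhere in that project, so this version skips the significance
# test): r_j = c_j / c_0, with c_j the lag-j autocovariance normalized by T.
import numpy as np
from scipy import stats

def acf(x, k):
    x = np.asarray(x, dtype=float)
    T = len(x)
    mu = stats.tmean(x)
    c0 = stats.tvar(x)
    return [1.0] + [np.sum((x[:T - j] - mu) * (x[j:] - mu)) / T / c0
                    for j in range(1, k + 1)]

print(acf([1.0, 2.0, 3.0, 4.0, 5.0, 4.0, 3.0, 2.0], k=2))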
def plotPDFandData(self):
    smoothness = 75
    kde7 = gaussian_kde(self.seventeens)
    dist_space7 = np.linspace(min(self.seventeens), max(self.seventeens), smoothness)
    p7 = plt.plot(dist_space7, kde7(dist_space7), label="10 units from source")
    pReal = plt.hist(self.seventeens, 200)
    txt7 = ("seventeens:\nskew: %4.4f\nkurtosis: %4.4f\nvariance: %4.4f"
            % (ss.skew(self.seventeens), ss.kurtosis(self.seventeens),
               ss.tvar(self.seventeens)))
    xloc = xlim()[0] + 0.15 * np.diff(xlim())
    yloc = ylim()[0] + 0.50 * np.diff(ylim())
    text(xloc, yloc, txt7)
    plt.legend()
    plt.show()
K = 1
Asian_Price = 0.0
dt = 0.0001
asian_prices = []
for i in range(n):
    S = S0
    S_a = S0
    aver = S0
    Ka = S0
    for j in range(0, T * 10000):
        mean = (r - 0.5 * (o**2))
        rand = random.gauss(mean * dt, o * math.sqrt(dt))
        S = S * math.exp(rand)
        # antithetic path
        S_a = S_a * math.exp((2 * mean * dt) - rand)
        aver = (S + S_a) / 2.0
        Ka = Ka + aver
    Ka = Ka / (T / dt)
    if Ka < aver:
        Asian_Price = Asian_Price + (aver - Ka) * math.exp(-r * T)
        asian_prices.append((aver - Ka) * math.exp(-r * T))
    else:
        asian_prices.append(0.0)
    if i % 100 == 0:
        print i
print "dt = 0.0001 uSa = " + str(Asian_Price / n)
print "mean = " + str(stats.tmean(asian_prices))
print "error = " + str(math.sqrt(stats.tvar(asian_prices)) / math.sqrt(float(n)))
def PDM(times, fluxes, frequencies, numberOfBins=10, binWidth=0.1):
    """
    Perform phase dispersion minimization.
    Need to add an option for flux errors.
    """
    # Offset the time array to make t0 = 0
    zeroPoint = times[0]
    times -= zeroPoint

    # Total number of data points, frequencies
    numberOfData = len(times)
    numberOfFrequencies = len(frequencies)

    # Calculate the width used to center the bins
    widthForCenter = 1 / float(numberOfBins)

    dispersions = np.zeros(len(frequencies))

    # Loop through the frequencies
    for iFrequency in range(numberOfFrequencies):
        # Number of points and variance per bin
        numPoints = np.zeros(numberOfBins)
        binVariance = np.zeros(numberOfBins)

        # Fold the times on frequencies[iFrequency]; sort phases and fluxes
        sortedPhases, sortedFluxes = FoldTimes(times, fluxes, frequencies[iFrequency])
        overallVariance = stats.tvar(sortedFluxes)

        for iBin in range(numberOfBins):
            # Use binWidth to determine the min/max values of the bin
            binCenter = (iBin + 1) * widthForCenter - 0.5 * widthForCenter
            binMin = binCenter - 0.5 * binWidth
            binMax = binCenter + 0.5 * binWidth

            # Pick out fluxes whose phase lies between binMin and binMax,
            # accounting for bins with phases < 0 and > 1
            sample = sortedFluxes[np.where(np.logical_or(
                np.logical_or(
                    np.logical_and(sortedPhases < binMax, sortedPhases >= binMin),
                    np.logical_and(sortedPhases - 1 < binMax, sortedPhases - 1 >= binMin)),
                np.logical_and(sortedPhases + 1 < binMax, sortedPhases + 1 >= binMin)))]
            numPoints[iBin] = len(sample)

            # Calculate the variance of the individual bin
            if numPoints[iBin] > 1:
                binVariance[iBin] = stats.tvar(sample)
            else:
                binVariance[iBin] = 0.

        # Calculate the pooled variance over the bins
        numerator = 0.
        denominator = 0.
        for iBin in range(numberOfBins):
            numerator += (float(numPoints[iBin]) - 1) * binVariance[iBin]
            denominator += float(numPoints[iBin])
        denominator -= numberOfBins
        sampleVariance = numerator / denominator

        # Dispersion measure for this frequency
        dispersions[iFrequency] = sampleVariance / overallVariance

    return dispersions
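# PDM above depends on a FoldTimes helper that is not shown in this section.
# Below is a plausible minimal stand-in (phases in [0, 1), sorted by phase)
# plus a synthetic run; the helper and the data are assumptions, not the
# original implementation.
import numpy as np

def FoldTimes(times, fluxes, frequency):
    phases = (np.asarray(times) * frequency) % 1.0
    order = np.argsort(phases)
    return phases[order], np.asarray(fluxes)[order]

rng = np.random.default_rng(1)
t = np.sort(rng.uniform(0, 50, 300))
f = np.sin(2 * np.pi * 0.2 * t) + 0.1 * rng.normal(size=300)
freqs = np.linspace(0.05, 0.5, 46)
d = PDM(t, f, freqs)
print(freqs[np.argmin(d)])  # should sit near the true frequency, 0.2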
if rules == 1:
    profit_list, profit, status_list, entry_list, exit_list, entry_price_list, exit_price_list = trade_stock_1(
        mv_cp[choice], m_op.data[choice], m_cp.data[choice], varloss, vargain, N=N, alpha=alpha)
elif rules == 2:
    profit_list, profit, status_list, entry_list, exit_list, entry_price_list, exit_price_list = trade_stock_2(
        mv_cp[choice], m_op.data[choice], m_cp.data[choice], vargain, N=N, alpha=alpha)
elif rules == 3:
    profit_list, profit, status_list, entry_list, exit_list, entry_price_list, exit_price_list = trade_stock_3(
        mv_cp[choice], m_op.data[choice], m_cp.data[choice], varloss, N=N, alpha=alpha)
elif rules == 4:
    profit_list, profit, status_list, entry_list, exit_list, entry_price_list, exit_price_list = trade_stock_4(
        mv_cp[choice], m_op.data[choice], m_cp.data[choice], N=N, alpha=alpha)

total_profit[choice] = profit
temp = [i for i in status_list if i != "none"]
if len(temp) != len(profit_list):
    temp = temp[0:len(temp) - 1]
no_profit_trade[choice] = len([i for i in profit_list if i > 0])
no_loss_trade[choice] = len(profit_list) - no_profit_trade[choice]
no_long_trade[choice] = len([i for i in temp if i == "long"])
no_short_trade[choice] = len([i for i in temp if i == "short"])
no_long_profit[choice] = len([i for i in range(len(temp)) if (temp[i] == "long") and (profit_list[i] > 0)])
no_short_profit[choice] = len([i for i in range(len(temp)) if (temp[i] == "short") and (profit_list[i] > 0)])
no_long_loss[choice] = no_long_trade[choice] - no_long_profit[choice]
no_short_loss[choice] = no_short_trade[choice] - no_short_profit[choice]
test_stats[choice] = sps.tmean(profit_list) * len(profit_list) / np.power(sps.tvar(profit_list), 0.5)

end = time.time()
duration = end - start
print("The total time is {0} minutes".format(duration / 60))

allresult = np.c_[total_profit, no_profit_trade, no_loss_trade, no_long_trade,
                  no_short_trade, no_long_profit, no_short_profit,
                  no_long_loss, no_short_loss, test_stats]
s = "\n".join([m_cp.header[j] + "," + ",".join(["{0}".format(i) for i in allresult[j]])
               for j in range(0, K)])
s = (",total_profit,no_profit_trade,no_loss_trade,no_long_trade,no_short_trade,"
     "no_long_profit,no_short_profit,no_long_loss,no_short_loss,test_stats\n" + s)
f = open(filename, "w")
f.write(s)
f.close()
def cross_validation(transactions, sample_pct=0.50, support=-3, all_frequent_items=None):
    from fim import fpgrowth
    """
    Cross validation, 'old' version not using the compact triangle
    representation from Forward.
    """
    # init
    _id = str(time()).replace('.', '')
    # if all_frequent_items is None:
    #     all_frequent_items = fpgrowth(transactions, supp=support, min=1, max=3)
    cv_start = time()

    print "\n### Running cross validation {}###".format(_id)
    print "Total transactions:{}".format(len(transactions))
    # print "Total frequent items:{}".format(len(all_frequent_items))

    # run results
    avg_errors = []
    var_errors = []
    # all_triangles, all_triples = filter_items(all_frequent_items)

    for chunk, index, rest in chunks(transactions, int(len(transactions) * sample_pct)):  # TODO insert proper sampling
        all_frequent_items = fpgrowth(rest, supp=support, min=1, max=3)
        all_triangles, all_triples = Forward.forward(all_frequent_items)

        # Get triples for estimates
        frequent_items = fpgrowth(chunk, supp=support, min=1, max=3)
        if len(frequent_items) > 0:
            print 'frequent items: {}'.format(len(frequent_items))
        else:
            print 'No frequent items in chunk: {}'.format(index)
            continue

        triangles, triples = Forward.forward(frequent_items)
        print 'triangles: {}'.format(len(triangles))

        estimates = []
        observations = []
        abs_errors = []
        max_est = 0
        max_obs = 0
        for (s1, s2, s3, s12, s23, s13, s123) in triangles:
            # if s123[1] != 0:
            #     continue

            # Max-ent estimate from the sample. Index [1] of the tuples
            # holds the occurrences in the sample.
            est = ent.maxent_est_rosa(
                s1[1], s2[1], s3[1], s12[1], s23[1], s13[1],
                float(len(transactions) - len(chunk)),
                num=int(math.log(len(transactions), 2)) + 1)

            # maximum estimate seen (for plotting)
            max_est = max(max_est, est)
            # record the estimate
            estimates.append(est)

            # from all observed triples, get the actual observed number of triples
            observed = 0
            if all_triples.has_key(s123[0]):
                observed = all_triples[s123[0]]

            # maximum observation of the triple (for plotting)
            max_obs = max(max_obs, observed)
            # record the observed
            observations.append(observed)

            # record the absolute error
            error = abs(observed - est) / float(observed) * 100
            abs_errors.append(error)

        if len(abs_errors) > 0:  # TODO handle this, probably when nothing has been found
            # evaluation
            min_error = min(abs_errors)
            max_error = max(abs_errors)
            avg_error = sum(abs_errors) / float(len(abs_errors))
            avg_errors.append(avg_error)
            var_error = 0
            if len(abs_errors) > 1:
                var_error = tvar(abs_errors)  # tvar is the sample variance
            var_errors.append(var_error)

            # TODO histogram of the average errors: max-ent, extrapolation, heuristic
            # TODO print the average error of the average errors to the log
            res_string = "\nResult:\nSample size:{} min_error:{} max_error:{} avg_error:{} var_error:{}".format(
                len(chunk), min_error, max_error, avg_errors[-1], var_error)
            print res_string
        else:
            print 'No abs errors!'

    print "Cross validation done!"
    print "time: ", (time() - cv_start)
    total_avg_error = sum(avg_errors) / float(len(avg_errors))
    total_res_string = "Avg error:{}".format(total_avg_error)
    return path
#!/usr/bin/env python
# -*- coding:utf8 -*-
'''
TODO understand this paragraph:
NumPy is a language extension that defines numeric array and matrix types
and their basic operations.
SciPy is another language extension that builds on NumPy for higher
mathematics, signal processing, optimization, statistics and many other
scientific tasks.
Matplotlib is a language extension that helps with plotting.
'''
# Let's tackle scientific computing
import numpy
from scipy import stats

XXX_ar = stats.pearsonr([XXX])
print stats.tvar(XXX_ar), stats.tstd(XXX_ar), stats.tmean(XXX_ar)
# Pearson product-moment coefficient
print stats.pearsonr(XXX_LISTA, XXX_LISTB)

print numpy.log2(1024)
print numpy.log10(0)
print numpy.log(XXX)  # it's ln
print numpy.exp(1)
print numpy.e, numpy.pi
def test_tvarX(self):
    y = stats.tvar(X, (2, 8), (True, True))
    assert_almost_equal(y, 4.6666666666666661)
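# The constant in test_tvarX comes from tvar's truncation semantics: with
# limits (2, 8) inclusive on both sides, only the values 2..8 enter the
# sample-variance (ddof=1) calculation. A quick demonstration on made-up data:
import numpy as np
from scipy import stats

x = np.arange(11)  # 0..10
print(stats.tvar(x, limits=(2, 8), inclusive=(True, True)))  # 4.666...
print(np.var(np.arange(2, 9), ddof=1))                       # same value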
Description: Source code 1 from a course on bootstrapping. It demonstrates how
to estimate a mean and its standard error by bootstrapping.
"""
###############################################################################

import numpy as np
import numpy.random as npr
import scipy.stats as sps
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd

N = 1000  # initial sample size
B = 500   # number of bootstrap samples, i.e. replications
m = 3     # true mean of the data
s = 2     # true standard deviation of the data

data = sps.norm.rvs(size=N, loc=m, scale=s)  # generate the random sample from a normal
mhat = sps.tmean(data)   # sample mean estimate
shat2 = sps.tvar(data)   # sample variance estimate

bootsample = [npr.choice(data, size=N, replace=True) for i in range(0, B)]  # generate B bootstrap samples
bootmean = [sps.tmean(j) for j in bootsample]

plt.hist(bootmean, bins=int(B / 10))
plt.show()

columns = ['True', 'Estimated', 'Bootstrap']
index = ['mean', 'variance']
result = [[m, mhat, sps.tmean(bootmean)],
          [np.power(s, 2) / N, shat2 / N, sps.tvar(bootmean)]]
result = np.array(result)
resultpd = pd.DataFrame(result, columns=columns, index=index)
print(resultpd)
N = 1000  # sample size
B = 500   # bootstrap sample size, i.e. number of replications
a = 1     # true intercept
b = 0.5   # true slope
s = 0.4   # true residual standard deviation

e = sps.norm.rvs(size=N, loc=0, scale=s)  # simulate the residual vector
x = sps.norm.rvs(size=N, loc=2, scale=1)  # simulate the explanatory variable
y = a + b * x + e                         # construct the dependent variable
m = np.c_[y, x]                           # construct the dataset

ols_main = lm.lm('y~c+x', data=m, header=['y', 'x'])  # estimate the regression on the simulated dataset
ols_main.estimate()

coef = np.zeros((B, 2))  # store the bootstrapped coefficient estimates
for j in range(0, B):
    index = npr.choice(range(0, N), size=N, replace=True)  # index set for the bootstrap sample
    bootsample = m[list(index)]                            # extract the bootstrap sample
    ols_temp = lm.lm('y~c+x', data=bootsample, header=['y', 'x'])  # estimate the regression on the bootstrap sample
    ols_temp.estimate()
    coef[j] = ols_temp.coef.reshape((1, 2))                # store the bootstrap estimate

# Calculate the true variance-covariance matrix of the OLS estimate
tempx = np.c_[np.ones(N), x]
cov = np.power(s, 2) * np.linalg.inv(np.dot(tempx.transpose(), tempx))
truecov = np.diag(cov)

summary = np.c_[truecov.reshape((2, 1)),
                np.diag(ols_main.cov).reshape((2, 1)),
                np.array([sps.tvar(coef[:, i]) for i in range(0, 2)]).reshape((2, 1))]
summary = np.r_[np.c_[np.r_[a, b],
                      ols_main.coef.reshape((2, 1)),
                      np.array([sps.tmean(coef[:, i]) for i in range(0, 2)]).reshape((2, 1))],
                summary]
header = ['Theoretical', 'Sample Estimate', 'Bootstrap Estimate']
labelx = ['a', 'b', 'var a', 'var b']
result = pd.DataFrame(summary, columns=header, index=labelx)
print(result)
def cross_validation_compact(transactions, sample_pct=0.50, support=-3, all_frequent_items=None):
    from fim import fpgrowth
    """
    Cross validation. Using the compact representation from Forward.
    """
    # init
    _id = str(time()).replace('.', '')
    # if all_frequent_items is None:
    #     all_frequent_items = fpgrowth(transactions, supp=support, min=1, max=3)
    cv_start = time()

    print "\n### Running cross validation {}###".format(_id)
    print "Total transactions:{}".format(len(transactions))
    # print "Total frequent items:{}".format(len(all_frequent_items))

    # run results
    avg_errors = []
    var_errors = []
    # all_triangles, all_triples = filter_items(all_frequent_items)

    for chunk, index, rest in chunks(transactions, int(len(transactions) * sample_pct)):  # TODO insert proper sampling
        all_frequent_items = fpgrowth(rest, supp=support, min=1, max=3)
        all_triangles, all_triples = Forward.forward_compact(all_frequent_items)

        # Get triples for estimates
        frequent_items = fpgrowth(chunk, supp=support, min=1, max=3)
        if len(frequent_items) > 0:
            print 'frequent items: {}'.format(len(frequent_items))
        else:
            print 'No frequent items in chunk: {}'.format(index)
            continue

        triangle_tree, triples = Forward.forward_compact(frequent_items)
        print 'triangle roots: {}'.format(len(triangle_tree))

        estimates = []
        observations = []
        abs_errors = []
        max_est = 0
        max_obs = 0
        # DFS of the tree holding all triangles
        for n1 in triangle_tree.keys():
            s1, s2_dict = triangle_tree[n1]
            for n2 in s2_dict.keys():
                s2, s12, s3_dict = s2_dict[n2]
                for n3 in s3_dict.keys():
                    s3, s13, s23, s123 = s3_dict[n3]
                    est = ent.maxent_est_rosa(
                        s1, s2, s3, s12, s23, s13,
                        float(len(transactions) - len(chunk)),
                        num=int(math.log(len(transactions), 2)) + 1)

                    # maximum estimate seen (for plotting)
                    max_est = max(max_est, est)
                    # record the estimate
                    estimates.append(est)

                    # from all observed triples, get the actual observed number of triples
                    observed = 0
                    if all_triples.has_key((n1, n2, n3)):
                        observed = all_triples[(n1, n2, n3)]

                    # maximum observation of the triple (for plotting)
                    max_obs = max(max_obs, observed)
                    # record the observed
                    observations.append(observed)

                    # record the absolute error
                    error = abs(observed - est) / float(observed) * 100
                    abs_errors.append(error)

        if len(abs_errors) > 0:  # TODO handle this, probably when nothing has been found
            # evaluation
            min_error = min(abs_errors)
            max_error = max(abs_errors)
            avg_error = sum(abs_errors) / float(len(abs_errors))
            avg_errors.append(avg_error)
            var_error = 0
            if len(abs_errors) > 1:
                var_error = tvar(abs_errors)  # tvar is the sample variance
            var_errors.append(var_error)
            res_string = "\nResult:\nSample size:{} min_error:{} max_error:{} avg_error:{} var_error:{}".format(
                len(chunk), min_error, max_error, avg_errors[-1], var_error)
            print res_string
        else:
            print 'No abs errors!'

    print "Cross validation done!"
    print "time: ", (time() - cv_start)
    total_avg_error = sum(avg_errors) / float(len(avg_errors))
    total_res_string = "Avg error:{}".format(total_avg_error)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True, help="Tabular file.")
    parser.add_argument("-o", "--outfile", required=True, help="Path to the output file.")
    parser.add_argument("--sample_one_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_two_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_cols", help="Input format, like smi, sdf, inchi; separate arrays using ;")
    parser.add_argument("--test_id", help="statistical test method")
    parser.add_argument("--mwu_use_continuity", action="store_true", default=False,
                        help="Whether a continuity correction (1/2.) should be taken into account.")
    parser.add_argument("--equal_var", action="store_true", default=False,
                        help="If set, perform a standard independent 2-sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.")
    parser.add_argument("--reta", action="store_true", default=False,
                        help="Whether or not to return the internally computed a values.")
    parser.add_argument("--fisher", action="store_true", default=False,
                        help="if true then Fisher definition is used")
    parser.add_argument("--bias", action="store_true", default=False,
                        help="if false, then the calculations are corrected for statistical bias")
    parser.add_argument("--inclusive1", action="store_true", default=False,
                        help="if false, lower_limit will be ignored")
    parser.add_argument("--inclusive2", action="store_true", default=False,
                        help="if false, higher_limit will be ignored")
    parser.add_argument("--inclusive", action="store_true", default=False,
                        help="if false, limit will be ignored")
    parser.add_argument("--printextras", action="store_true", default=False,
                        help="If True, if there are extra points a warning is raised saying how many of those points there are")
    parser.add_argument("--initial_lexsort", action="store_true", default=False,
                        help="Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.")
    parser.add_argument("--correction", action="store_true", default=False, help="continuity correction")
    parser.add_argument("--axis", type=int, default=0,
                        help="Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)")
    parser.add_argument("--n", type=int, default=0,
                        help="the number of trials. This is ignored if x gives both the number of successes and failures")
    parser.add_argument("--b", type=int, default=0, help="The number of bins to use for the histogram")
    parser.add_argument("--N", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--ddof", type=int, default=0, help="Degrees of freedom correction")
    parser.add_argument("--score", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--m", type=float, default=0.0, help="limits")
    parser.add_argument("--mf", type=float, default=2.0, help="lower limit")
    parser.add_argument("--nf", type=float, default=99.9, help="higher_limit")
    parser.add_argument("--p", type=float, default=0.5,
                        help="The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5")
    parser.add_argument("--alpha", type=float, default=0.9, help="probability")
    parser.add_argument("--new", type=float, default=0.0,
                        help="Value to put in place of values in a outside of bounds")
    parser.add_argument("--proportiontocut", type=float, default=0.0,
                        help="Proportion (in range 0-1) of total data set to trim of each end.")
    parser.add_argument("--lambda_", type=float, default=1.0,
                        help="lambda_ gives the power in the Cressie-Read power divergence statistic")
    parser.add_argument("--imbda", type=float, default=0,
                        help="If lmbda is not None, do the transformation for that value. If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.")
    parser.add_argument("--base", type=float, default=1.6, help="The logarithmic base to use, defaults to e")
    parser.add_argument("--dtype", help="dtype")
    parser.add_argument("--med", help="med")
    parser.add_argument("--cdf", help="cdf")
    parser.add_argument("--zero_method", help="zero_method options")
    parser.add_argument("--dist", help="dist options")
    parser.add_argument("--ties", help="ties options")
    parser.add_argument("--alternative", help="alternative options")
    parser.add_argument("--mode", help="mode options")
    parser.add_argument("--method", help="method options")
    parser.add_argument("--md", help="md options")
    parser.add_argument("--center", help="center options")
    parser.add_argument("--kind", help="kind options")
    parser.add_argument("--tail", help="tail options")
    parser.add_argument("--interpolation", help="interpolation options")
    parser.add_argument("--statistic", help="statistic options")

    args = parser.parse_args()
    infile = args.infile
    outfile = open(args.outfile, "w+")
    test_id = args.test_id
    nf = args.nf
    mf = args.mf
    imbda = args.imbda
    inclusive1 = args.inclusive1
    inclusive2 = args.inclusive2
    sample0 = 0
    sample1 = 0
    sample2 = 0
    if args.sample_cols is not None:
        sample0 = 1
        barlett_samples = []
        for sample in args.sample_cols.split(";"):
            barlett_samples.append(map(int, sample.split(",")))
    if args.sample_one_cols is not None:
        sample1 = 1
        sample_one_cols = args.sample_one_cols.split(",")
    if args.sample_two_cols is not None:
        sample_two_cols = args.sample_two_cols.split(",")
        sample2 = 1

    for line in open(infile):
        sample_one = []
        sample_two = []
        cols = line.strip().split("\t")
        if sample0 == 1:
            b_samples = columns_to_values(barlett_samples, line)
        if sample1 == 1:
            for index in sample_one_cols:
                sample_one.append(cols[int(index) - 1])
        if sample2 == 1:
            for index in sample_two_cols:
                sample_two.append(cols[int(index) - 1])

        if test_id.strip() == "describe":
            size, min_max, mean, uv, bs, bk = stats.describe(map(float, sample_one))
            cols.append(size)
            cols.append(min_max)
            cols.append(mean)
            cols.append(uv)
            cols.append(bs)
            cols.append(bk)
        elif test_id.strip() == "mode":
            vals, counts = stats.mode(map(float, sample_one))
            cols.append(vals)
            cols.append(counts)
        elif test_id.strip() == "nanmean":
            m = stats.nanmean(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "kurtosistest":
            z_value, p_value = stats.kurtosistest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "itemfreq":
            freq = stats.itemfreq(map(float, sample_one))
            for list in freq:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "boxcox_llf":
            IIf = stats.boxcox_llf(imbda, map(float, sample_one))
            cols.append(IIf)
        elif test_id.strip() == "tiecorrect":
            fa = stats.tiecorrect(map(float, sample_one))
            cols.append(fa)
        elif test_id.strip() == "rankdata":
            r = stats.rankdata(map(float, sample_one), method=args.md)
            cols.append(r)
        elif test_id.strip() == "nanstd":
            s = stats.nanstd(map(float, sample_one), bias=args.bias)
            cols.append(s)
        elif test_id.strip() == "anderson":
            A2, critical, sig = stats.anderson(map(float, sample_one), dist=args.dist)
            cols.append(A2)
            for list in critical:
                cols.append(list)
            cols.append(",")
            for list in sig:
                cols.append(list)
        elif test_id.strip() == "binom_test":
            p_value = stats.binom_test(map(float, sample_one), n=args.n, p=args.p)
            cols.append(p_value)
        elif test_id.strip() == "gmean":
            gm = stats.gmean(map(float, sample_one), dtype=args.dtype)
            cols.append(gm)
        elif test_id.strip() == "hmean":
            hm = stats.hmean(map(float, sample_one), dtype=args.dtype)
            cols.append(hm)
        elif test_id.strip() == "kurtosis":
            k = stats.kurtosis(map(float, sample_one), axis=args.axis,
                               fisher=args.fisher, bias=args.bias)
            cols.append(k)
        elif test_id.strip() == "moment":
            n_moment = stats.moment(map(float, sample_one), n=args.n)
            cols.append(n_moment)
        elif test_id.strip() == "normaltest":
            k2, p_value = stats.normaltest(map(float, sample_one))
            cols.append(k2)
            cols.append(p_value)
        elif test_id.strip() == "skew":
            skewness = stats.skew(map(float, sample_one), bias=args.bias)
            cols.append(skewness)
        elif test_id.strip() == "skewtest":
            z_value, p_value = stats.skewtest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "sem":
            s = stats.sem(map(float, sample_one), ddof=args.ddof)
            cols.append(s)
        elif test_id.strip() == "zscore":
            z = stats.zscore(map(float, sample_one), ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "signaltonoise":
            s2n = stats.signaltonoise(map(float, sample_one), ddof=args.ddof)
            cols.append(s2n)
        elif test_id.strip() == "percentileofscore":
            p = stats.percentileofscore(map(float, sample_one), score=args.score, kind=args.kind)
            cols.append(p)
        elif test_id.strip() == "bayes_mvs":
            c_mean, c_var, c_std = stats.bayes_mvs(map(float, sample_one), alpha=args.alpha)
            cols.append(c_mean)
            cols.append(c_var)
            cols.append(c_std)
        elif test_id.strip() == "sigmaclip":
            c, c_low, c_up = stats.sigmaclip(map(float, sample_one), low=args.m, high=args.n)
            cols.append(c)
            cols.append(c_low)
            cols.append(c_up)
        elif test_id.strip() == "kstest":
            d, p_value = stats.kstest(map(float, sample_one), cdf=args.cdf, N=args.N,
                                      alternative=args.alternative, mode=args.mode)
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "chi2_contingency":
            chi2, p, dof, ex = stats.chi2_contingency(
                map(float, sample_one), correction=args.correction, lambda_=args.lambda_)
            cols.append(chi2)
            cols.append(p)
            cols.append(dof)
            cols.append(ex)
        elif test_id.strip() == "tmean":
            if nf == 0 and mf == 0:
                mean = stats.tmean(map(float, sample_one))
            else:
                mean = stats.tmean(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(mean)
        elif test_id.strip() == "tmin":
            if mf == 0:
                min = stats.tmin(map(float, sample_one))
            else:
                min = stats.tmin(map(float, sample_one), lowerlimit=mf, inclusive=args.inclusive)
            cols.append(min)
        elif test_id.strip() == "tmax":
            if nf == 0:
                max = stats.tmax(map(float, sample_one))
            else:
                max = stats.tmax(map(float, sample_one), upperlimit=nf, inclusive=args.inclusive)
            cols.append(max)
        elif test_id.strip() == "tvar":
            if nf == 0 and mf == 0:
                var = stats.tvar(map(float, sample_one))
            else:
                var = stats.tvar(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(var)
        elif test_id.strip() == "tstd":
            if nf == 0 and mf == 0:
                std = stats.tstd(map(float, sample_one))
            else:
                std = stats.tstd(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(std)
        elif test_id.strip() == "tsem":
            if nf == 0 and mf == 0:
                s = stats.tsem(map(float, sample_one))
            else:
                s = stats.tsem(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(s)
        elif test_id.strip() == "scoreatpercentile":
            if nf == 0 and mf == 0:
                s = stats.scoreatpercentile(map(float, sample_one), map(float, sample_two),
                                            interpolation_method=args.interpolation)
            else:
                s = stats.scoreatpercentile(map(float, sample_one), map(float, sample_two),
                                            (mf, nf), interpolation_method=args.interpolation)
            for list in s:
                cols.append(list)
        elif test_id.strip() == "relfreq":
            if nf == 0 and mf == 0:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b)
            else:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b, (mf, nf))
            for list in rel:
                cols.append(list)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "binned_statistic":
            if nf == 0 and mf == 0:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one), map(float, sample_two),
                    statistic=args.statistic, bins=args.b)
            else:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one), map(float, sample_two),
                    statistic=args.statistic, bins=args.b, range=(mf, nf))
            cols.append(st)
            cols.append(b_edge)
            cols.append(b_n)
        elif test_id.strip() == "threshold":
            if nf == 0 and mf == 0:
                o = stats.threshold(map(float, sample_one), newval=args.new)
            else:
                o = stats.threshold(map(float, sample_one), mf, nf, newval=args.new)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trimboth":
            o = stats.trimboth(map(float, sample_one), proportiontocut=args.proportiontocut)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trim1":
            t1 = stats.trim1(map(float, sample_one), proportiontocut=args.proportiontocut, tail=args.tail)
            for list in t1:
                cols.append(list)
        elif test_id.strip() == "histogram":
            if nf == 0 and mf == 0:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b)
            else:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b, (mf, nf))
            cols.append(hi)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "cumfreq":
            if nf == 0 and mf == 0:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b)
            else:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b, (mf, nf))
            cols.append(cum)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "boxcox_normmax":
            if nf == 0 and mf == 0:
                ma = stats.boxcox_normmax(map(float, sample_one))
            else:
                ma = stats.boxcox_normmax(map(float, sample_one), (mf, nf), method=args.method)
            cols.append(ma)
        elif test_id.strip() == "boxcox":
            if imbda == 0:
                box, ma, ci = stats.boxcox(map(float, sample_one), alpha=args.alpha)
                cols.append(box)
                cols.append(ma)
                cols.append(ci)
            else:
                box = stats.boxcox(map(float, sample_one), imbda, alpha=args.alpha)
                cols.append(box)
        elif test_id.strip() == "histogram2":
            h2 = stats.histogram2(map(float, sample_one), map(float, sample_two))
            for list in h2:
                cols.append(list)
        elif test_id.strip() ==
"ranksums": z_statistic, p_value = stats.ranksums(map(float, sample_one), map(float, sample_two)) cols.append(z_statistic) cols.append(p_value) elif test_id.strip() == "ttest_1samp": t, prob = stats.ttest_1samp(map(float, sample_one), map(float, sample_two)) for list in t: cols.append(list) for list in prob: cols.append(list) elif test_id.strip() == "ansari": AB, p_value = stats.ansari(map(float, sample_one), map(float, sample_two)) cols.append(AB) cols.append(p_value) elif test_id.strip() == "linregress": slope, intercept, r_value, p_value, stderr = stats.linregress( map(float, sample_one), map(float, sample_two) ) cols.append(slope) cols.append(intercept) cols.append(r_value) cols.append(p_value) cols.append(stderr) elif test_id.strip() == "pearsonr": cor, p_value = stats.pearsonr(map(float, sample_one), map(float, sample_two)) cols.append(cor) cols.append(p_value) elif test_id.strip() == "pointbiserialr": r, p_value = stats.pointbiserialr(map(float, sample_one), map(float, sample_two)) cols.append(r) cols.append(p_value) elif test_id.strip() == "ks_2samp": d, p_value = stats.ks_2samp(map(float, sample_one), map(float, sample_two)) cols.append(d) cols.append(p_value) elif test_id.strip() == "mannwhitneyu": mw_stats_u, p_value = stats.mannwhitneyu( map(float, sample_one), map(float, sample_two), use_continuity=args.mwu_use_continuity ) cols.append(mw_stats_u) cols.append(p_value) elif test_id.strip() == "zmap": z = stats.zmap(map(float, sample_one), map(float, sample_two), ddof=args.ddof) for list in z: cols.append(list) elif test_id.strip() == "ttest_ind": mw_stats_u, p_value = stats.ttest_ind( map(float, sample_one), map(float, sample_two), equal_var=args.equal_var ) cols.append(mw_stats_u) cols.append(p_value) elif test_id.strip() == "ttest_rel": t, prob = stats.ttest_rel(map(float, sample_one), map(float, sample_two), axis=args.axis) cols.append(t) cols.append(prob) elif test_id.strip() == "mood": z, p_value = stats.mood(map(float, sample_one), map(float, sample_two), axis=args.axis) cols.append(z) cols.append(p_value) elif test_id.strip() == "shapiro": W, p_value, a = stats.shapiro(map(float, sample_one), map(float, sample_two), args.reta) cols.append(W) cols.append(p_value) for list in a: cols.append(list) elif test_id.strip() == "kendalltau": k, p_value = stats.kendalltau( map(float, sample_one), map(float, sample_two), initial_lexsort=args.initial_lexsort ) cols.append(k) cols.append(p_value) elif test_id.strip() == "entropy": s = stats.entropy(map(float, sample_one), map(float, sample_two), base=args.base) cols.append(s) elif test_id.strip() == "spearmanr": if sample2 == 1: rho, p_value = stats.spearmanr(map(float, sample_one), map(float, sample_two)) else: rho, p_value = stats.spearmanr(map(float, sample_one)) cols.append(rho) cols.append(p_value) elif test_id.strip() == "wilcoxon": if sample2 == 1: T, p_value = stats.wilcoxon( map(float, sample_one), map(float, sample_two), zero_method=args.zero_method, correction=args.correction, ) else: T, p_value = stats.wilcoxon( map(float, sample_one), zero_method=args.zero_method, correction=args.correction ) cols.append(T) cols.append(p_value) elif test_id.strip() == "chisquare": if sample2 == 1: rho, p_value = stats.chisquare(map(float, sample_one), map(float, sample_two), ddof=args.ddof) else: rho, p_value = stats.chisquare(map(float, sample_one), ddof=args.ddof) cols.append(rho) cols.append(p_value) elif test_id.strip() == "power_divergence": if sample2 == 1: stat, p_value = stats.power_divergence( map(float, sample_one), map(float, 
sample_two), ddof=args.ddof, lambda_=args.lambda_ ) else: stat, p_value = stats.power_divergence(map(float, sample_one), ddof=args.ddof, lambda_=args.lambda_) cols.append(stat) cols.append(p_value) elif test_id.strip() == "theilslopes": if sample2 == 1: mpe, met, lo, up = stats.theilslopes(map(float, sample_one), map(float, sample_two), alpha=args.alpha) else: mpe, met, lo, up = stats.theilslopes(map(float, sample_one), alpha=args.alpha) cols.append(mpe) cols.append(met) cols.append(lo) cols.append(up) elif test_id.strip() == "combine_pvalues": if sample2 == 1: stat, p_value = stats.combine_pvalues( map(float, sample_one), method=args.med, weights=map(float, sample_two) ) else: stat, p_value = stats.combine_pvalues(map(float, sample_one), method=args.med) cols.append(stat) cols.append(p_value) elif test_id.strip() == "obrientransform": ob = stats.obrientransform(*b_samples) for list in ob: elements = ",".join(map(str, list)) cols.append(elements) elif test_id.strip() == "f_oneway": f_value, p_value = stats.f_oneway(*b_samples) cols.append(f_value) cols.append(p_value) elif test_id.strip() == "kruskal": h, p_value = stats.kruskal(*b_samples) cols.append(h) cols.append(p_value) elif test_id.strip() == "friedmanchisquare": fr, p_value = stats.friedmanchisquare(*b_samples) cols.append(fr) cols.append(p_value) elif test_id.strip() == "fligner": xsq, p_value = stats.fligner(center=args.center, proportiontocut=args.proportiontocut, *b_samples) cols.append(xsq) cols.append(p_value) elif test_id.strip() == "bartlett": T, p_value = stats.bartlett(*b_samples) cols.append(T) cols.append(p_value) elif test_id.strip() == "levene": w, p_value = stats.levene(center=args.center, proportiontocut=args.proportiontocut, *b_samples) cols.append(w) cols.append(p_value) elif test_id.strip() == "median_test": stat, p_value, m, table = stats.median_test( ties=args.ties, correction=args.correction, lambda_=args.lambda_, *b_samples ) cols.append(stat) cols.append(p_value) cols.append(m) cols.append(table) for list in table: elements = ",".join(map(str, list)) cols.append(elements) outfile.write("%s\n" % "\t".join(map(str, cols))) outfile.close()
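# main() relies on a columns_to_values() helper that is not part of this excerpt.
# A minimal sketch under that assumption: given the 1-based column groups parsed
# from --sample_cols and one tab-separated input line, it would return one float
# list per group, suitable for unpacking into the multi-sample tests via *b_samples.
def columns_to_values(column_groups, line):
    cols = line.strip().split("\t")
    return [[float(cols[i - 1]) for i in group] for group in column_groups]


# Hypothetical invocation (script and file names are illustrative only):
#   python statistics_tool.py -i in.tsv -o out.tsv --test_id tvar \
#       --sample_one_cols 1,2,3,4 --mf 2.0 --nf 99.9 --inclusive1 --inclusive2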
def getStdEstimation(samples):
    # Standard-deviation estimate: square root of the trimmed (sample) variance.
    return math.sqrt(tvar(samples))
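# Illustrative check for getStdEstimation(), assuming the imports it needs
# (import math; from scipy.stats import tvar): tvar applies ddof=1, so the
# result is the unbiased sample standard deviation, not np.std's population
# default (ddof=0).
data = [2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]
# Sum of squared deviations from the mean (5.0) is 32, over n - 1 = 7 samples.
assert abs(getStdEstimation(data) - math.sqrt(32.0 / 7.0)) < 1e-9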
def get_pooled_standard_error(cls, *args):
    """Get the pooled standard error of the groups."""
    try:
        return sum(len(a) * stats.tvar(a) for a in args) / float(sum(len(a) - 1 for a in args))
    except TypeError:
        # len() or iteration failed: the caller passed something other than sequences.
        raise TypeError("Expected only lists or tuples")
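# Worked example (illustrative) of the pooling rule above; since the function
# expects to be bound as a classmethod (cls), the arithmetic is spelled out
# directly. Note that each group's trimmed variance is weighted by len(a) in
# the numerator while the denominator sums len(a) - 1 terms, which differs
# from the textbook pooled-variance formula that weights by len(a) - 1 in both.
g1 = [1.0, 2.0, 3.0]       # stats.tvar(g1) == 1.0
g2 = [2.0, 4.0, 6.0, 8.0]  # stats.tvar(g2) == 20/3
pooled = (3 * 1.0 + 4 * (20.0 / 3.0)) / float((3 - 1) + (4 - 1))  # = 89/15 ~ 5.933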