def z_test(self, Features=None, Clstrs=None, val=0, alt="two-sided", alpha=0.05):
    """
    Print, per feature, whether its mean differs significantly between clusters.

    For every feature the rows of the stored data are grouped by the
    "Clusters" column. With two or more clusters a two-sample z-test
    (samples assumed independent) is run for every pair of clusters; with a
    single cluster a one-sample z-test is run instead. The verdicts are
    printed, nothing is returned.

    Parameters
    ----------
    Features : iterable of column labels, optional
        Columns to test. Defaults to every column of the stored data except
        the last one.
    Clstrs : pandas Series or sequence, optional
        Cluster labels to compare. A pandas object is reduced to its
        non-null unique values; any other sequence is used as given.
        Defaults to the labels found in the "Clusters" column.
    val : float
        In the one-sample case, the mean under the null hypothesis. In the
        two-sample case, the difference between the two means under the
        null hypothesis.
    alt : str
        The alternative hypothesis, H1, one of
        'two-sided': H1: difference in means not equal to val (default)
        'larger' :   H1: difference in means larger than val
        'smaller' :  H1: difference in means smaller than val
    alpha : float
        Significance level the p-value is compared against (default 0.05).

    Returns
    -------
    None
    """
    if Features is None:
        Features = self.__data.columns[:-1].copy()
    if Clstrs is None:
        Clstrs = self.__data["Clusters"].copy()
    # Accept both pandas objects and plain sequences of labels.
    if hasattr(Clstrs, "dropna"):
        Clstrs = Clstrs.dropna().unique().tolist()
    # sorted() builds a new list, so a caller-supplied list is not reordered.
    Clstrs = sorted(Clstrs)

    for feature in Features:
        print("\n\n", feature,"\n")
        sub = self.__data[[feature, "Clusters"]].copy()
        sub = sub.dropna()
        sub_df = [
            sub[sub["Clusters"] == cluster][feature].values.tolist()
            for cluster in Clstrs
        ]

        if len(sub_df) > 1:
            # Every unordered pair of clusters is tested exactly once.
            for i, first in enumerate(sub_df):
                for j in range(i + 1, len(sub_df)):
                    res = ztest(first, sub_df[j], value=val, alternative=alt)
                    if res[1] < alpha:
                        print("The feature", feature, "is significant for clusters", Clstrs[i], "and", Clstrs[j])
                    else:
                        print("The feature", feature, "is not significant for clusters", Clstrs[i], "and", Clstrs[j])
        else:
            res = ztest(sub_df[0], value=val, alternative=alt)
            if res[1] < alpha:
                print("The feature", feature, "is significant for cluster")
            else:
                print("The feature", feature, "is not significant for cluster")
def stats(self, x, y):
    """Print Pearson correlation, z-test and t-test results for x versus y.

    When ``self.diagonal`` is falsy the flat positions ``i * (n + 1)`` (with
    ``n = shape[0]``) are removed from each input first; for a square matrix
    those are the main-diagonal entries. Otherwise the inputs are used as
    given.
    """
    if self.diagonal:
        first, second = x, y
    else:
        # np.delete without an axis works on the flattened array.
        first = np.delete(x, [k * (x.shape[0] + 1) for k in range(x.shape[0])])
        second = np.delete(y, [k * (y.shape[0] + 1) for k in range(y.shape[0])])

    correlation = np.corrcoef(first, second)
    utils.printf('Pearson\'s correlation:\n{}'.format(correlation))
    utils.printf('Z-Test:{}'.format(ztest(first, second)))
    utils.printf('T-Test:{}'.format(ttest_ind(first, second)))
def print_ttest_results(str_group1, str_group2):
    """Print t-test, Cohen's d and z-test results for two named groups.

    Parameters
    ----------
    str_group1, str_group2 : str
        Python expressions that evaluate to the two samples. The expression
        text is also used as the label in the printed output.

    The output is wrapped in ``bcolors.GREEN`` when the absolute t-test
    p-value is below 0.011, otherwise in ``bcolors.ENDC``.
    """
    # NOTE(security): eval() executes arbitrary code. Only call this with
    # trusted, hard-coded expressions -- never with user-supplied text.
    group1 = eval(str_group1)
    group2 = eval(str_group2)

    # Run the t-test once and reuse the result for colouring and printing.
    t_result = stats.ttest_ind(group1, group2)
    if abs(t_result[1]) < 0.011:
        color = bcolors.GREEN
    else:
        color = bcolors.ENDC

    prefix = color + str_group1 + " vs " + str_group2
    # print() as a function: the rest of the file requires Python 3, where
    # the print statement is a syntax error.
    print(prefix + " : \tt-value: ", round(t_result[0], 5),
          "\tp-value :", round(t_result[1], 5),
          "\td-value :", round(cohenns_d(group1, group2), 5))

    z_result = ztest(group1, group2)
    print(prefix + " : \tz-value: ", round(z_result[0], 5),
          "\tp-value :", round(z_result[1], 5), bcolors.ENDC)
def z_test(self, i, j, freq="D"):
    """Compare two aggregated series with a z-test and a chi-square test.

    Parameters
    ----------
    i, j
        Inputs handed to ``aggregate_values`` as ``df``.
    freq : str
        Aggregation frequency forwarded to ``aggregate_values``.

    Returns
    -------
    tuple
        ``(z_test_p_value, chi_square_p_value)``. The z-test p-value is also
        printed.
    """
    values_1 = aggregate_values(df=i, freq=freq) / self.test_arg[1]
    values_2 = aggregate_values(df=j, freq=freq) / self.test_arg[1]

    t_normal = self.test_normal_distribution(values_1)
    r_normal = self.test_normal_distribution(values_2)
    if not (t_normal and r_normal):
        # Min-max scale both series when either fails the normality check.
        # NOTE(review): a constant series makes the denominator zero.
        values_1 = (values_1 - min(values_1)) / (max(values_1) - min(values_1))
        values_2 = (values_2 - min(values_2)) / (max(values_2) - min(values_2))

    contingency_df = self.generate_contingency_table(values_1, values_2)
    _, chi2_p_value, _, _ = chi2_contingency(contingency_df)

    # Run the z-test once and reuse its p-value for printing and returning.
    z_p_value = ztest(values_1, values_2)[1]
    print(z_p_value)
    return z_p_value, chi2_p_value
def test_ztest_ztost():
    """Check weighted z-test / z-TOST results against the proportion tests."""
    # compare weightstats with separately tested proportion ztest ztost
    import statsmodels.stats.proportion as smprop

    values = [0, 1]
    weights = [5, 15]

    expected = smprop.proportions_ztest(15, 20., value=0.5)
    weighted = DescrStatsW(values, weights)
    actual = weighted.ztest_mean(0.5)
    assert_allclose(actual, expected, rtol=0.03, atol=0.003)

    # With the weights rescaled by 21/20 the results must agree to 12 decimals.
    rescaled = DescrStatsW(values, np.array(weights)*21./20)
    actual = rescaled.ztest_mean(0.5)
    assert_almost_equal(actual, expected, decimal=12)

    actual = rescaled.ztost_mean(0.4, 0.6)
    expected = smprop.proportions_ztost(15, 20., 0.4, 0.6)
    assert_almost_equal(actual[0], expected[0], decimal=12)

    # Two-sample comparison: 15/20 successes versus 10/20 successes.
    other_values = [0, 1]
    other_weights = [10, 10]
    other = DescrStatsW(other_values, other_weights)
    actual = ztest(weighted.asrepeats(), other.asrepeats())
    expected = smprop.proportions_chisquare(np.asarray([15, 10]),
                                            np.asarray([20., 20]))
    # TODO: check this is this difference expected?, see test_proportion
    assert_allclose(actual[1], expected[1], rtol=0.03)

    via_compare_means = CompareMeans(weighted, other).ztest_ind()
    assert_allclose(via_compare_means[1], expected[1], rtol=0.03)
    assert_almost_equal(via_compare_means, actual, decimal=12)
def run_test(self):
    """Run the z-test on the stored samples and record the verdict.

    Sets ``self.z_score`` and ``self.p_value`` from ``ztest`` and stores
    'reject H0' in ``self.result`` when the p-value is at most
    ``self.alpha``, otherwise 'cannot reject H0'.
    """
    outcome = ztest(self.x1, self.x2, self.val, self.alt)
    self.z_score, self.p_value = outcome
    self.result = 'reject H0' if self.p_value <= self.alpha else 'cannot reject H0'
def hypothesis_testing_3():
    """Test whether slow buses (average speed < 60 km/h) average < 3.5 kmpl.

    Reads the 'FuelInfo' sheet, drops incomplete rows, derives the average
    speed from distance and duration, and runs a one-sample, lower-tailed
    z-test on 'Mileage':

        H0: mu >= 3.5        H1: mu < 3.5

    The p-value is printed. The conclusion drawn from the test is printed
    only when the p-value is actually below 0.05.
    """
    fuel = pd.read_excel(xls, 'FuelInfo').dropna()

    # Duration is stored in milliseconds; 3,600,000 ms make one hour.
    hours = fuel['Duration(in milliseconds)'] / 3600000
    fuel['Avg speed'] = fuel['Total Distance'] / hours
    slow = fuel[fuel['Avg speed'] < 60]

    zstat, pval = stests.ztest(x1=slow['Mileage'], value=3.5, alternative='smaller')
    print(pval)
    if pval < 0.05:
        print(
            'As the pval is less than .05, we can conclude that avg mileage is < 3.5 kmpl for buses whose avg speeds are < 60kmph'
        )
        print(
            'With avg speeds being lesser than 60kmph, avg mileage you get is less than 3.5kmpl implying that speeds have to be lesser than 60 to achieve greater mileage'
        )
    else:
        print(
            'As the pval is not less than .05, we cannot conclude that avg mileage is < 3.5 kmpl for buses whose avg speeds are < 60kmph'
        )
    # This remark comes from separate exploration, not from the test above.
    print(
        'By trying various combinations, we came to the conclusion that if drivers maintain avg speed @40, theyll get avg mileage as >= 3.5'
    )
def detect_trend(self, time_series_x: np.ndarray, time_series_y: np.ndarray):
    """
    Method that performs the Innovative Trend Analysis to the given time-series
    or signal. The two sorted halves of the series are handed to
    ``self._plot_ita`` for plotting, then the element-wise difference between
    the halves is tested against a zero mean (no trend).

    :param time_series_x: time variable of the time series to analyze
        (not used by the computation)
    :param time_series_y: value of the time series to analyze
    :return: one-element tuple ``(trend,)`` where ``trend`` is true when the
        p-value of the test is at most ``self.confidence_level``
    """
    # An odd-length series cannot be split into two equal halves, so the
    # last sample is dropped. The check must use the remainder (%), not the
    # quotient (//): the latter truncated every series of two or more
    # samples and made even-length input fail in np.split.
    if time_series_y.shape[0] % 2 != 0:
        time_series_y = time_series_y[:-1]

    first_half, second_half = np.split(time_series_y, indices_or_sections=2)
    first_half = np.sort(first_half)
    second_half = np.sort(second_half)

    self._plot_ita(first_half=first_half, second_half=second_half,
                   time_series_min=np.min(time_series_y),
                   time_series_max=np.max(time_series_y))

    second_half = second_half - first_half
    np.random.shuffle(second_half)

    # comparing with no trend line mean; small samples use the t-test
    if second_half.shape[0] < 30:
        _, p_score = stats.ttest_1samp(second_half, 0.0)
    else:
        _, p_score = ztest(second_half, value=0.0)

    trend = p_score <= self.confidence_level
    # The trailing comma is deliberate: callers receive a one-element tuple.
    return trend,
def testz(dist1, dist2):
    """Return the two-sided, pooled-variance z-test result for two samples.

    The null hypothesis is a zero difference in means (``value=0``).
    """
    options = dict(value=0, alternative='two-sided', usevar='pooled', ddof=1.0)
    return ztest(dist1, dist2, **options)
def test_ztest_ztost():
    """Weighted z-test and z-TOST must match the proportion-based tests."""
    # compare weightstats with separately tested proportion ztest ztost
    import statsmodels.stats.proportion as smprop

    x1, w1 = [0, 1], [5, 15]

    reference = smprop.proportions_ztest(15, 20., value=0.5)
    descr_1 = DescrStatsW(x1, w1)
    got = descr_1.ztest_mean(0.5)
    assert_allclose(got, reference, rtol=0.03, atol=0.003)

    # Same data with weights rescaled by 21/20: agreement to 12 decimals.
    descr_scaled = DescrStatsW(x1, np.array(w1) * 21. / 20)
    got = descr_scaled.ztest_mean(0.5)
    assert_almost_equal(got, reference, decimal=12)

    got = descr_scaled.ztost_mean(0.4, 0.6)
    reference = smprop.proportions_ztost(15, 20., 0.4, 0.6)
    assert_almost_equal(got[0], reference[0], decimal=12)

    x2, w2 = [0, 1], [10, 10]
    descr_2 = DescrStatsW(x2, w2)
    got = ztest(descr_1.asrepeats(), descr_2.asrepeats())
    reference = smprop.proportions_chisquare(np.asarray([15, 10]),
                                             np.asarray([20., 20]))
    #TODO: check this is this difference expected?, see test_proportion
    assert_allclose(got[1], reference[1], rtol=0.03)

    got_compare = CompareMeans(descr_1, descr_2).ztest_ind()
    assert_allclose(got_compare[1], reference[1], rtol=0.03)
    assert_almost_equal(got_compare, got, decimal=12)
def z_test(window1, window2):
    """Return the z-test result for two windows, or 0 if either is empty.

    Note the mixed return type: the integer 0 for an empty window, otherwise
    whatever ``ztest`` returns.
    """
    # window2 is only inspected when window1 is non-empty.
    if len(window1) == 0 or len(window2) == 0:
        return 0
    return ztest(window1, window2)
def z_test(self):
    """Run the z-test on the stored groups and return a labelled result.

    The values of ``self.y_values`` are passed positionally to ``ws.ztest``.

    Returns
    -------
    dict
        ``{"statistic": ..., "p_value": ...}``
    """
    # Run the test
    groups = list(self.y_values.values())
    outcome = ws.ztest(*groups)
    labels = ["statistic", "p_value"]
    return dict(zip(labels, list(outcome)))
def hypo_test(a_list):
    """Report whether the sample mean is compatible with 2.52.

    Runs a one-sample z-test against 2.52 and returns "Doesn't match!" when
    the p-value is below 0.05, otherwise 'Matches'.
    """
    _, pval = wt.ztest(a_list, value=2.52)
    return "Doesn't match!" if pval < 0.05 else 'Matches'
def Test(array, value):
    """One-sample z-test of ``array`` against ``value`` at the 5% level.

    Parameters
    ----------
    array : array_like
        Sample to test.
    value : float
        Mean under the null hypothesis.

    Returns
    -------
    int
        1 when the null hypothesis is accepted (p-value >= 5%), 0 when it is
        refused.
    """
    # The decision is made on the p-value. The previous code compared the
    # absolute z statistic with 0.05 while labelling it "5%", which mixes up
    # the statistic with the significance level.
    _, pval = Z.ztest(array, value=value)
    if pval >= 0.05:
        print("{}>=5%,accept".format(pval))
        return 1
    print("{}<5%,refuse".format(pval))
    return 0
def percent_differences(data1, data2):
    """Per feature, relative mean difference and z-test p-value.

    Both inputs are indexed as ``data[:, feature]``. For every feature
    column the result holds ``[(mean(a) - mean(b)) / mean(b), p_value]``,
    where ``a`` comes from ``data1`` and ``b`` from ``data2``.

    Returns
    -------
    numpy.ndarray
        One row per feature.
    """
    def _compare(col):
        # Relative difference is taken with respect to the data2 mean.
        a, b = data1[:, col], data2[:, col]
        mean_b = np.mean(b)
        relative = (np.mean(a) - mean_b) / mean_b
        return [relative, wstats.ztest(a, b)[1]]

    n_features = len(data1[0, :])
    return np.array([_compare(col) for col in range(n_features)])
def t_statistic(data):
    """Test whether OldTown living area differs from the overall mean.

    Runs a one-sample z-test of 'GrLivArea' for rows whose 'Neighborhood' is
    'OldTown' against the mean 'GrLivArea' of the whole data set.

    Returns
    -------
    tuple
        ``(p_value, result)`` where ``result`` is a ``numpy.bool_`` that is
        true when the p-value is below 0.05.
    """
    old_town_area = data[data['Neighborhood'] == 'OldTown']['GrLivArea']
    z_statistic, p_value = ztest(x1=old_town_area,
                                 value=data['GrLivArea'].mean())
    # Report what the test actually produced. The previous code discarded
    # the computed p-value in favour of a hard-coded constant and verdict.
    result = np.bool_(p_value < 0.05)
    return (p_value, result)
def t_statistic(df):
    """Compare OldTown living area with the overall mean via z- and t-test.

    Returns
    -------
    tuple
        ``(t_test_p_value, test_result)`` where ``test_result`` is true when
        the t-test p-value is smaller than the z-test p-value.
    """
    old_town_area = df[df['Neighborhood'] == 'OldTown']['GrLivArea']
    overall_mean = df['GrLivArea'].mean()

    z_statistic, p_value = ztest(x1=old_town_area, value=overall_mean)
    t_outcome = stats.ttest_1samp(a=old_town_area, popmean=overall_mean)

    # NOTE(review): this compares the two p-values with each other rather
    # than with a significance level -- confirm that is intended.
    test_result = t_outcome[1] < p_value
    return t_outcome[1], test_result
def t_statistic(df):
    """Run a z-test and a one-sample t-test on OldTown 'GrLivArea'.

    Both tests compare the 'GrLivArea' values of the 'OldTown' neighbourhood
    with the mean 'GrLivArea' of the whole frame.

    Returns
    -------
    tuple
        The t-test p-value and a flag that is true when that p-value is
        below the z-test p-value.
    """
    is_old_town = df['Neighborhood'] == 'OldTown'
    sample = df[is_old_town]['GrLivArea']
    reference_mean = df['GrLivArea'].mean()

    _, z_p_value = ztest(x1=sample, value=reference_mean)

    # Calculates the T-test for the mean of ONE group of scores.
    t_p_value = stats.ttest_1samp(a=sample, popmean=reference_mean)[1]

    # NOTE(review): p-values are compared with each other, not with a
    # significance level -- verify against the intended specification.
    return t_p_value, t_p_value < z_p_value
def test(self, value=0, alternative='two-sided'):
    """Run the two-sample z-test and return the textual verdict.

    Stores the statistic in ``self.zscore``, the p-value in ``self.pval``
    and the verdict in ``self.result``. The null hypothesis is rejected when
    the p-value is below ``self.alpha``.
    """
    outcome = ztest(self.sample1, self.sample2, value=value,
                    alternative=alternative)
    self.zscore, self.pval = outcome
    significant = self.pval < self.alpha
    self.result = ('Reject the Null Hypothesis' if significant
                   else 'Fail to Reject the Null Hypothesis')
    return self.result
def one_sampled_z_test(self, df, x_before, compare_mean):
    """One-sample z-test of a column against a reference mean.

    Parameters
    ----------
    df : DataFrame-like
        Data holding the column to test.
    x_before : column label
        Column of ``df`` whose mean is tested.
    compare_mean : float
        Mean under the null hypothesis.

    Prints "reject null hypothesis" when the p-value is below 0.05,
    otherwise "accept null hypothesis".
    """
    from statsmodels.stats import weightstats as stests

    # Test the column the caller asked for; it used to be hard-coded to
    # 'bp_before', silently ignoring x_before.
    _, pval = stests.ztest(df[x_before], x2=None, value=compare_mean)
    if pval < 0.05:
        print("reject null hypothesis")
    else:
        print("accept null hypothesis")
def one_sample_z_test(data, mean, alpha):
    """Lower-tailed one-sample z-test of ``data`` against ``mean``.

    Prints the p-value followed by a verdict line. The verdict is 'fail to
    reject H0' when the p-value exceeds ``alpha`` and 'reject H0' otherwise.
    """
    model_records_mean = round(data.mean(), 2)
    _, pvalue = stests.ztest(data, x2=None, value=mean, alternative='smaller')
    print("One Sample Z-test p-value: ", pvalue)

    if pvalue > alpha:
        verdict = "One Sample Z-Test: {0} sample mean is likely to be greater than {1} (fail to reject H0)"
    else:
        verdict = "One Sample Z-Test: {0} sample mean is not likely to be greater than {1} (reject H0)"
    print(verdict.format(model_records_mean, mean))
def ztst():
    """HTTP handler running a one- or two-sample z-test on posted JSON.

    GET returns the module-level ``sample`` usage text.

    POST expects a JSON object with one or two entries whose values are
    lists of numbers. Each list must contain more than 30 unique values.
    On success a JSON response with "Z-Test" and "P-Value" is returned.
    A body that is not JSON, or that holds a different number of samples,
    gets the ``sample`` usage text. Any exception is reported as
    ``{"Error": <message>}``.

    NOTE(review): the null value passed to the test is derived from the
    submitted data itself (the sample mean, or the difference of the sample
    means) -- confirm this is the intended hypothesis.
    """
    def _as_response(z_stat, pval):
        # Serialise the test outcome as a JSON HTTP response.
        return Response(json.dumps({
            "Z-Test": z_stat,
            "P-Value": pval
        }), status=200, mimetype='application/json')

    if request.method == 'GET':
        return sample
    if request.method != 'POST':
        return None

    try:
        # Every unsupported POST shape answers with the usage text instead
        # of falling through and returning None.
        if not request.is_json:
            return sample

        payload = request.get_json()
        n_samples = len(payload.items())

        if n_samples == 1:
            raw = list(payload.values())[0]
            if len(np.unique(raw)) <= 30:
                return "Sample Should Contain Unique values more than 30 OtherWise Use T-Test e.g {\"Sample 1\": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]}"
            data = list(map(int, raw))
            z_stat, pval = stests.ztest(data, x2=None, value=np.mean(data))
            return _as_response(z_stat, pval)

        if n_samples == 2:
            raw1, raw2 = list(payload.values())[0], list(payload.values())[1]
            if not (len(np.unique(raw1)) > 30 and len(np.unique(raw2)) > 30):
                return "Both Sample Should Contain Unique values more than 30 OtherWise Use T-Test e.g {\"Sample 1\": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]}"
            d1 = list(map(int, raw1))
            d2 = list(map(int, raw2))
            z_stat, pval = stests.ztest(d1, d2,
                                        value=(np.mean(d1) - np.mean(d2)))
            return _as_response(z_stat, pval)

        return sample
    except Exception as e:
        return {"Error": str(e)}
def get_p_value(s1, s2):
    """Return the p-value of a two-sample test of equal means.

    Uses a z-test when both samples hold more than 30 observations and an
    independent two-sample t-test otherwise.

    Parameters
    ----------
    s1, s2 : sized array_like
        The two samples.

    Returns
    -------
    float
        The p-value of the selected test.
    """
    # Logical `and`, not bitwise `&`: `a > 30 & b > 30` parses as the chained
    # comparison `a > (30 & b) > 30`, which is always False, so the z-test
    # branch could never run.
    if len(s1) > 30 and len(s2) > 30:
        from statsmodels.stats.weightstats import ztest
        _, p = ztest(s1, s2)
    else:
        from scipy.stats import ttest_ind
        _, p = ttest_ind(s1, s2)
    return p
def Z_test(data):
    """Select columns whose mean differs between label 0 and the rest.

    Every column except the first and the last is z-tested between the rows
    with ``label == 0`` and the rows with ``label != 0``. Columns with a
    p-value below 0.05 are kept.

    Returns
    -------
    tuple
        ``(index, ztest_list)``: positions of the kept columns, counted from
        0 within the tested columns, and their names.
    """
    candidates = data.columns[1:len(data.columns) - 1]
    index, ztest_list = [], []
    for position, column in enumerate(candidates):
        zero_group = data[data['label'] == 0][column]
        other_group = data[data['label'] != 0][column]
        if ztest(zero_group, other_group)[1] < 0.05:
            ztest_list.append(column)
            index.append(position)
    return index, ztest_list
def parametric_test(self, x: str, y: str, meth: str = 't-test', welch_t_test: bool = True, *args):
    """
    Run a parametric hypothesis test on two features of the data set.

    :param x: Name of the first feature in self.data_set
    :param y: Name of the second feature in self.data_set
    :param meth: String defining the hypothesis test method for parametric tests
        -> z-test: two-sided, pooled-variance z-test of x against y
        -> t-test: independent two-sample t-test of x against y
        -> t-test-paired: paired t-test of x against y
        -> anova: one-way ANOVA over the samples given in args
    :param welch_t_test: Use Welch's t-test (unequal variances) for 't-test'
    :param args: Arguments containing samples from two or more groups for anova test
    :return: dict with the keys 'feature', 'cases', 'test_statistic',
        'p_value' and 'reject'; 'reject' is True when the p-value is at most
        self.p
    :raises ValueError: if meth is not one of the supported methods
    """
    if meth == 't-test':
        _parametric_test = ttest_ind(a=self.data_set[x], b=self.data_set[y], axis=0,
                                     equal_var=not welch_t_test, nan_policy=self.nan_policy)
    elif meth == 't-test-paired':
        _parametric_test = ttest_rel(a=self.data_set[x], b=self.data_set[y], axis=0,
                                     nan_policy=self.nan_policy)
    elif meth == 'anova':
        # Each group is a separate positional sample, so unpack the tuple.
        _parametric_test = f_oneway(*args)
    elif meth == 'z-test':
        # Test the feature data, not the feature names.
        _parametric_test = ztest(x1=self.data_set[x], x2=self.data_set[y], value=0,
                                 alternative='two-sided', usevar='pooled', ddof=1)
    else:
        raise ValueError('No parametric test found !')

    # The null hypothesis is rejected when the p-value does not exceed the
    # threshold; the flag used to be inverted.
    _reject = bool(_parametric_test[1] <= self.p)
    return {
        'feature': ''.join(self.data_set.keys()),
        'cases': len(self.data_set.values),
        'test_statistic': _parametric_test[0],
        'p_value': _parametric_test[1],
        'reject': _reject
    }
def conflict_check_interestingness(subgroup1, subgroup2):
    """Return the p-values of three two-sample tests on the subgroups.

    Returns
    -------
    list
        ``[z_test_p, t_test_p, mann_whitney_u_p]`` in that order.
    """
    # Z-test: two-sided, null hypothesis of zero difference in means.
    _, z_pvalue = stests.ztest(subgroup1, subgroup2, value=0,
                               alternative='two-sided')
    # Independent two-sample t-test.
    _, t_pvalue = ttest_ind(subgroup1, subgroup2)
    # Mann-Whitney U test.
    _, u_pvalue = mannwhitneyu(subgroup1, subgroup2)
    return [z_pvalue, t_pvalue, u_pvalue]
def two_sampled_z_test(self, df, x_before, x_after):
    """Two-sided, two-sample z-test between two columns of ``df``.

    H0: the difference between the two column means is 0.
    H1: the difference between the two column means is not 0.

    Prints "reject null hypothesis" when the p-value is below 0.05,
    otherwise "accept null hypothesis".
    """
    from statsmodels.stats import weightstats as stests

    # Compare the "before" column with the "after" column.
    before, after = df[x_before], df[x_after]
    _, pval1 = stests.ztest(before, x2=after, value=0,
                            alternative='two-sided')
    verdict = "reject null hypothesis" if pval1 < 0.05 else "accept null hypothesis"
    print(verdict)
def main():
    """Z-test every player's results against each opponent.

    Iterates the module-level ``x`` (one entry per player, each holding one
    data set per opponent) and runs a one-sample z-test of each data set
    against a mean of 0.5, printing the p-value and the verdict at the 5%
    level.
    """
    for i, data in enumerate(x):
        print(f'Player {players[i]} with data {data}')
        for j, d in enumerate(data):
            print(f'Checking against {players[j]} with data {d}')
            # Test the data being iterated. The call used to receive an
            # empty list, which can only produce a NaN p-value.
            z_test, p_val = stests.ztest(d, x2=None, value=0.5)
            print(float(p_val))
            if p_val < 0.05:
                print('Reject null hypothesis')
            else:
                print('accept null hypothesis')
def t_statistic(df):
    """Z-test OldTown living area against the overall mean living area.

    Prints the z statistic and the p-value.

    Returns
    -------
    tuple
        ``(p_value, test_result)`` where ``test_result`` is a ``numpy.bool_``
        that is False when the p-value exceeds 0.05 and True otherwise.
    """
    old_town_area = df[df['Neighborhood'] == 'OldTown']['GrLivArea']
    z_statistic, p_value = ztest(x1=old_town_area,
                                 value=df['GrLivArea'].mean())
    # The p-value is reported exactly as computed; an arbitrary constant
    # used to be added to it before printing and deciding.
    print('Z-statistic is :{}'.format(z_statistic))
    print('P-value is :{}'.format(p_value))

    test_result = np.bool_(not (p_value > 0.05))
    return p_value, test_result
def hypothesis_testing_4():
    """Test whether long-distance buses average more than 2.5 kmpl.

    Reads the 'FuelInfo' sheet, keeps complete rows with a total distance of
    at least 100, and runs a one-sample, upper-tailed z-test on 'Mileage':

        H0: mu <= 2.5        H1: mu > 2.5

    Prints the p-value and the verdict at the 5% level.
    """
    fuel = pd.read_excel(xls, 'FuelInfo')

    # H1: Mean mileage of buses that travel >= 100km is greater than 2.5kmpl
    long_trips = fuel[fuel['Total Distance'] >= 100].dropna()

    zstat, pval = stests.ztest(x1=long_trips['Mileage'], value=2.5,
                               alternative='larger')
    print(pval)
    if pval < 0.05:
        print("Reject null hypothesis")
    else:
        print("accept null hypothesis")
def callStripe(self,squareSize=300000,useNA=True,seg=100000):
    """Label every called TAD as left stripe, right stripe, loop or other.

    For each TAD the mean intra-TAD score of its first and last ``seg``
    base pairs is compared with a fixed random sample of 300 non-null
    scores, using a lower-tailed one-sample z-test. The p-values are
    Bonferroni-corrected and thresholded at 0.05.

    Returns the TAD table with an added "TADtype" column.
    """
    Tad = TADcallIS(self.path,self.resolution,self.chromosome,squareSize,useNA=useNA)
    intraScore = intraTADscore(self.path,self.resolution,self.chromosome).getIntraS().iloc[:,3]
    background = intraScore[~intraScore.isnull()].sample(300,random_state=1)

    import statsmodels.stats.weightstats as sw
    from statsmodels.sandbox.stats.multicomp import multipletests

    left_pvalues = []
    right_pvalues = []
    for row in range(Tad.shape[0]):
        start_bin = int((Tad.iloc[row,1])/self.resolution)
        end_bin = int((Tad.iloc[row,2])/self.resolution)
        tad_scores = intraScore.iloc[start_bin:end_bin]

        corner_bins = int(seg/self.resolution)  # bins of the corner
        left_mean = tad_scores.iloc[0:corner_bins].mean()
        right_mean = tad_scores.iloc[-corner_bins:].mean()

        left_pvalues.append(sw.ztest(background, value=left_mean, alternative="smaller")[1])
        right_pvalues.append(sw.ztest(background, value=right_mean, alternative="smaller")[1])

    qvalue_l = multipletests(left_pvalues, method='bonferroni')[1]
    qvalue_r = multipletests(right_pvalues, method='bonferroni')[1]

    status = []
    for q_left, q_right in zip(qvalue_l, qvalue_r):
        if q_left < 0.05 and q_right > 0.05:
            label = "leftStripe"
        elif q_right < 0.05 and q_left > 0.05:
            label = "rightStripe"
        elif q_right < 0.05 and q_left < 0.05:
            label = "loopTAD"
        else:
            label = "otherTAD"
        status.append(label)

    Tad["TADtype"] = status
    return(Tad)
def fn(control, test):
    """Return the two-sided p-value comparing ``control`` with ``test``.

    A z-test is used when ``_is_proportion`` accepts the pair, an
    independent t-test otherwise.
    """
    compare = ztest if _is_proportion(control, test) else ttest_ind
    return compare(control, test, alternative='two-sided')[1]
def test_pvalue(testdata):
    """``pvalue`` must match a t-test for 'kpi1' and a z-test for 'kpi2'."""
    result = pvalue(testdata, control_label='A')

    expected = {
        'kpi1': ttest_ind(testdata['kpi1']['A'], testdata['kpi1']['B'])[1],
        'kpi2': ztest(testdata['kpi2']['A'], testdata['kpi2']['B'])[1],
    }
    for kpi, want in expected.items():
        assert result['B'][kpi] == want
def test(self):
    """Check z-tests and z confidence intervals against reference cases.

    Covers the function interface (``ztest`` / ``zconfint``) and the
    ``CompareMeans`` / ``DescrStatsW`` methods, for two samples and for one
    sample.
    """
    def _open_ended(conf_int):
        # overwrite nan in R's confint
        bounds = conf_int.copy()
        if np.isnan(bounds[0]):
            bounds[0] = - np.inf
        if np.isnan(bounds[1]):
            bounds[1] = np.inf
        return bounds

    x1, x2 = self.x1, self.x2
    cm = self.cm

    # tc : test cases
    two_sample_cases = [ztest_, ztest_smaller, ztest_larger,
                        ztest_mu, ztest_smaller_mu, ztest_larger_mu]
    for tc in two_sample_cases:
        zstat, pval = ztest(x1, x2, value=tc.null_value,
                            alternative=alternatives[tc.alternative])
        assert_allclose(zstat, tc.statistic, rtol=1e-10)
        assert_allclose(pval, tc.p_value, rtol=1e-10, atol=1e-16)

        zstat, pval = cm.ztest_ind(value=tc.null_value,
                                   alternative=alternatives[tc.alternative])
        assert_allclose(zstat, tc.statistic, rtol=1e-10)
        assert_allclose(pval, tc.p_value, rtol=1e-10, atol=1e-16)

        expected_ci = _open_ended(tc.conf_int)

        # Note: value is shifting our confidence interval in zconfint
        ci = zconfint(x1, x2, value=0,
                      alternative=alternatives[tc.alternative])
        assert_allclose(ci, expected_ci, rtol=1e-10)

        ci = cm.zconfint_diff(alternative=alternatives[tc.alternative])
        assert_allclose(ci, expected_ci, rtol=1e-10)

        ci = zconfint(x1, x2, value=tc.null_value,
                      alternative=alternatives[tc.alternative])
        assert_allclose(ci, expected_ci - tc.null_value, rtol=1e-10)

    # 1 sample test copy-paste
    d1 = self.d1
    one_sample_cases = [ztest_mu_1s, ztest_smaller_mu_1s, ztest_larger_mu_1s]
    for tc in one_sample_cases:
        zstat, pval = ztest(x1, value=tc.null_value,
                            alternative=alternatives[tc.alternative])
        assert_allclose(zstat, tc.statistic, rtol=1e-10)
        assert_allclose(pval, tc.p_value, rtol=1e-10, atol=1e-16)

        zstat, pval = d1.ztest_mean(value=tc.null_value,
                                    alternative=alternatives[tc.alternative])
        assert_allclose(zstat, tc.statistic, rtol=1e-10)
        assert_allclose(pval, tc.p_value, rtol=1e-10, atol=1e-16)

        expected_ci = _open_ended(tc.conf_int)

        # Note: value is shifting our confidence interval in zconfint
        ci = zconfint(x1, value=0, alternative=alternatives[tc.alternative])
        assert_allclose(ci, expected_ci, rtol=1e-10)

        ci = d1.zconfint_mean(alternative=alternatives[tc.alternative])
        assert_allclose(ci, expected_ci, rtol=1e-10)