def CI_ttest(X1, X2):
    """Return the Welch 95% CI for the difference in means as a formatted string."""
    try:
        cm = sms.CompareMeans(sms.DescrStatsW(X1), sms.DescrStatsW(X2))
        out = cm.tconfint_diff(usevar='unequal')
        return '[%.2f, %.2f]' % (out[0], out[1])
    except Exception:  # a bare except would also swallow KeyboardInterrupt
        return 'error'
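# A minimal usage sketch for CI_ttest (illustrative, not from the original
# source), assuming the aliases used throughout these snippets:
import numpy as np
import statsmodels.stats.api as sms

rng = np.random.default_rng(0)
x1 = rng.normal(loc=1.0, scale=1.0, size=50)
x2 = rng.normal(loc=0.5, scale=2.0, size=60)
print(CI_ttest(x1, x2))  # e.g. a string like '[0.02, 1.15]'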
def lat_pattern(data):
    # data = ds.deltacloud.values
    lat_width = 1  # default is 1 degree
    dim = data.shape  # [3600, 7200]: rows, columns
    res = 180 / dim[0]
    lat_number = int(lat_width / res)  # number of 0.05-degree pixels to aggregate
    lat = np.zeros([6, int(dim[0] / lat_number)])
    lat[0, :] = np.arange(-90 + lat_width / 2, 90 + lat_width / 2, lat_width)
    k = 0
    for i in range(0, int(dim[0] - lat_number), lat_number):
        temp = data[i:i + int(lat_number), :].flatten()
        temp = temp[~np.isnan(temp)]
        lat[1, k] = temp.shape[0]  # sample size
        lat[2, k] = np.mean(temp)  # mean difference
        if stats.ttest_1samp(temp, 0).pvalue < 0.05:
            lat[3, k] = 1   # 1: significant
        else:
            lat[3, k] = -1  # -1: not significant
        # lower and upper bounds of the Welch CI against a zero baseline
        lat[4, k], lat[5, k] = sms.CompareMeans(
            sms.DescrStatsW(temp),
            sms.DescrStatsW(np.zeros(temp.shape))).tconfint_diff(usevar='unequal')
        k += 1
    return lat
def make_stats_row_from_df(cur_data, include_power, effect_size=None, alpha=None):
    '''Calculates output statistics given the data frame cur_data.

    If include_power, includes the power calculation. effect_size and alpha
    are only required/used if power is calculated.
    '''
    cur_row = {}
    sample_sizes = [np.sum(cur_data[action_header] == i) for i in range(1, 3)]

    # calculate sample sizes and means
    cur_row['sample_size_1'] = sample_sizes[0]
    cur_row['sample_size_2'] = sample_sizes[1]
    cur_row['mean_1'] = np.mean(cur_data[cur_data[action_header] == 1][obs_reward_header])
    cur_row['mean_2'] = np.mean(cur_data[cur_data[action_header] == 2][obs_reward_header])

    # calculate total reward
    cur_row['total_reward'] = np.sum(cur_data[obs_reward_header])

    # calculate power
    cur_row['ratio'] = sample_sizes[0] / sample_sizes[1]
    if include_power:
        cur_row['power'] = statsmodels.stats.power.tt_ind_solve_power(
            effect_size, cur_row['sample_size_1'], alpha, None, cur_row['ratio'])
        cur_row['actual_es'] = calculate_effect_size(
            cur_data[cur_data[action_header] == 1][obs_reward_header],
            cur_data[cur_data[action_header] == 2][obs_reward_header])

    # calculate t-test (pooled and Welch variants)
    comparer = sms.CompareMeans(
        sms.DescrStatsW(cur_data[cur_data[action_header] == 1][obs_reward_header]),
        sms.DescrStatsW(cur_data[cur_data[action_header] == 2][obs_reward_header]))
    cur_row['stat'], cur_row['pvalue'], cur_row['df'] = comparer.ttest_ind(usevar='pooled')
    cur_row['statUnequalVar'], cur_row['pvalueUnequalVar'], cur_row['dfUnequalVar'] = \
        comparer.ttest_ind(usevar='unequal')
    # cur_row['statSP'], cur_row['pvalueSP'] = stats.ttest_ind(
    #     cur_data[cur_data[action_header] == 1][obs_reward_header],
    #     cur_data[cur_data[action_header] == 2][obs_reward_header],
    #     equal_var=ASSUME_EQUAL_VAR)
    # cur_row['statOppSP'], cur_row['pvalueOppSP'] = stats.ttest_ind(
    #     cur_data[cur_data[action_header] == 1][obs_reward_header],
    #     cur_data[cur_data[action_header] == 2][obs_reward_header],
    #     equal_var=not ASSUME_EQUAL_VAR)
    return cur_row
def ttest_unit(control, treatment):
    tstat, pvalue = stats.ttest_ind(control, treatment, equal_var=False)
    cm = sms.CompareMeans(sms.DescrStatsW(control), sms.DescrStatsW(treatment))
    conf_interval = cm.tconfint_diff(usevar='unequal')
    print("T-statistic = %s\n" % tstat)
    print("p-value = %s\n" % pvalue)
    print("95% confidence interval = ")
    print(conf_interval)
def differential_methylation(meths_x, meths_y, req_inds):
    ## Here meths_x and meths_y have filter_pos_ix
    import statsmodels.stats.api as sms
    permeths_x = meths_x.get_permeths(meths_x.filter_pos_ix[req_inds])
    permeths_y = meths_y.get_permeths(meths_y.filter_pos_ix[req_inds])
    cm = sms.CompareMeans(sms.DescrStatsW(permeths_x), sms.DescrStatsW(permeths_y))
    return cm.ttest_ind()
def calc_stats(self, data, real=False):
    if not real:
        self.fake_stats = []

    # calculate initial energy distribution
    dmat = xgb.DMatrix(
        data[:, self.mid - 2:self.mid + 2, self.mid - 2:self.mid + 2].reshape(-1, 16))
    self.e_init = self.regressor.predict(dmat)
    hist_e_init = np.histogram(self.e_init, self.n_bins, density=True)[0]
    if real:
        self.real_stats.append(hist_e_init)
    else:
        self.fake_stats.append(hist_e_init)

    # calculate normalized energy stds over calo areas
    data_norm = data / self.e_init.reshape(-1, 1, 1)
    e_calo_norm_std = np.empty((self.n_bins, self.radius))

    # calculate Ei/E0
    e_i = self.ei_by_e0(data)
    if real:
        self.e_i = e_i
    e_i_mean = np.empty((self.n_bins, self.radius - 1))
    e_i_cint = np.empty((self.n_bins, self.radius - 1, 2))

    # calculate RMS of (E_calo / E_true) / <E_calo / E_true>
    e_calo_by_e_init = data.sum((1, 2)) / self.e_init
    e_dim_rms = np.empty(self.n_bins)

    for i in range(self.n_bins):
        idx = np.where((self.e_init >= self.e_bins[i])
                       & (self.e_init < self.e_bins[i + 1]))[0]
        for r in range(1, self.radius + 1):
            # energy stds
            tmp = data_norm[idx]
            e_calo_norm_std[i, r - 1] = tmp[self.masks[r - 1].repeat(
                tmp.shape[0], 0)].std()
            if r < self.radius:
                # Ei/E0
                e_i_mean[i, r - 1] = e_i[idx, r - 1].mean()
                if not real:
                    cm = sms.CompareMeans(
                        sms.DescrStatsW(self.e_i[idx, r - 1]),
                        sms.DescrStatsW(e_i[idx, r - 1]))
                    e_i_cint[i, r - 1, :] = cm.tconfint_diff(usevar='unequal')
        # RMS
        tmp = e_calo_by_e_init[idx]
        tmp /= tmp.mean()
        e_dim_rms[i] = (tmp ** 2).mean() ** 0.5

    if real:
        self.real_stats.append(e_calo_norm_std)
        self.e_i_mean = e_i_mean
        self.real_stats.append(e_i_mean)
        self.real_stats.append(e_dim_rms)
    else:
        self.fake_stats.append(e_calo_norm_std)
        e_i_cint = e_i_cint - self.e_i_mean[:, :, None] + e_i_mean[:, :, None]
        e_i_cint[:, :, 0] *= -1
        self.fake_stats.append(e_i_mean)
        self.fake_stats.append(e_i_cint)
        self.fake_stats.append(e_dim_rms)
def compute_mean_diff(c):
    """Computes the confidence interval for the difference in means of two series."""
    cm = sms.CompareMeans(
        sms.DescrStatsW(c.query("{} == 1".format(self.w_var[0]))[self.y_var[0]]),
        sms.DescrStatsW(c.query("{} == 0".format(self.w_var[0]))[self.y_var[0]]))
    return cm.tconfint_diff(usevar='unequal')
def channel_compare(self, data):
    data_type1 = data[data['sales_channel'] == 'auction_type1']['sell_days'].dropna()
    data_type2 = data[data['sales_channel'] == 'auction_type2']['sell_days'].dropna()
    t, p_value = ttest_ind(data_type1, data_type2)
    cm = sms.CompareMeans(sms.DescrStatsW(data_type2), sms.DescrStatsW(data_type1))
    print(cm.tconfint_diff(usevar='unequal'))
    if p_value < 0.05:
        print('type1 is sold faster than type2: ' + str(p_value))
    else:
        print('type1 is not significantly different from type2: ' + str(p_value))
    return p_value
def calcConfidenceInterval(self):
    x1 = np.divide([26.63, 22.27, 41.38, 39.06], 100)
    x2 = np.divide([18.42, 41.38, 34.55, 17.39], 100)
    # x1 = [22.66, 30.84, 40.70, 2.26]
    # x2 = [18.42, 18.97, 23.64, 10.87]
    # x1 = [14.97, 14.20, 19.97, 19.49]
    # x2 = [13.16, 13.79, 12.73, 32.61]
    # x1 = [37.74, 27.50, 17.17, 39.19]
    # x2 = [50, 25.86, 29.09, 39.13]
    cm = sms.CompareMeans(sms.DescrStatsW(x1), sms.DescrStatsW(x2))
    a, b = cm.tconfint_diff(usevar='unequal')
    print(a.round(2), b.round(2))
def comparemeans(var1, var2, alpha=0.05, alternative='two-sided'):
    '''
    Compare means based on two data samples.
    :param var1: dataframe var 1
    :param var2: dataframe var 2
    :param alternative: alternative hypothesis H1: diff != 0 ('two-sided'),
        diff > 0 ('larger'), or diff < 0 ('smaller')
    :param alpha: significance level
    :return: confidence interval for the difference in means
    '''
    cm = smstats.CompareMeans(smstats.DescrStatsW(var1), smstats.DescrStatsW(var2))
    ci = cm.tconfint_diff(alpha=alpha, alternative=alternative)
    print("{0}Confidence Interval - Compare Means{0}".format("=" * 5))
    print("=" * 50)
    print("LCI:\t{0}\nUCI:\t{1}\n\nconfidence interval:\t{2}".format(ci[0], ci[1], ci))
    print("=" * 50)
    return ci
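# A minimal usage sketch for comparemeans (illustrative, not from the
# original source), assuming smstats is the statsmodels.stats.api alias
# the function above uses:
import statsmodels.stats.api as smstats

before = [12.1, 11.8, 13.0, 12.4, 12.9]
after = [11.2, 11.5, 10.9, 11.8, 11.1]
ci = comparemeans(before, after, alpha=0.05, alternative='two-sided')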
def two_means_diff_conf_interval(values1: np.ndarray,
                                 values2: np.ndarray,
                                 conf_level: float,
                                 pooled: bool = False) -> tuple:
    """Calculates a confidence interval for the difference of two means.

    Args:
        values1 (np.ndarray): sample 1 values
        values2 (np.ndarray): sample 2 values
        conf_level (float): confidence level
        pooled (bool, optional): whether to use a pooled std. Defaults to False.

    Returns:
        tuple: lower and upper bounds of the confidence interval
    """
    cm = sms.CompareMeans(sms.DescrStatsW(values1), sms.DescrStatsW(values2))
    alpha = 1 - conf_level
    diff_ci = cm.tconfint_diff(usevar='pooled' if pooled else 'unequal', alpha=alpha)
    return diff_ci
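# A minimal usage sketch for two_means_diff_conf_interval (illustrative,
# not from the original source), assuming sms = statsmodels.stats.api and
# np = numpy as elsewhere in this file:
import numpy as np
import statsmodels.stats.api as sms

a = np.array([5.1, 4.9, 5.4, 5.0, 5.2])
b = np.array([4.6, 4.8, 4.5, 4.9, 4.7])
low, high = two_means_diff_conf_interval(a, b, conf_level=0.95, pooled=False)
print(f"95% CI for the mean difference: ({low:.3f}, {high:.3f})")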
def alternative_2samp_test_for_same_mean(dataset1, dataset2, confidence_level: float):
    # Test whether the population means are equal
    assert confidence_level > 0.8
    import statsmodels.stats.api as sms
    significance_level = 1 - confidence_level
    # note: DescrStatsW.tconfint_mean() (one-sample CI) and
    # CompareMeans.tconfint_diff() (CI for the difference) are DIFFERENT!
    cm = sms.CompareMeans(sms.DescrStatsW(dataset1), sms.DescrStatsW(dataset2))
    # ttest_ind returns (statistic, p-value, degrees of freedom);
    # tconfint_diff would return CI bounds, not a p-value
    stat, p, _ = cm.ttest_ind(usevar='pooled')
    print(
        'Assuming that the two datasets are normally distributed and independent, '
        'result of an alternative test:\n p == {}'.format(p), end='')
    if p > significance_level:
        print(' > {}, fail to reject H0 (that the population means of the two '
              'datasets are equal)'.format(significance_level))
    else:
        print(' <= {}, reject H0 (that the population means of the two '
              'datasets are equal)'.format(significance_level))
# Question 1

# In[6]:

import statsmodels.stats.api as sms

# In[7]:

holiday1 = day[day['holiday'] == 1]
holiday0 = day[day['holiday'] == 0]

# In[8]:

import scipy as sp

cm = sms.CompareMeans(sms.DescrStatsW(holiday1.cnt), sms.DescrStatsW(holiday0.cnt))
cm.ttest_ind(usevar='pooled')

# In[9]:

cm.tconfint_diff(usevar='pooled')

# Question 2
# No difference: the p-value is above 0.05, which contradicts the alternative
# hypothesis; the confidence interval containing 0 supports this.

# In[10]:

import matplotlib.pyplot as plt

# In[11]:
        'gender': get_patient_sex(scan)
    })
    break

gender_df = pd.DataFrame(gender_results)
df = pd.read_csv(f"{SIMPLE_MULTIPLE_WINDOWS}_by_patient_result.csv").merge(gender_df)
print(df.gender.value_counts())
male_df = df[df.gender == 'M']
female_df = df[df.gender == 'F']

plt.figure()
male_df.dice.hist(bins=20)
plt.savefig('male.png')
plt.figure()
female_df.dice.hist(bins=20)
plt.savefig('female.png')

print(f"Male var: {male_df.dice.var()}; Female var: {female_df.dice.var()}")
print('t-test (tstat, pvalue, df)')
print(sms.ttest_ind(male_df.dice.values, female_df.dice.values))
cm = sms.CompareMeans(sms.DescrStatsW(male_df.dice.values),
                      sms.DescrStatsW(female_df.dice.values))
print(f"CI: {cm.tconfint_diff(usevar='unequal')}")
##### Two-sample t-test for two groups
t, p = ttest_ind(a, b, equal_var=False)
print("ttest_ind: t = %g  p = %g" % (t, p))
# ttest_ind: t = 5.71367  p = 1.97862e-06

# one-tail
p = p / 2

t, p = ttest_ind([e - 30000 for e in a], b, equal_var=False)
print("ttest_ind 30k: t = %g  p = %g" % (t, p))
t, p = ttest_ind([e - 32000 for e in a], b, equal_var=False)
print("ttest_ind 32k: t = %g  p = %g" % (t, p))
t, p = ttest_ind([e - 33000 for e in a], b, equal_var=False)
print("ttest_ind 33k: t = %g  p = %g" % (t, p))
t, p = ttest_ind([e - 34000 for e in a], b, equal_var=False)
print("ttest_ind 34k: t = %g  p = %g" % (t, p))
t, p = ttest_ind([e - 35000 for e in a], b, equal_var=False)
print("ttest_ind 35k: t = %g  p = %g" % (t, p))
t, p = ttest_ind([e - 40000 for e in a], b, equal_var=False)
print("ttest_ind 40k: t = %g  p = %g" % (t, p))

## 95% confidence interval for the difference of the sample means of the two groups
cm = sms.CompareMeans(sms.DescrStatsW(a), sms.DescrStatsW(b))
print(cm.tconfint_diff(usevar='unequal'))
# Compare the errors of the linear regression and the random forest on the test set:

# In[361]:

plt.figure()
plt.hist(abs(y_test - lm.predict(X_test)) - abs(y_test - rf.predict(X_test)),
         bins=15, density=True)
plt.xlabel('Difference of absolute errors')
plt.show()

# The differences between the mean absolute errors are significant:

# In[362]:

tmeans = sm.CompareMeans(sm.DescrStatsW(abs(y_test - lm.predict(X_test))),
                         sm.DescrStatsW(abs(y_test - rf.predict(X_test))))
print('Mean difference of absolute errors: %f' % np.mean(
    abs(y_test - lm.predict(X_test)) - abs(y_test - rf.predict(X_test))))
tmeans.ttest_ind(alternative='two-sided', usevar='pooled', value=0)[1]

# 95% confidence interval for the mean difference of absolute errors:

# In[363]:

tmeans.tconfint_diff(alpha=0.05, alternative='two-sided', usevar='pooled')

# Let's see which features have the greatest predictive power:

# In[364]:

importances = pd.DataFrame(
# Initialize a dataframe with test stats
test_stats = pd.DataFrame(columns=['pct_lft', 'conf_int_lb', 'conf_int_rb', 'p-value'])

# Concatenate the test stats with both summaries
test_summary1 = pd.concat([test_summary1, test_stats], axis=1,
                          ignore_index=False, sort=False)
# Calculate pct lift for all the metrics
test_summary1['pct_lft'] = (test_summary1['TestB'] / test_summary1['TestA']) - 1

test_summary2 = pd.concat([test_summary2, test_stats], axis=1,
                          ignore_index=False, sort=False)
# Calculate pct lift for all the metrics
test_summary2['pct_lft'] = (test_summary2['TestB'] / test_summary2['TestA']) - 1

# Calculate the test stats
for i in test_summary2.index:
    # Comparing means
    cm = sms.CompareMeans(
        sms.DescrStatsW(test_data_A_clean[i][test_data_A_clean[i].notnull()]),
        sms.DescrStatsW(test_data_B_clean[i][test_data_B_clean[i].notnull()]))
    # Extract the left and right boundaries (90% CI)
    lb, rb = cm.tconfint_diff(usevar='unequal', alternative='two-sided', alpha=0.10)
    # Convert lb and rb to lift bounds
    test_summary2.at[i, 'conf_int_lb'] = (rb * -1) / test_data_A_clean[i].mean()
    test_summary2.at[i, 'conf_int_rb'] = (lb * -1) / test_data_A_clean[i].mean()
    # p-value
    t_stat, test_summary2.at[i, 'p-value'] = st.ttest_ind(
        test_data_A_clean[i][test_data_A_clean[i].notnull()],
        test_data_B_clean[i][test_data_B_clean[i].notnull()],
        equal_var=False)

print(test_summary2)
mean = dataseta.mean()
# std = dataseta.std()
interval = stats.t.interval(0.95, len(datasetb) - 1, mean, stddev2)
print(interval)
# print(levene(dataseta, datasetb))
# print(ttest_ind(dataseta, datasetb, equal_var=True))
# print(ttest_ind(dataseta, datasetb, equal_var=False))

from scipy.stats import levene

print("====%%%%%===", levene(dataseta, datasetb, center='trimmed'))
cm = sms.CompareMeans(sms.DescrStatsW(dataseta), sms.DescrStatsW(datasetb))
print(cm.tconfint_diff(alpha=0.05, usevar='pooled'))
cm = sms.CompareMeans(sms.DescrStatsW(dataseta), sms.DescrStatsW(datasetb))
print(cm.tconfint_diff(alpha=0.05, usevar='unequal'))
# print(stats.t.ppf(1 - 0.05, 5))

import statsmodels.api as sm

print("+++", sm.stats.ttest_ind(dataseta, datasetb, usevar='pooled'))
print("+++", sm.stats.ttest_ind(dataseta, datasetb, usevar='unequal'))
# interval = stats.t.interval(0.95, len(dataseta) - 1, mean, stddev1)
# print(interval)

(statistic, pvalue) = stats.ttest_ind_from_stats(mean1=mean1, std1=stddev1,
# In[4]:

# Question 1
data = pd.read_csv('C:/Users/USER/Desktop/test/telecom.csv')
data = data.dropna()
data = data[data.CHURNED != 'InVol']
data['CHURNED_NEW'] = np.where(data['CHURNED'] == 'Current', 'No', 'Yes')
data

# In[5]:

# Question 2
new_yes = data[data['CHURNED_NEW'] == 'Yes'].LOCAL
new_no = data[data['CHURNED_NEW'] == 'No'].LOCAL
cm = sms.CompareMeans(sms.DescrStatsW(new_yes), sms.DescrStatsW(new_no))
print(cm.ttest_ind(usevar='pooled'))
print(cm.tconfint_diff())

# In[60]:

# Question 3
x = data[['LONGDIST', 'International', 'LOCAL', 'AGE', 'CHILDREN', 'Est_Income']]
y = data['CHURNED_NEW']
y_ohe = pd.get_dummies(y, drop_first=True)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.cluster.hierarchy import fcluster
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler

data = pd.read_csv('C:/Users/USER/Desktop/test/facebook.csv')
data = data.dropna()
# data.isnull().any().any()  # returns whether any NaN values remain

# #### (2) Statistically test whether the mean number of likes differs between Photo and Video posts. (5 points)

# In[102]:

Photo_mean = data[data['Type'] == 'Photo'].Like
Video_mean = data[data['Type'] == 'Video'].Like
cm = sms.CompareMeans(sms.DescrStatsW(Photo_mean), sms.DescrStatsW(Video_mean))
print(cm.ttest_ind(usevar='unequal'))
print(cm.tconfint_diff())
print('The t value is at most 0.5 and the p-value exceeds 0.05, so we conclude '
      'there is no difference between the two group means.')

# #### (3) Create a column named day that is 1 for Monday through Friday and 0 for Saturday and Sunday, then statistically test whether the mean number of users the posts were shown to differs between day == 1 and day == 0. (8 points)

# In[101]:

# data['day'] = np.where((data['Weekday'] == 6) | (data['Weekday'] == 7), 0, 1)  # either approach works
data['day'] = data['Weekday'].apply(lambda x: 0 if (x == 6) | (x == 7) else 1)
day1_mean = data[data['day'] == 1].Impressions
day2_mean = data[data['day'] == 0].Impressions
cm = sms.CompareMeans(sms.DescrStatsW(day1_mean), sms.DescrStatsW(day2_mean))
print(sp.stats.ttest_ind(day1_mean, day2_mean, equal_var=False))
print(cm.ttest_ind(usevar='unequal'))
def two_population(a, b, alpha=.05, consistency='equal', option='right',
                   show_table=False, stages=[1, 2, 3], show=True, precision=4,
                   matched_pairs=False):
    """
    + [First stage]: F statistic
      - consistency: 'equal', 'left' (1 is more consistent than 2),
        'right' (2 is more consistent than 1)
    + [Second stage]: t test
    + [Third stage]: confidence interval

    Returns a result_dict regardless of stages.
    """
    opt = option.lower()[0]
    results = ""
    const = consistency.lower()[0]
    result_dict = dict()
    df_1 = len(a) - 1
    df_2 = len(b) - 1

    if 1 in stages:
        varall = [stats.describe(a).variance, stats.describe(b).variance]
        f_value = varall[0] / varall[1]
        result_dict['varall'] = varall
        result_dict['f_value'] = f_value
        ptmp = stats.f.cdf(f_value, df_1, df_2)
        if const == 'e':
            if ptmp > 0.5:
                ptmp = 1 - ptmp
            p_value = ptmp * 2
            rej_upper = stats.f.ppf(1 - alpha / 2, df_1, df_2)
            rej_lower = stats.f.ppf(alpha / 2, df_1, df_2)
            result_dict['f_rej_upper'] = rej_upper
            result_dict['f_rej_lower'] = rej_lower
            flag = f_value < rej_lower or f_value > rej_upper
            text = 'unequal variances'
        else:
            rej_upper = stats.f.ppf(1 - alpha, df_1, df_2)
            rej_lower = stats.f.ppf(alpha, df_1, df_2)
            if const == 'r':
                result_dict['f_rej_upper'] = rej_upper
                p_value = 1 - ptmp
                flag = f_value > rej_upper
                text = 'σ_1/σ_2 > 1'
            else:
                result_dict['f_rej_lower'] = rej_lower
                p_value = ptmp
                flag = f_value < rej_lower
                text = 'σ_1/σ_2 < 1'
        result_dict['p_value'] = p_value
        results = f"""
F Statistics
===================================
F statistic = {f_value:.{precision}f}
p-value = {p_value:.{precision}f} ({inter_p_value(p_value)})
Reject H_0 ({text}) → {flag}
"""

    if 2 in stages:
        if matched_pairs:
            samp_diff = a - b
            nobs = samp_diff.shape[0]
            df = nobs - 1
            tmpdesc = stats.describe(samp_diff)
            t_value = tmpdesc.mean / (tmpdesc.variance ** 0.5) * (nobs ** 0.5)
            # p-values
            ptmp = stats.t.cdf(t_value, df)
            if opt == 'r':
                text = 'one-tail'
                tcv = stats.t.ppf(1 - alpha, df=df)
                p_value = 1 - ptmp
            elif opt == 'l':
                text = 'one-tail'
                p_value = ptmp
                tcv = stats.t.ppf(alpha, df=df)
            else:
                text = 'two-tail'
                tcv = stats.t.ppf(1 - alpha / 2, df=df)
                if ptmp > 0.5:
                    ptmp = 1 - ptmp
                p_value = ptmp * 2
            flag = p_value < alpha
            results += f"""
t Test
===================================
t (Observed value) = {t_value:.{precision}f}
p-value ({text}) = {p_value:.{precision}f} ({inter_p_value(p_value)})
t (Critical, {text}) = {tcv:.{precision}f}
DF = {df:.{precision}f}
Reject H_0 → {flag}
"""
            result_dict['t_p_value'] = p_value
            result_dict['t_critical_value'] = tcv
            result_dict['t_observed_value'] = t_value

            t_alpha = stats.t.ppf(1 - alpha / 2, df)
            std_xbar = (tmpdesc.variance / nobs) ** 0.5
            LCL = tmpdesc.mean - t_alpha * std_xbar
            UCL = tmpdesc.mean + t_alpha * std_xbar
            con_coef = 1 - alpha
            conf_interval = [LCL, UCL]
            result_dict['conf_interval'] = conf_interval
            results += f"""
Confidence Interval
===================================
{con_coef * 100:.1f}% Confidence Interval: [{LCL:.{precision}f}, {UCL:.{precision}f}]
"""
        else:
            # relies on `flag` from the F-test stage to pick the variance assumption
            if flag:  # True == unequal variances
                ttest_result = stats.ttest_ind(a, b, equal_var=False)
                t_summary = list(ttest_result)
                t_critical_two = stats.t.ppf(1 - alpha / 2, df=(df_1 + df_2))
                if opt == 'r':
                    t_critical_one = stats.t.ppf(1 - alpha, df=(df_1 + df_2))
                    result_dict['t_critical_one'] = t_critical_one
                elif opt == 'l':
                    t_critical_one = stats.t.ppf(alpha, df=(df_1 + df_2))
                    result_dict['t_critical_one'] = t_critical_one
                if opt == 't':
                    flag = t_summary[1] < alpha
                    result_dict['t_critical_two'] = t_critical_two
                    result_dict['t_observed_value'] = t_summary[0]
                    result_dict['t_p_value'] = t_summary[1]
                    result_dict['df'] = df_1 + df_2
                    results += f"""
t Test
===================================
t (Observed value) = {t_summary[0]:.{precision}f}
p-value (two-tail) = {t_summary[1]:.{precision}f} ({inter_p_value(t_summary[1])})
t (Critical, two-tail) = {t_critical_two:.{precision}f}
DF = {(df_1 + df_2):.{precision}f}
Reject H_0 → {flag}
"""
                else:
                    flag = t_summary[1] / 2 < alpha
                    result_dict['t_observed_value'] = t_summary[0]
                    result_dict['t_p_value'] = t_summary[1] / 2
                    result_dict['df'] = df_1 + df_2
                    results += f"""
t Test
===================================
t (Observed value) = {t_summary[0]:.{precision}f}
p-value (one-tail) = {(t_summary[1] / 2):.{precision}f} ({inter_p_value(t_summary[1] / 2)})
t (Critical, one-tail) = {t_critical_one:.{precision}f}
DF = {(df_1 + df_2):.{precision}f}
Reject H_0 → {flag}
"""
                if 3 in stages:
                    cm_result = sms.CompareMeans(sms.DescrStatsW(a), sms.DescrStatsW(b))
                    conf_table = cm_result.summary(usevar='unequal', alpha=alpha)
                    conf_interval = list(
                        map(float, conf_table.as_text().split('\n')[4].split()[6:]))
                    con_coef = 1 - alpha
                    # record result
                    result_dict['conf_interval'] = conf_interval
                    results += f"""
Confidence Interval
===================================
{con_coef * 100:.1f}% Confidence Interval: [{conf_interval[0]:.{precision}f}, {conf_interval[1]:.{precision}f}]
"""
            else:
                ttest_result = stats.ttest_ind(a, b, equal_var=True)
                t_summary = list(ttest_result)
                t_critical_two = stats.t.ppf(1 - alpha / 2, df=(df_1 + df_2))
                if opt == 'r':
                    t_critical_one = stats.t.ppf(1 - alpha, df=(df_1 + df_2))
                    result_dict['t_critical_one'] = t_critical_one
                elif opt == 'l':
                    t_critical_one = stats.t.ppf(alpha, df=(df_1 + df_2))
                    result_dict['t_critical_one'] = t_critical_one
                if opt == 't':
                    flag = t_summary[1] < alpha
                    result_dict['t_critical_two'] = t_critical_two
                    result_dict['t_observed_value'] = t_summary[0]
                    result_dict['t_p_value'] = t_summary[1]
                    result_dict['df'] = df_1 + df_2
                    results += f"""
t Test
===================================
t (Observed value) = {t_summary[0]:.{precision}f}
p-value (two-tail) = {t_summary[1]:.{precision}f} ({inter_p_value(t_summary[1])})
t (Critical, two-tail) = {t_critical_two:.{precision}f}
DF = {(df_1 + df_2):.{precision}f}
Reject H_0 → {flag}
"""
                else:
                    flag = t_summary[1] / 2 < alpha
                    result_dict['t_observed_value'] = t_summary[0]
                    result_dict['t_p_value'] = t_summary[1] / 2
                    result_dict['df'] = df_1 + df_2
                    results += f"""
t Test
===================================
t (Observed value) = {t_summary[0]:.{precision}f}
p-value (one-tail) = {(t_summary[1] / 2):.{precision}f} ({inter_p_value(t_summary[1] / 2)})
t (Critical, one-tail) = {t_critical_one:.{precision}f}
DF = {(df_1 + df_2):.{precision}f}
Reject H_0 → {flag}
"""
                if 3 in stages:
                    cm_result = sms.CompareMeans(sms.DescrStatsW(a), sms.DescrStatsW(b))
                    conf_table = cm_result.summary(usevar='pooled', alpha=alpha)
                    conf_interval = list(
                        map(float, conf_table.as_text().split('\n')[4].split()[6:]))
                    # record result
                    result_dict['conf_interval'] = conf_interval
                    con_coef = 1 - alpha
                    results += f"""
Confidence Interval
===================================
{con_coef * 100:.1f}% Confidence Interval: [{conf_interval[0]:.{precision}f}, {conf_interval[1]:.{precision}f}]
"""
    if show_table == True and 3 in stages:
        results += f"""{conf_table.as_text()}"""
    if show == True:
        print(results)
    return result_dict
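# A minimal usage sketch for two_population (illustrative, not from the
# original source). It assumes `stats` (scipy.stats), `sms`
# (statsmodels.stats.api), and the inter_p_value() helper called above are
# already in scope, as the function definition expects.
import numpy as np

rng = np.random.default_rng(7)
group_a = rng.normal(loc=10.0, scale=2.0, size=30)
group_b = rng.normal(loc=11.0, scale=2.0, size=35)
# Run all three stages with a two-tailed t test; the returned dict carries
# the F statistic, t statistic, p-values, and the confidence interval.
res = two_population(group_a, group_b, alpha=0.05, consistency='equal',
                     option='two', show=False)
print(res['conf_interval'])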
def get_prepost_stats(pre, post, test_group_column='test_group',
                      experiment_unit='event_date', control='CONTROL', test='TEST',
                      alpha=0.05, printerror=True):
    """Generate pre-post statistics given two dataframes from the pre and post
    periods with test and control groups.

    # Arguments
        pre (DataFrame): Pandas dataframe for pre data; must include control and test groups.
        post (DataFrame): Pandas dataframe for post data; must include control and test groups.
        experiment_unit (str): Experiment unit for stats.
        test_group_column (str): Column used to identify test and control groups.
        control (str): Name of the control group in test_group_column. Default: CONTROL.
        test (str): Name of the test group in test_group_column. Default: TEST.
        alpha (float): Significance level for calculating p-values and confidence intervals.

    # Returns
        Dataframe for each metric with pre-post summary and statistics.
    """
    df = {}
    df['pre'] = pre
    df['post'] = post
    metrics = df['pre'].drop([test_group_column, experiment_unit], axis=1).columns
    results = []
    for metric in metrics:
        # if (pre[metric].count() <= 2) | (pre[metric].count() <= 1):
        #     print('Insufficient data: ' + metric)
        #     continue  # Skip empty results
        stats = {'pre': {}, 'post': {}}
        try:
            for i in ('pre', 'post'):
                stats[i] = get_relative_diff(df[i], i, metric)['stats']
            cm = sms.CompareMeans(stats['post']['desc'], stats['pre']['desc'])
            ci = cm.tconfint_diff(usevar='unequal')
            t, p, dof = cm.ttest_ind(usevar='unequal')
            power = tt_ind_solve_power(
                effect_size=t,
                nobs1=stats['pre']['nobs'],
                ratio=stats['post']['nobs'] / stats['pre']['nobs'],
                alpha=0.05)
        except Exception as e:
            if printerror is True:
                print(e)
            # print('Insufficient data: ' + metric)
            continue  # Skip empty results
        results.append(OrderedDict(
            metric=metric,
            pre_days=stats['pre']['nobs'],
            pre_control_metric_sum=stats['pre']['control_metric_sum'],
            pre_test_metric_sum=stats['pre']['test_metric_sum'],
            pre_control_metric_mean=stats['pre']['control_metric_mean'],
            pre_test_metric_mean=stats['pre']['test_metric_mean'],
            pre_delta_mean=stats['pre']['mean'],
            pre_delta_lcl=stats['pre']['metric_delta_lcl'],
            pre_delta_ucl=stats['pre']['metric_delta_ucl'],
            post_days=stats['post']['nobs'],
            post_control_metric_sum=stats['post']['control_metric_sum'],
            post_test_metric_sum=stats['post']['test_metric_sum'],
            post_control_metric_mean=stats['post']['control_metric_mean'],
            post_test_metric_mean=stats['post']['test_metric_mean'],
            post_delta_mean=stats['post']['mean'],
            post_delta_lcl=stats['post']['metric_delta_lcl'],
            post_delta_ucl=stats['post']['metric_delta_ucl'],
            prepost_delta=stats['post']['mean'] - stats['pre']['mean'],
            prepost_delta_lcl=ci[0],
            prepost_delta_ucl=ci[1],
            prepost_delta_plus_minus=(ci[1] - ci[0]) / 2,
            prepost_delta_pvalue=p,
            net_impact=stats['post']['control_metric_mean'] * (stats['post']['mean'] - stats['pre']['mean']),
            net_lcl=stats['post']['control_metric_mean'] * (1 + ci[0]),
            net_ucl=stats['post']['control_metric_mean'] * (1 + ci[1]),
            net_plus_minus=stats['post']['control_metric_mean'] * (ci[1] - ci[0]) / 2,
            power=power,
        ))
    results_df = pd.DataFrame(results)
    try:
        results_df['Prepost Delta w/CI (%)'] = results_df[
            ['prepost_delta', 'prepost_delta_plus_minus']].apply(
            lambda row: '{0:+.2f}\u00B1{1:.2f}%'.format(*row * 100)
            if not pd.isnull(row[0]) else '-', axis=1)
    except Exception:
        results_df['Prepost Delta w/CI (%)'] = '-'
    results_df.rename(columns=dict(
        metric='Metric',
        pre_days='Pre Days',
        pre_control_metric_sum='Pre Control Metric Sum',
        pre_test_metric_sum='Pre Test Metric Sum',
        pre_control_metric_mean='Pre Control Metric Mean',
        pre_test_metric_mean='Pre Test Metric Mean',
        pre_delta_mean='Pre Delta (%)',
        pre_delta_lcl='Pre Delta LCL (%)',
        pre_delta_ucl='Pre Delta UCL (%)',
        post_days='Post Days',
        post_control_metric_sum='Post Control Metric Sum',
        post_test_metric_sum='Post Test Metric Sum',
        post_control_metric_mean='Post Control Metric Mean',
        post_test_metric_mean='Post Test Metric Mean',
        post_delta_mean='Post Delta (%)',
        post_delta_lcl='Post Delta LCL (%)',
        post_delta_ucl='Post Delta UCL (%)',
        prepost_delta='PrePost Delta (%)',
        prepost_delta_lcl='PrePost Delta LCL (%)',
        prepost_delta_ucl='PrePost Delta UCL (%)',
        prepost_delta_pvalue='p-value',
        net_impact='Net Impact',
        power='Power (%)',
    ), inplace=True)
    # Extra calculations
    return results_df
plt.ylabel('Log2 Fold change', fontsize=25)
plt.xticks(range(1, len(experiment) * 2, 2), experiment, rotation=90, fontsize=20)
plt.yticks(fontsize=20)
plt.legend(handles=[patch_msn24_targets, patch_not_msn24_targets])
plt.tight_layout()
plt.savefig('%s/msn24_ko_heat_shock_vs_steady_state.pdf' % fig_dir)

effect = []
pval = []
se = []
for i in range(len(diff_msn24_targets)):
    temp = diff_not_msn24_targets[i].mean() - diff_msn24_targets[i].mean()
    effect.append(temp)
    ci = sms.CompareMeans(
        sms.DescrStatsW(diff_not_msn24_targets[i].tolist()),
        sms.DescrStatsW(diff_msn24_targets[i].tolist())).tconfint_diff(usevar='unequal')
    temp = (ci[1] - ci[0]) / float(2)
    se.append(temp)

plt.figure(figsize=(20, 10))
plt.axhline(y=0.0, color='r', linestyle='--')
plt.errorbar(range(len(experiment)), effect, yerr=se, fmt='o')
plt.xlabel('')
plt.ylabel('Mean log2 Fold change', fontsize=25)
plt.xticks(range(len(experiment)), experiment, rotation='vertical', fontsize=20)
plt.yticks(fontsize=20)
plt.tight_layout()
plt.savefig('%s/msn24_ko_heat_shock_vs_steady_state.diff.pdf' % fig_dir)
# Construct a confidence interval for the difference of mean SBP between RACE=1 and RACE=2.
race_1_data = d1.loc[(d1['RACE'] == 1) & (pd.notnull(d1['RACE']))]
race_2_data = d1.loc[(d1['RACE'] == 2) & (pd.notnull(d1['RACE']))]
test_variables = ['AGE', 'SBP', 'DBP', 'WT', 'BMI', 'TC']

# if you want to see a summary of the new datasets
race_1_stats = race_1_data[test_variables].describe()
race_2_stats = race_2_data[test_variables].describe()

# The objective is to construct a confidence interval for the difference of
# mean SBP between the two races, so we drop the missing values of SBP.
sbp_race_1_stats_obj = sms.DescrStatsW(race_1_data['SBP'].dropna())
sbp_race_2_stats_obj = sms.DescrStatsW(race_2_data['SBP'].dropna())
sbp_mean_comparison_obj = sms_api.CompareMeans(sbp_race_1_stats_obj, sbp_race_2_stats_obj)
ci_for_diff_btw_mean = sbp_mean_comparison_obj.tconfint_diff()
print(ci_for_diff_btw_mean)

# Q7
# Construct a confidence interval for the proportion of smokers.
table = pd.crosstab(d9['SMOKE'], columns='count')
print(table)

import statsmodels.stats.proportion as one

ci_low, ci_upp = one.proportion_confint(74, 1868, alpha=0.05, method='normal')
print(ci_low, ci_upp)

# Q8
# Also construct a confidence interval for the difference of proportions of smokers between RACE=1 and RACE=2.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import ttest_ind
import statsmodels.stats.api as sms

GE = pd.read_csv('C:/Users/anivia/Desktop/geDJ.txt', sep=r"\s+", header=None,
                 names=['date', 'open', 'high', 'low', 'close', 'vol'])
SP = pd.read_csv(
    'https://www.math.ust.hk/~macwyu/MAFS5110_2018-2019/MAFS5110_2018-2019/Chapter_1/sp500.txt',
    sep=r"\s+")
logreturn_GE = np.diff(np.log(np.array(GE["close"])))
logreturn_sp500 = np.diff(np.log(np.array(SP["close"])))

da2 = pd.concat([pd.DataFrame(logreturn_GE), pd.DataFrame(logreturn_sp500)], axis=1)
# da2.columns = ['date', 'open', 'high', 'low', 'close', 'vol', 'logreturn_sp500']
# da2.index = da.index[1:]
da2.columns = ["logreturn_GE", "logreturn_sp500"]
da2.boxplot(column=['logreturn_GE', 'logreturn_sp500'])
plt.show()

print(stats.mood(logreturn_sp500, logreturn_GE))
print('H0 can be rejected; the variances are significantly different')
print(ttest_ind(logreturn_sp500, logreturn_GE, equal_var=True))
print('')
cm = sms.CompareMeans(sms.DescrStatsW(logreturn_sp500), sms.DescrStatsW(logreturn_GE))
print(cm.tconfint_diff())
plt.subplot(122)
plt.scatter(y_test, rf.predict(X_test), color="red", alpha=0.1)
plt.xlim(2, 10)
plt.ylim(2, 10)
plt.plot(range(11), color='black')
plt.grid()
plt.title('Test set', fontsize=16)
plt.xlabel('Quality')
plt.ylabel('Estimated quality')

# The coefficient of determination for the random forest
rf.score(X_test, y_test)

# Compare the errors of the linear regression and the random forest on the test set
plt.figure(figsize=(8, 6))
plt.hist(abs(y_test - lm.predict(X_test)) - abs(y_test - rf.predict(X_test)),
         bins=16, density=True)
plt.xlabel('Difference of absolute errors')

# The differences between the mean absolute errors are significant
tmeans = sm.CompareMeans(sm.DescrStatsW(abs(y_test - lm.predict(X_test))),
                         sm.DescrStatsW(abs(y_test - rf.predict(X_test))))
tmeans.ttest_ind(alternative='two-sided', usevar='pooled', value=0)[1]

# 95% confidence interval for the mean difference of absolute errors
tmeans.tconfint_diff(alpha=0.05, alternative='two-sided', usevar='pooled')

importances = pd.DataFrame(list(zip(X_train.columns, rf.feature_importances_)))
importances.columns = ['feature name', 'importance']
importances.sort_values('importance', ascending=False)

# The alcohol content has the greatest influence on the expert evaluation of wine quality.
treatment_df["active_mins"].describe()
control_df["active_mins"].describe()
# note that the mean active_mins is higher in the experimental-group dataframe
# than in the control group

# conduct t-test
stats.ttest_ind(treatment_df["active_mins"], control_df["active_mins"], equal_var=False)
# output: t-statistic = 30.686846737487123 and p-value < .05

# now we're going to find the 95% confidence interval
x1 = treatment_df["active_mins"]
x2 = control_df["active_mins"]

# going to use statsmodels
cm = sms.CompareMeans(sms.DescrStatsW(x1), sms.DescrStatsW(x2))
print(cm.tconfint_diff(usevar='unequal'))

####################################################################################
# PAGE 4

# read in the dataframes wrangled in R
ctrl_df_pg4 = pd.read_csv("/Users/ankushbharadwaj/Desktop/ctrl_df_pg4.csv")
exp_df_pg4 = pd.read_csv("/Users/ankushbharadwaj/Desktop/exp_df_pg4.csv")

# STEP 1: REMOVE OUTLIERS
# going to remove outliers more than 3 standard deviations from the mean
# get the standard deviation of active minutes per user per day for each group
std_exp = np.std(exp_df_pg4["active_mins"])
std_ctrl = np.std(ctrl_df_pg4["active_mins"])
def calc_student_ttest_result(a, b, confidence):
    result = {}
    # a = [1, 3, 5, 17, 9]
    # b = [12, 4, 6, 8, 10, 41]
    result['group_1_N'] = len(a)
    result['group_2_N'] = len(b)
    if len(a) < 2 or len(b) < 2:
        # not enough data: fill every remaining field with the "-1" sentinel
        for key in ('group_1_mean', 'group_1_std', 'group_1_std_error',
                    'group_2_mean', 'group_2_std', 'group_2_std_error',
                    'group_unequal_low', 'group_unequal_up',
                    'group_equal_low', 'group_equal_up',
                    'group_equal_t', 'group_equal_p', 'group_equal_free_degree',
                    'group_unequal_t', 'group_unequal_p', 'group_unequal_free_degree',
                    'group_equal_mean_error', 'group_unequal_mean_error',
                    'F', 'sig',
                    'group_unequal_std_error', 'group_equal_std_error'):
            result[key] = "-1"
        return result

    # project-local statistics helper returns (mean, ..., std, ...)
    mean1, _, stddev1, _, _, _ = statistics.stats(a, confidence)
    result['group_1_mean'] = utils.get_Decimal_float(mean1)
    result['group_1_std'] = utils.get_Decimal_float(stddev1)
    result['group_1_std_error'] = utils.get_Decimal_float(stddev1 / math.sqrt(len(a)))

    mean2, _, stddev2, _, _, _ = statistics.stats(b, confidence)
    result['group_2_mean'] = utils.get_Decimal_float(mean2)
    result['group_2_std'] = utils.get_Decimal_float(stddev2)
    result['group_2_std_error'] = utils.get_Decimal_float(stddev2 / math.sqrt(len(b)))

    import statsmodels.stats.api as sms
    cm = sms.CompareMeans(sms.DescrStatsW(a), sms.DescrStatsW(b))
    tconfint_diff = cm.tconfint_diff(alpha=1.0 - confidence, usevar='unequal')
    result['group_unequal_low'] = utils.get_Decimal_float(tconfint_diff[0])
    result['group_unequal_up'] = utils.get_Decimal_float(tconfint_diff[1])
    tconfint_diff = cm.tconfint_diff(alpha=1.0 - confidence, usevar='pooled')
    result['group_equal_low'] = utils.get_Decimal_float(tconfint_diff[0])
    result['group_equal_up'] = utils.get_Decimal_float(tconfint_diff[1])

    import statsmodels.api as sm
    ttest_int_result = sm.stats.ttest_ind(a, b, usevar='pooled')
    result['group_equal_t'] = utils.get_Decimal_float(ttest_int_result[0])
    result['group_equal_p'] = utils.get_Decimal_float(ttest_int_result[1])
    result['group_equal_free_degree'] = Decimal(ttest_int_result[2])
    ttest_int_result = sm.stats.ttest_ind(a, b, usevar='unequal')
    result['group_unequal_t'] = utils.get_Decimal_float(ttest_int_result[0])
    result['group_unequal_p'] = utils.get_Decimal_float(ttest_int_result[1])
    result['group_unequal_free_degree'] = utils.get_Decimal_float(ttest_int_result[2])

    result['group_equal_mean_error'] = result['group_1_mean'] - result['group_2_mean']
    result['group_unequal_mean_error'] = result['group_1_mean'] - result['group_2_mean']

    from scipy.stats import levene
    ttest_levene = levene(a, b, center='trimmed')
    result['F'] = utils.get_Decimal_float(ttest_levene[0])
    result['sig'] = utils.get_Decimal_float(ttest_levene[1])

    # standard error of the mean difference
    result['group_unequal_std_error'] = utils.get_Decimal_float(
        math.sqrt(stddev1 * stddev1 / len(a) + stddev2 * stddev2 / len(b)))
    result['group_equal_std_error'] = utils.get_Decimal_float(
        math.sqrt(stddev1 * stddev1 / len(a) + stddev2 * stddev2 / len(b)))
    return result
def t_distribution_ci(df, metric='post_sales_temp', control='Control', test='Test_1',
                      test_flag='test_flag', alpha=0.05):
    signi = []
    p_value = []
    test_data_A = df[df[test_flag] == control].copy()
    test_data_B = df[df[test_flag] == test].copy()
    test_data_A[metric] = test_data_A[metric].astype('float')
    test_data_B[metric] = test_data_B[metric].astype('float')
    print(test_data_A[metric].quantile(.995))
    # test_data_A_clean = test_data_A[(test_data_A[metric] > 0) &
    #                                 (test_data_A[metric] < test_data_A[metric].quantile(.995))]
    test_data_A_clean = test_data_A
    print(test_data_B[metric].quantile(.995))
    # test_data_B_clean = test_data_B[(test_data_B[metric] > 0) &
    #                                 (test_data_B[metric] < test_data_B[metric].quantile(.995))]
    test_data_B_clean = test_data_B

    # Combine the cleaned data sets as one
    test_data_clean = pd.concat([test_data_A_clean, test_data_B_clean])

    # Summarize the metrics: calculating totals
    test_summary1 = test_data_clean.groupby(test_flag).agg({metric: 'sum'})
    # Summarize the metrics: calculating means
    test_summary2 = test_data_clean.groupby(test_flag).agg({metric: 'mean'})
    # Transpose the summaries
    test_summary1 = test_summary1.T
    test_summary2 = test_summary2.T

    # Initialize a dataframe with test stats
    test_stats = pd.DataFrame(columns=['pct_lft', 'conf_int_lb', 'conf_int_ub', 'p-value'])

    # Concatenate the test stats with both summaries
    test_summary1 = pd.concat([test_summary1, test_stats], axis=1,
                              ignore_index=False, sort=False)
    # Calculate pct lift for all the metrics
    test_summary1['pct_lft'] = (test_summary1[test] - test_summary1[control]) \
        / test_summary1[control] * 100
    test_summary2 = pd.concat([test_summary2, test_stats], axis=1,
                              ignore_index=False, sort=False)
    # Calculate pct lift for all the metrics
    test_summary2['pct_lft'] = (test_summary2[test] - test_summary2[control]) \
        / test_summary2[control] * 100

    cm = sms.CompareMeans(
        sms.DescrStatsW(test_data_A_clean[metric][test_data_A_clean[metric].notnull()]),
        sms.DescrStatsW(test_data_B_clean[metric][test_data_B_clean[metric].notnull()]))
    lb, rb = cm.tconfint_diff(usevar='unequal', alternative='two-sided', alpha=0.10)
    test_summary2['conf_int_lb'] = (rb * -1) / test_data_A_clean[metric].mean()
    test_summary2['conf_int_ub'] = (lb * -1) / test_data_A_clean[metric].mean()
    t_stat, test_summary2['p-value'] = sc.ttest_ind(
        test_data_A_clean[metric][test_data_A_clean[metric].notnull()],
        test_data_B_clean[metric][test_data_B_clean[metric].notnull()],
        equal_var=False)

    if (test_summary2['p-value'].iloc[0] < alpha) and (test_summary2['pct_lft'].iloc[0] > 0):
        signi.append('Significant with lift')
    elif (test_summary2['p-value'].iloc[0] < alpha) and (test_summary2['pct_lft'].iloc[0] < 0):
        signi.append('Significant, control performed better than test')
    elif (test_summary2['p-value'].iloc[0] > alpha) and (test_summary2['pct_lft'].iloc[0] < 0):
        signi.append('Not significant with negative lift')
    elif (test_summary2['p-value'].iloc[0] > alpha) and (test_summary2['pct_lft'].iloc[0] > 0):
        signi.append('Not significant with positive lift')
    else:
        signi.append('Nothing')
    print(signi)
    test_summary2['significance'] = signi
    return test_summary2
# In[42]:

data = pd.read_csv('C:/Users/USER/Desktop/test/day.csv')
grouped1 = data['cnt'].groupby(data['season'])
grouped2 = data['cnt'].groupby(data['weekday'])
print(grouped1.mean(), grouped2.mean())
# data.iloc[np.r_[1:10, 15:20, 50:100]]
# data.loc[0:10, ['season', 'weekday']]

# ### 2. Statistically test whether the mean total user count differs between holidays and regular days. (9 points)

# In[11]:

cm = sms.CompareMeans(sms.DescrStatsW(data[data['holiday'] == 1].cnt),
                      sms.DescrStatsW(data[data['holiday'] == 0].cnt))
print(cm.ttest_ind(usevar='pooled'))
print(cm.tconfint_diff(usevar='pooled'))
print('The t value is at most 0.5, the p-value is above 0.05, and the confidence '
      'interval contains 0, so there is no difference between the two means.')

# ### 3. Plot the daily casual, registered, and total user counts from January 1, 2011 through December 31, 2012. (6 points)

# In[5]:

data = pd.read_csv('C:/Users/USER/Desktop/test/day.csv')
data.index = pd.to_datetime(data['dteday'])
data = data[['casual', 'registered', 'cnt']]
data.plot()
# plt.ylim((0, 500))
plt.show()