def normality_test(self, var):
    # Check whether the variable's distribution is normal
    if self.num_var == "2":
        self.__scale_t = self.scale_type(var)
        # scale_type() returns 'порядковая' (ordinal) or 'количественная'
        # (quantitative); the literals must stay as-is to match its output.
        if self.__scale_t in ('порядковая', 'количественная'):
            SW = stats.shapiro(self.df[var])
            if SW[1] > self.p_val:
                self.__norm = 1
                print("Shapiro-Wilk normality test: W = {:.6}, p = {:f}. "
                      "Conclusion: the distribution is normal.".format(SW[0], SW[1]))
                print("Mean: {:.4} with 95% confidence interval: [{:.4}, {:.4}]".format(
                    np.mean(self.df[var], axis=0),
                    zconfint(self.df[var])[0],
                    zconfint(self.df[var])[1]))
            else:
                self.__norm = 0
                print("Shapiro-Wilk normality test: W = {:.6}, p = {:f}. "
                      "Conclusion: the distribution is NOT normal.".format(SW[0], SW[1]))
        else:
            print("The data are not quantitative; a normality test is not required.")
            self.__norm = -1
    return self.__norm
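# A minimal standalone sketch of the same decision logic (Shapiro-Wilk, then a
# z-interval for the mean if normality is not rejected), on synthetic data;
# the hard-coded 0.05 threshold stands in for self.p_val.
import numpy as np
from scipy import stats
from statsmodels.stats.weightstats import zconfint

rng = np.random.default_rng(0)
sample = rng.normal(loc=5.0, scale=2.0, size=200)

w, p = stats.shapiro(sample)
if p > 0.05:
    low, high = zconfint(sample)
    print("Shapiro-Wilk: W = {:.4f}, p = {:.4f}. The distribution is normal.".format(w, p))
    print("Mean: {:.4f} with 95% confidence interval: [{:.4f}, {:.4f}]".format(sample.mean(), low, high))
else:
    print("Shapiro-Wilk: W = {:.4f}, p = {:.4f}. The distribution is NOT normal.".format(w, p))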
def conf_interval(field):
    """Calculate the 95% confidence interval for the given field."""
    # Values are rounded to integers because the estimated quantities
    # (likes, shares, ...) are integers themselves.
    # `park`, `town` and `free` are DataFrames of posts for the EPH, UCT
    # and FSZ communities, respectively (defined elsewhere).
    print("95% confidence interval for the EPH posts mean number of {:s}: ({z[0]:.0f}, {z[1]:.0f})"
          .format(field, z=zconfint(park[field])))
    print("95% confidence interval for the UCT posts mean number of {:s}: ({z[0]:.0f}, {z[1]:.0f})"
          .format(field, z=zconfint(town[field])))
    print("95% confidence interval for the FSZ posts mean number of {:s}: ({z[0]:.0f}, {z[1]:.0f})"
          .format(field, z=zconfint(free[field])))
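# Hypothetical setup for calling conf_interval above: `park`, `town` and
# `free` stand in for the EPH, UCT and FSZ post DataFrames, and the column
# name "likes" is illustrative.
import numpy as np
import pandas as pd
from statsmodels.stats.weightstats import zconfint

rng = np.random.default_rng(2)
park = pd.DataFrame({"likes": rng.poisson(120, size=50)})
town = pd.DataFrame({"likes": rng.poisson(95, size=50)})
free = pd.DataFrame({"likes": rng.poisson(60, size=50)})

conf_interval("likes")  # prints the three 95% intervals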
def test_normality_test(self):
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        self.EBM_work.df = pd.read_excel(self.EBM_work.file)

        var = "ВОЗРАСТ"
        self.EBM_work.scale_type(var)
        assert self.EBM_work.normality_test(var) == 0  # distribution is not normal
        assert round(stats.shapiro(self.EBM_work.df[var])[0], 4) == 0.9733
        assert stats.shapiro(self.EBM_work.df[var])[1] < 0.05

        var = "BF"
        self.EBM_work.scale_type(var)
        assert self.EBM_work.normality_test(var) == 0  # distribution is not normal
        assert round(stats.shapiro(self.EBM_work.df[var])[0], 4) == 0.7668
        assert stats.shapiro(self.EBM_work.df[var])[1] < 0.05

        var = "N1"
        self.EBM_work.scale_type(var)
        assert self.EBM_work.normality_test(var) == 1  # distribution is normal
        assert round(stats.shapiro(self.EBM_work.df[var])[0], 4) == 0.9970
        assert stats.shapiro(self.EBM_work.df[var])[1] > 0.05
        assert round(np.mean(self.EBM_work.df[var], axis=0), 2) == 0.02
        assert round(np.std(self.EBM_work.df[var], axis=0), 2) == 2.18
        assert round(zconfint(self.EBM_work.df[var])[0], 4) == -0.1875
        assert round(zconfint(self.EBM_work.df[var])[1], 4) == 0.2353

        var = "N2"
        self.EBM_work.scale_type(var)
        assert self.EBM_work.normality_test(var) == 1  # distribution is normal
        assert stats.shapiro(self.EBM_work.df[var])[1] > 0.05

        var = "ФО"
        self.EBM_work.scale_type(var)
        assert self.EBM_work.normality_test(var) == -1  # normality test not required

        var = "Регион"
        self.EBM_work.scale_type(var)
        assert self.EBM_work.normality_test(var) == -1  # normality test not required
def z_confint(sample):
    """95% confidence interval based on the normal distribution.

    Parameters
    ----------
    sample : array_like
        Array of observations.

    Returns
    -------
    lower, upper : floats
        Lower and upper bounds of the confidence interval.
    """
    return zconfint(sample)
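# Example call for the wrapper above, with made-up observations; zconfint's
# default alpha=0.05 is what makes this the 95% interval.
import numpy as np
from statsmodels.stats.weightstats import zconfint

data = np.array([5.1, 4.8, 5.4, 5.0, 4.9, 5.2, 5.3, 4.7])
lower, upper = zconfint(data)
print("95% CI: ({:.3f}, {:.3f})".format(lower, upper))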
def test_ztost():
    xfair = np.repeat([1, 0], [228, 762 - 228])

    # comparing to SAS last output at
    # http://support.sas.com/documentation/cdl/en/procstat/63104/HTML/default/viewer.htm#procstat_freq_sect028.htm

    # confidence interval for tost
    # generic ztost is moved to weightstats
    from statsmodels.stats.weightstats import zconfint, ztost
    ci01 = zconfint(xfair, alpha=0.1, ddof=0)
    assert_almost_equal(ci01, [0.2719, 0.3265], 4)

    res = ztost(xfair, 0.18, 0.38, ddof=0)
    assert_almost_equal(res[1][0], 7.1865, 4)
    assert_almost_equal(res[2][0], -4.8701, 4)
    assert_array_less(res[0], 0.0001)
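# Direct use of ztost on the same data, unpacking the result the test above
# indexes into: ztost runs two one-sided z-tests against the equivalence
# bounds (0.18, 0.38) and reports the larger of the two p-values, so a small
# overall p-value supports equivalence.
import numpy as np
from statsmodels.stats.weightstats import ztost

xfair = np.repeat([1, 0], [228, 762 - 228])
pval, lower_test, upper_test = ztost(xfair, 0.18, 0.38, ddof=0)
print(pval)                    # overall TOST p-value
print(lower_test, upper_test)  # (statistic, p-value) for each one-sided test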
def test(self):
    x1, x2 = self.x1, self.x2
    cm = self.cm

    # tc : test cases
    for tc in [ztest_, ztest_smaller, ztest_larger,
               ztest_mu, ztest_smaller_mu, ztest_larger_mu]:
        zstat, pval = ztest(x1, x2, value=tc.null_value,
                            alternative=alternatives[tc.alternative])
        assert_allclose(zstat, tc.statistic, rtol=1e-10)
        assert_allclose(pval, tc.p_value, rtol=1e-10, atol=1e-16)

        zstat, pval = cm.ztest_ind(value=tc.null_value,
                                   alternative=alternatives[tc.alternative])
        assert_allclose(zstat, tc.statistic, rtol=1e-10)
        assert_allclose(pval, tc.p_value, rtol=1e-10, atol=1e-16)

        # overwrite nan in R's confint
        tc_conf_int = tc.conf_int.copy()
        if np.isnan(tc_conf_int[0]):
            tc_conf_int[0] = -np.inf
        if np.isnan(tc_conf_int[1]):
            tc_conf_int[1] = np.inf

        # Note: value is shifting our confidence interval in zconfint
        ci = zconfint(x1, x2, value=0,
                      alternative=alternatives[tc.alternative])
        assert_allclose(ci, tc_conf_int, rtol=1e-10)
        ci = cm.zconfint_diff(alternative=alternatives[tc.alternative])
        assert_allclose(ci, tc_conf_int, rtol=1e-10)

        ci = zconfint(x1, x2, value=tc.null_value,
                      alternative=alternatives[tc.alternative])
        assert_allclose(ci, tc_conf_int - tc.null_value, rtol=1e-10)

    # 1 sample test copy-paste
    d1 = self.d1
    for tc in [ztest_mu_1s, ztest_smaller_mu_1s, ztest_larger_mu_1s]:
        zstat, pval = ztest(x1, value=tc.null_value,
                            alternative=alternatives[tc.alternative])
        assert_allclose(zstat, tc.statistic, rtol=1e-10)
        assert_allclose(pval, tc.p_value, rtol=1e-10, atol=1e-16)

        zstat, pval = d1.ztest_mean(value=tc.null_value,
                                    alternative=alternatives[tc.alternative])
        assert_allclose(zstat, tc.statistic, rtol=1e-10)
        assert_allclose(pval, tc.p_value, rtol=1e-10, atol=1e-16)

        # overwrite nan in R's confint
        tc_conf_int = tc.conf_int.copy()
        if np.isnan(tc_conf_int[0]):
            tc_conf_int[0] = -np.inf
        if np.isnan(tc_conf_int[1]):
            tc_conf_int[1] = np.inf

        # Note: value is shifting our confidence interval in zconfint
        ci = zconfint(x1, value=0,
                      alternative=alternatives[tc.alternative])
        assert_allclose(ci, tc_conf_int, rtol=1e-10)
        ci = d1.zconfint_mean(alternative=alternatives[tc.alternative])
        assert_allclose(ci, tc_conf_int, rtol=1e-10)
# Program file Pex4_14_3.py
import numpy as np
from statsmodels.stats.weightstats import zconfint
from scipy import stats

a = np.array([506, 508, 499, 503, 504, 510, 497, 512,
              514, 505, 493, 496, 506, 502, 509, 496])
ci = zconfint(a)
print("Confidence interval:", ci)
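# Cross-check against the closed-form z-interval, mean +/- z_{0.975} * s / sqrt(n),
# computing s with ddof=1 to match zconfint's default.
import numpy as np
from scipy import stats
from statsmodels.stats.weightstats import zconfint

a = np.array([506, 508, 499, 503, 504, 510, 497, 512,
              514, 505, 493, 496, 506, 502, 509, 496])
z = stats.norm.ppf(0.975)
se = a.std(ddof=1) / np.sqrt(len(a))
print((a.mean() - z * se, a.mean() + z * se))
print(zconfint(a))  # agrees with the manual computation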
print("Média dos filmes com pelo menos 10 votos", nota_media_dos_filmes_com_pelo_menos_10_votos.mean()) import matplotlib.pyplot as plt import numpy as np np.random.seed(75243) temp = nota_media_dos_filmes_com_pelo_menos_10_votos.sample(frac=1) medias = [temp[0:i].mean() for i in range(1, len(temp))] plt.plot(medias) from statsmodels.stats.weightstats import zconfint zconfint(nota_media_dos_filmes_com_pelo_menos_10_votos) from statsmodels.stats.weightstats import DescrStatsW descr_todos_com_10_votos = DescrStatsW(nota_media_dos_filmes_com_pelo_menos_10_votos) descr_todos_com_10_votos.tconfint_mean() """# Vamos ver o filme 1...""" filmes = pd.read_csv("movies.csv") filmes.query("movieId==1") notas1 = notas.query("movieId == 1") notas1.head() ax = sns.distplot(notas1.rating)
# Fragment: starts inside a loop over scales `s` and storm ids; the sumup_*
# lists and the weak/strong/uids id groups are set up earlier in the file.
        sumup_weak.append(tmin[(ids == i) & (scales_all == s) & (tmin <= -50)])
    for i in strong:
        # print(scales_all[(ids == i)])
        sumup_strong.append(tmin[(ids == i) & (scales_all == s) & (tmin <= -50)])
    for i in uids:
        # print(scales_all[(ids == i)])
        sumup_mean.append(tmin[(ids == i) & (scales_all == s) & (tmin <= -50)])

    weak_scales.append(np.nanmean(np.concatenate(sumup_weak)))
    strong_scales.append(np.nanmean(np.concatenate(sumup_strong)))
    mean_scales.append(np.nanmean(np.concatenate(sumup_mean)))

    # Note: scipy.stats has no zconfint; `stats` here presumably aliases
    # statsmodels.stats (weightstats), imported earlier in the file.
    wupper.append(stats.zconfint(np.concatenate(sumup_weak))[1])
    wlower.append(stats.zconfint(np.concatenate(sumup_weak))[0])
    supper.append(stats.zconfint(np.concatenate(sumup_strong))[1])
    slower.append(stats.zconfint(np.concatenate(sumup_strong))[0])

f = plt.figure()
plt.plot(uscales, np.array(weak_scales), label='Lowest Probability')
#plt.fill_between(uscales, wlower, wupper, alpha=0.3)
#plt.errorbar(uscales, weak_scales, xerr=w_std*2)
plt.plot(uscales, np.array(strong_scales), color='r', label='Highest probability')
#plt.fill_between(uscales, slower, supper, color='r', alpha=0.3)
#plt.plot(uscales, mean_scales, color='g', label='Average distribution')
plt.xlabel('Scales (km)')
plt.ylabel('Tmean(power max)')
"""---------------------------------------------------------------------------- Z Test Pressupõe normalidade """ diagnostico_m = df.query("diagnosis == 'M'") diagnostico_b = df.query("diagnosis == 'B'") # Efetuando o Zteste para a média (Comparando os resultados) ztest(diagnostico_m['mean_radius'], value = diagnostico_m['mean_radius'].mean()) ztest(diagnostico_m['mean_radius'], value = diagnostico_b['mean_radius'].mean()) # Gerando o intervalo de confiança zconfint(diagnostico_m['mean_radius']) zconfint(diagnostico_b['mean_radius']) """---------------------------------------------------------------------------- T Test """ diagnostico_m = df.query("diagnosis == 'M'") diagnostico_b = df.query("diagnosis == 'B'") # Aplicando o teste resultados_m = DescrStatsW(diagnostico_m['mean_radius']) resultados_b = DescrStatsW(diagnostico_b['mean_radius']) # Gerando o intervalo de confiança resultados_m.tconfint_mean()
print(f'p-value Aventura: {p_av}')

# Testing the null hypothesis that the distributions are normal;
# p_v is the chosen significance level, defined earlier.
if p_av < p_v:
    print("We reject the null hypothesis for p_av, so it is not a normal distribution")
else:
    print("It is a normal distribution")

# In[101]:

# solution: normality test, or a justification for not using it
from scipy.stats import ranksums
from statsmodels.stats.weightstats import zconfint

print("Confidence interval for Horror = ",
      zconfint(votos_por_genero_por_filme.query("Horror > 0").Horror))
print("Confidence interval for Adventure = ",
      zconfint(votos_por_genero_por_filme.query("Adventure > 0").Adventure))

# In[102]:

from statsmodels.stats.weightstats import ztest

notas_horror = votos_por_genero_por_filme.query("Horror > 0").Horror
notas_aventura = votos_por_genero_por_filme.query("Adventure > 0").Adventure

# solution with the desired test
print(ranksums(notas_horror, notas_aventura))

# ### Solution (explain your conclusion):
#
# Let's return to the survival data of patients with leukocytic lymphoma from
# the video on the sign test:
# the remaining lifetime from the start of observation was measured (in
# weeks); the asterisk marks right-censoring: the study lasted 7 years, and
# the remaining lifetime of the one patient who survived to the end of the
# observation is unknown.
#%%
import numpy as np
from scipy import stats
from statsmodels.stats.weightstats import zconfint

life_times = np.array([49, 58, 75, 110, 112, 132, 151, 276, 281, 362])  # last value censored (*)
print("95%% confidence interval for the life time: [%f, %f]" % zconfint(life_times))

# Since only one observation is censored, the hypothesis H0: medX = 200 can be
# tested on these data with the signed-rank test: the last patient's survival
# time can be taken as exactly 362, since this does not affect that
# observation's rank.
# Test this hypothesis against the two-sided alternative with the signed-rank
# test and print the achieved significance level, rounded to four decimal
# places.
#%%
medX = 200
print("Wilcoxon criterion pvalue result: %.4f" %
      np.round(stats.wilcoxon(life_times - medX).pvalue, 4))

#%%
# A study of the impact of logging on the biodiversity of the forests of the
# island of Borneo collected tree-species counts for 12 forests where no
# logging takes place:
no_cut_kinds = np.array([22, 22, 15, 13, 19, 19, 18, 20, 21, 13, 13, 15])
# and for 9 forests where logging does take place:
cut_kinds = np.array([17, 18, 18, 15, 12, 4, 14, 15, 10])
# Test the hypothesis that the mean number of species is equal in the two
# forest types against the one-sided alternative that logging reduces
# biodiversity. Use a rank-based test. What is the achieved significance
# level? Round to four decimal places.
#%%
print("Mann-Whitney criterion pvalue result: %.4f" %
      np.round(stats.mannwhitneyu(no_cut_kinds, cut_kinds, alternative="greater").pvalue, 4))
# z confidence interval
import math
import numpy as np
from statsmodels.stats.weightstats import zconfint

x = [1, 2, 1, 1, 1]
zint = zconfint(x, alpha=0.05, alternative='two-sided', ddof=1.0)
# (0.8080072030919891, 1.5919927969080108)

# t confidence interval for the mean
from statsmodels.stats.weightstats import _tconfint_generic

x = np.array([1, 2, 0, 3, 1, 1, 2, 4, 5, 6])  # np.array, since plain lists have no .mean()/.std()
n = len(x)
mean = x.mean()
sigma = x.std(ddof=1) / math.sqrt(n)
_tconfint_generic(mean, sigma, n - 1, 0.05, 'two-sided')
# (1.0994, 3.9006) - 95% confidence interval for the mean

# Confidence interval for a proportion
from statsmodels.stats.proportion import proportion_confint

# n_positive successes out of n_all trials (defined elsewhere)
normal_interval = proportion_confint(n_positive, n_all, alpha=0.05,
                                     method='normal')  # 95% confidence interval

# Sample size for an interval of a given width
from statsmodels.stats.proportion import samplesize_confint_proportion

n_samples = samplesize_confint_proportion(random_sample.mean(),
                                          half_length=0.01,
                                          alpha=0.05)  # for a 95% confidence interval
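# Worked example for the proportion helpers above, with made-up counts:
# 64 successes out of 100 trials.
from statsmodels.stats.proportion import proportion_confint, samplesize_confint_proportion

print(proportion_confint(64, 100, alpha=0.05, method='normal'))  # roughly (0.546, 0.734)
print(proportion_confint(64, 100, alpha=0.05, method='wilson'))  # Wilson interval: better coverage
print(samplesize_confint_proportion(0.64, half_length=0.01))     # n for a +/-0.01 wide 95% interval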
df.ftypes  # note: DataFrame.ftypes was removed in pandas 1.0; use df.dtypes

# Data description
df.describe()

"""----------------------------------------------------------------------------
Z TEST
H0 = the means are equal (mu_1 = mu_2)
"""
diagnostico_m = df.query("diagnosis == 'M'")
diagnostico_b = df.query("diagnosis == 'B'")

# Running the z-test for the comparison between the means
ztest(diagnostico_m["mean_radius"], diagnostico_b["mean_radius"])

# Computing the confidence interval for the difference between the means
zconfint(diagnostico_m["mean_radius"], diagnostico_b["mean_radius"])

"""----------------------------------------------------------------------------
T TEST
H0 = the means are equal (mu_1 = mu_2)
"""
diagnostico_m = df.query("diagnosis == 'M'")
diagnostico_b = df.query("diagnosis == 'B'")

# Computing the confidence interval for the difference between the means
descr_stats_m = DescrStatsW(diagnostico_m["mean_radius"])
descr_stats_b = DescrStatsW(diagnostico_b["mean_radius"])
resultado = descr_stats_m.get_compare(descr_stats_b)
print(resultado.summary(use_t=True))
# Fragment: tail end of an sns.distplot(...) call from earlier in the notebook
    kde_kws={'cumulative': True})
ax.set(xlabel="Running time", ylabel="% of films")
ax.set_title("Running time of films in TMDB 5000")

tmdb.query("runtime > 0").runtime.dropna().quantile(0.8)

"""# Movielens: mean rating of films with at least 10 votes"""

print("Mean rating of films with at least 10 votes",
      nota_media_dos_filmes_com_pelo_menos_10_votos.mean())

import matplotlib.pyplot as plt
import numpy as np

# Running mean over a shuffled sample: shows the estimate stabilizing
np.random.seed(75243)
temp = nota_media_dos_filmes_com_pelo_menos_10_votos.sample(frac=1)
medias = [temp[0:i].mean() for i in range(1, len(temp))]
plt.plot(medias)

from statsmodels.stats.weightstats import zconfint
zconfint(nota_media_dos_filmes_com_pelo_menos_10_votos)

from statsmodels.stats.weightstats import DescrStatsW
descr_todos_com_10_votos = DescrStatsW(nota_media_dos_filmes_com_pelo_menos_10_votos)
descr_todos_com_10_votos.tconfint_mean()