예제 #1
0
    def normality_test(self, var):  # Normality check for the distribution
        """Run the Shapiro-Wilk normality test on column *var* of self.df.

        Returns:
            1  -- distribution is normal (Shapiro-Wilk p > self.p_val)
            0  -- distribution is NOT normal
            -1 -- variable is not quantitative; the test is not applicable
        """
        # In two-variable mode the scale type is (re)determined per call.
        # NOTE(review): when self.num_var != "2", self.__scale_t is assumed
        # to have been set by an earlier scale_type() call -- confirm.
        if self.num_var == "2":
            self.__scale_t = self.scale_type(var)

        # Shapiro-Wilk only applies to ordinal ('порядковая') or
        # quantitative ('количественная') scales.
        if (self.__scale_t == 'порядковая') or (self.__scale_t
                                                == 'количественная'):
            SW = stats.shapiro(self.df[var])  # (W statistic, p-value)
            if SW[1] > self.p_val:  # fail to reject H0 -> treat as normal
                self.__norm = 1
                print(
                    "Тест Шапиро-Уилка на нормальность распределения: W = {:.6}, p = {:f}. Вывод: распределение нормально."
                    .format(SW[0], SW[1]))
                # For normal data also report the mean with its 95% z-interval.
                print(
                    'Среднее: {:.4} и 95% доверительный интервал: [{:.4}, {:.4}]'
                    .format(np.mean(self.df[var], axis=0),
                            zconfint(self.df[var])[0],
                            zconfint(self.df[var])[1]))
            else:
                self.__norm = 0
                print(
                    "Тест Шапиро-Уилка на нормальность распределения: W = {:.6}, p = {:f}. Вывод: распределение НЕ нормально."
                    .format(SW[0], SW[1]))
        else:
            # Nominal data: a normality check is not meaningful.
            print(
                "Данные не имеют количественной природы. Проверка на нормальность не требуется."
            )
            self.__norm = -1
        return self.__norm
예제 #2
0
def conf_interval(field):
    """
    Print 95% z-confidence intervals for the mean number of *field*
    (likes, shares, ...) in the EPH, UCT and FSZ post datasets.

    Parameters
    ----------
    field : str
        Column name present in the ``park``, ``town`` and ``free`` frames.
    """
    # BUG FIX: the docstring opened with four quotes (""""), which left a
    # stray '"' as the first character of the docstring text.
    # Bounds are rounded to integers because the estimated quantities
    # (likes, shares, ...) are integers themselves.
    print(
        "95% confidence interval for the EPH posts mean number of {:s}: ({z[0]:.0f}, {z[1]:.0f})"
        .format(field, z=zconfint(park[field])))
    print(
        "95% confidence interval for the UCT posts mean number of {:s}: ({z[0]:.0f}, {z[1]:.0f})"
        .format(field, z=zconfint(town[field])))
    print(
        "95% confidence interval for the FSZ posts mean number of {:s}: ({z[0]:.0f}, {z[1]:.0f})"
        .format(field, z=zconfint(free[field])))
예제 #3
0
    def test_normality_test(self):
        """Verify normality_test() return codes and the underlying
        Shapiro-Wilk statistics for several columns of the test workbook."""
        with warnings.catch_warnings():
            # Silence deprecation noise emitted by the Excel-reading stack.
            warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
            warnings.filterwarnings("ignore", category=DeprecationWarning)
            self.EBM_work.df = pd.read_excel(self.EBM_work.file)
            var = "ВОЗРАСТ"
            self.EBM_work.scale_type(var)
            assert self.EBM_work.normality_test(var) == 0 # distribution is NOT normal
            assert round(stats.shapiro(self.EBM_work.df[var])[0], 4) == 0.9733
            assert stats.shapiro(self.EBM_work.df[var])[1] < 0.05

            var = "BF"
            self.EBM_work.scale_type(var)
            assert self.EBM_work.normality_test(var) == 0 # distribution is NOT normal
            assert round(stats.shapiro(self.EBM_work.df[var])[0], 4) == 0.7668
            assert stats.shapiro(self.EBM_work.df[var])[1] < 0.05

            var = "N1"
            self.EBM_work.scale_type(var)
            assert self.EBM_work.normality_test(var) == 1 # distribution IS normal
            assert round(stats.shapiro(self.EBM_work.df[var])[0], 4) == 0.9970
            assert stats.shapiro(self.EBM_work.df[var])[1] > 0.05
            # For the normal column, also pin mean, std and the z-interval.
            assert round(np.mean(self.EBM_work.df[var], axis=0), 2) == 0.02
            assert round(np.std(self.EBM_work.df[var], axis=0), 2) == 2.18
            assert round(zconfint(self.EBM_work.df[var])[0], 4) == -0.1875
            assert round(zconfint(self.EBM_work.df[var])[1], 4) == 0.2353

            var = "N2"
            self.EBM_work.scale_type(var)
            assert self.EBM_work.normality_test(var) == 1 # distribution IS normal
            assert stats.shapiro(self.EBM_work.df[var])[1] > 0.05

            var = "ФО"
            self.EBM_work.scale_type(var)
            assert self.EBM_work.normality_test(var) == -1 # normality check not applicable

            var = "Регион"
            self.EBM_work.scale_type(var)
            assert self.EBM_work.normality_test(var) == -1 # normality check not applicable
예제 #4
0
def z_confint(sample, alpha=0.05):
    """Confidence interval for the mean based on the normal distribution.

    Parameters
    ----------
    sample : array_like
        Array of observations.
    alpha : float, optional
        Significance level; the interval covers ``1 - alpha`` of the
        distribution. Default 0.05 (a 95% interval), matching the
        original behaviour and ``zconfint``'s own default.

    Returns
    -------
    lower, upper : floats
        Left and right bounds of the confidence interval.
    """
    return zconfint(sample, alpha=alpha)
예제 #5
0
def test_ztost():
    # Binary sample: 228 ones followed by 762 - 228 = 534 zeros.
    xfair = np.repeat([1, 0], [228, 534])

    # Reference values come from the SAS documentation example at
    # http://support.sas.com/documentation/cdl/en/procstat/63104/HTML/default/viewer.htm#procstat_freq_sect028.htm
    # The generic ztost implementation lives in weightstats.
    from statsmodels.stats.weightstats import zconfint, ztost

    # 90% confidence interval used by the TOST procedure.
    assert_almost_equal(zconfint(xfair, alpha=0.1, ddof=0),
                        [0.2719, 0.3265], 4)

    res = ztost(xfair, 0.18, 0.38, ddof=0)
    assert_almost_equal(res[1][0], 7.1865, 4)
    assert_almost_equal(res[2][0], -4.8701, 4)
    assert_array_less(res[0], 0.0001)
예제 #6
0
def test_ztost():
    """Two one-sided z-tests (TOST) on a binary sample, checked against the
    SAS reference output at
    http://support.sas.com/documentation/cdl/en/procstat/63104/HTML/default/viewer.htm#procstat_freq_sect028.htm
    (the generic ztost implementation was moved to weightstats)."""
    from statsmodels.stats.weightstats import zconfint, ztost

    n_success, n_total = 228, 762
    xfair = np.repeat([1, 0], [n_success, n_total - n_success])

    # Confidence interval used by the TOST procedure (alpha = 0.1).
    ci_90 = zconfint(xfair, alpha=0.1, ddof=0)
    assert_almost_equal(ci_90, [0.2719, 0.3265], 4)

    tost_res = ztost(xfair, 0.18, 0.38, ddof=0)
    assert_almost_equal(tost_res[1][0], 7.1865, 4)
    assert_almost_equal(tost_res[2][0], -4.8701, 4)
    assert_array_less(tost_res[0], 0.0001)
예제 #7
0
    def test(self):
        """Check ztest/zconfint against the R reference fixtures (the
        module-level ztest_* cases and the `alternatives` mapping), for both
        the two-sample and one-sample variants, through the functional API
        and the CompareMeans/DescrStatsW object APIs."""
        x1, x2 = self.x1, self.x2
        cm = self.cm

        # tc : test cases
        for tc in [
                ztest_, ztest_smaller, ztest_larger, ztest_mu,
                ztest_smaller_mu, ztest_larger_mu
        ]:

            # Functional two-sample z-test must match the fixture.
            zstat, pval = ztest(x1,
                                x2,
                                value=tc.null_value,
                                alternative=alternatives[tc.alternative])
            assert_allclose(zstat, tc.statistic, rtol=1e-10)
            assert_allclose(pval, tc.p_value, rtol=1e-10, atol=1e-16)

            # CompareMeans object interface must agree with the functional one.
            zstat, pval = cm.ztest_ind(
                value=tc.null_value, alternative=alternatives[tc.alternative])
            assert_allclose(zstat, tc.statistic, rtol=1e-10)
            assert_allclose(pval, tc.p_value, rtol=1e-10, atol=1e-16)

            #overwrite nan in R's confint
            # (one-sided intervals are unbounded on one side; R reports NaN)
            tc_conf_int = tc.conf_int.copy()
            if np.isnan(tc_conf_int[0]):
                tc_conf_int[0] = -np.inf
            if np.isnan(tc_conf_int[1]):
                tc_conf_int[1] = np.inf

            # Note: value is shifting our confidence interval in zconfint
            ci = zconfint(x1,
                          x2,
                          value=0,
                          alternative=alternatives[tc.alternative])
            assert_allclose(ci, tc_conf_int, rtol=1e-10)

            ci = cm.zconfint_diff(alternative=alternatives[tc.alternative])
            assert_allclose(ci, tc_conf_int, rtol=1e-10)

            # Non-zero `value` shifts the interval by exactly that amount.
            ci = zconfint(x1,
                          x2,
                          value=tc.null_value,
                          alternative=alternatives[tc.alternative])
            assert_allclose(ci, tc_conf_int - tc.null_value, rtol=1e-10)

        # 1 sample test copy-paste
        d1 = self.d1
        for tc in [ztest_mu_1s, ztest_smaller_mu_1s, ztest_larger_mu_1s]:
            zstat, pval = ztest(x1,
                                value=tc.null_value,
                                alternative=alternatives[tc.alternative])
            assert_allclose(zstat, tc.statistic, rtol=1e-10)
            assert_allclose(pval, tc.p_value, rtol=1e-10, atol=1e-16)

            # DescrStatsW object interface must agree.
            zstat, pval = d1.ztest_mean(
                value=tc.null_value, alternative=alternatives[tc.alternative])
            assert_allclose(zstat, tc.statistic, rtol=1e-10)
            assert_allclose(pval, tc.p_value, rtol=1e-10, atol=1e-16)

            #overwrite nan in R's confint
            tc_conf_int = tc.conf_int.copy()
            if np.isnan(tc_conf_int[0]):
                tc_conf_int[0] = -np.inf
            if np.isnan(tc_conf_int[1]):
                tc_conf_int[1] = np.inf

            # Note: value is shifting our confidence interval in zconfint
            ci = zconfint(x1,
                          value=0,
                          alternative=alternatives[tc.alternative])
            assert_allclose(ci, tc_conf_int, rtol=1e-10)

            ci = d1.zconfint_mean(alternative=alternatives[tc.alternative])
            assert_allclose(ci, tc_conf_int, rtol=1e-10)
예제 #8
0
# Program file Pex4_14_3.py
import numpy as np
from statsmodels.stats.weightstats import zconfint
from scipy import stats

# Sample observations for the confidence-interval computation.
sample = np.array([506, 508, 499, 503, 504, 510, 497, 512,
                   514, 505, 493, 496, 506, 502, 509, 496])

# z-based confidence interval for the mean (default 95%).
ci = zconfint(sample)
print("置信区间为:", ci)
예제 #9
0
    def test(self):
        """Compare ztest/zconfint results with the R reference fixtures
        (ztest_* cases, `alternatives` mapping) for the two-sample and
        one-sample variants, via both the functional and object APIs."""
        x1, x2 = self.x1, self.x2
        cm = self.cm

        # tc : test cases
        for tc in [ztest_, ztest_smaller, ztest_larger,
                   ztest_mu, ztest_smaller_mu, ztest_larger_mu]:

            # Functional two-sample z-test must match the fixture.
            zstat, pval = ztest(x1, x2, value=tc.null_value,
                                alternative=alternatives[tc.alternative])
            assert_allclose(zstat, tc.statistic, rtol=1e-10)
            assert_allclose(pval, tc.p_value, rtol=1e-10, atol=1e-16)

            # CompareMeans object interface must agree.
            zstat, pval = cm.ztest_ind(value=tc.null_value,
                                alternative=alternatives[tc.alternative])
            assert_allclose(zstat, tc.statistic, rtol=1e-10)
            assert_allclose(pval, tc.p_value, rtol=1e-10, atol=1e-16)

            # overwrite nan in R's confint
            # (one-sided intervals are unbounded on one side; R reports NaN)
            tc_conf_int = tc.conf_int.copy()
            if np.isnan(tc_conf_int[0]):
                tc_conf_int[0] = - np.inf
            if np.isnan(tc_conf_int[1]):
                tc_conf_int[1] = np.inf

            # Note: value is shifting our confidence interval in zconfint
            ci = zconfint(x1, x2, value=0,
                          alternative=alternatives[tc.alternative])
            assert_allclose(ci, tc_conf_int, rtol=1e-10)

            ci = cm.zconfint_diff(alternative=alternatives[tc.alternative])
            assert_allclose(ci, tc_conf_int, rtol=1e-10)

            # Non-zero `value` shifts the interval by exactly that amount.
            ci = zconfint(x1, x2, value=tc.null_value,
                          alternative=alternatives[tc.alternative])
            assert_allclose(ci, tc_conf_int - tc.null_value, rtol=1e-10)

        # 1 sample test copy-paste
        d1 = self.d1
        for tc in [ztest_mu_1s, ztest_smaller_mu_1s, ztest_larger_mu_1s]:
            zstat, pval = ztest(x1, value=tc.null_value,
                                alternative=alternatives[tc.alternative])
            assert_allclose(zstat, tc.statistic, rtol=1e-10)
            assert_allclose(pval, tc.p_value, rtol=1e-10, atol=1e-16)

            # DescrStatsW object interface must agree.
            zstat, pval = d1.ztest_mean(value=tc.null_value,
                                 alternative=alternatives[tc.alternative])
            assert_allclose(zstat, tc.statistic, rtol=1e-10)
            assert_allclose(pval, tc.p_value, rtol=1e-10, atol=1e-16)

            # overwrite nan in R's confint
            tc_conf_int = tc.conf_int.copy()
            if np.isnan(tc_conf_int[0]):
                tc_conf_int[0] = - np.inf
            if np.isnan(tc_conf_int[1]):
                tc_conf_int[1] = np.inf

            # Note: value is shifting our confidence interval in zconfint
            ci = zconfint(x1, value=0,
                          alternative=alternatives[tc.alternative])
            assert_allclose(ci, tc_conf_int, rtol=1e-10)

            ci = d1.zconfint_mean(alternative=alternatives[tc.alternative])
            assert_allclose(ci, tc_conf_int, rtol=1e-10)
예제 #10
0
print("Média dos filmes com pelo menos 10 votos", nota_media_dos_filmes_com_pelo_menos_10_votos.mean())

import matplotlib.pyplot as plt
import numpy as np

# Fixed seed so the shuffled running-mean plot below is reproducible.
np.random.seed(75243)
temp = nota_media_dos_filmes_com_pelo_menos_10_votos.sample(frac=1)

# Mean of each growing prefix of the shuffled sample (running mean).
medias = [temp[0:i].mean() for i in range(1, len(temp))]

plt.plot(medias)

from statsmodels.stats.weightstats import zconfint

# z-based 95% confidence interval for the mean rating.
zconfint(nota_media_dos_filmes_com_pelo_menos_10_votos)

from statsmodels.stats.weightstats import DescrStatsW

# t-based confidence interval via DescrStatsW, for comparison.
descr_todos_com_10_votos = DescrStatsW(nota_media_dos_filmes_com_pelo_menos_10_votos)
descr_todos_com_10_votos.tconfint_mean()

"""# Vamos ver o filme 1..."""

filmes = pd.read_csv("movies.csv")
filmes.query("movieId==1")

# Ratings for movie 1 only.
notas1 = notas.query("movieId == 1")
notas1.head()

ax = sns.distplot(notas1.rating)
예제 #11
0
        # Minima for this id at scale s, restricted to tmin <= -50
        # (presumably degrees C -- units not shown; TODO confirm).
        sumup_weak.append(tmin[(ids == i) & (scales_all == s) & (tmin <= -50)])

    for i in strong:
        # print(scales_all[(ids == i)])
        sumup_strong.append(tmin[(ids == i) & (scales_all == s) &
                                 (tmin <= -50)])

    for i in uids:
        # print(scales_all[(ids == i)])
        sumup_mean.append(tmin[(ids == i) & (scales_all == s) & (tmin <= -50)])

    # Per-scale mean of the pooled minima for each group.
    weak_scales.append(np.nanmean(np.concatenate(sumup_weak)))
    strong_scales.append(np.nanmean(np.concatenate(sumup_strong)))
    mean_scales.append(np.nanmean(np.concatenate(sumup_mean)))

    # z-based confidence bounds for the weak and strong groups.
    # NOTE(review): `stats` here must provide zconfint (statsmodels
    # weightstats, not scipy.stats) -- confirm the import at file top.
    wupper.append(stats.zconfint(np.concatenate(sumup_weak))[1])
    wlower.append(stats.zconfint(np.concatenate(sumup_weak))[0])
    supper.append(stats.zconfint(np.concatenate(sumup_strong))[1])
    slower.append(stats.zconfint(np.concatenate(sumup_strong))[0])

f = plt.figure()
plt.plot(uscales, np.array(weak_scales), label='Lowest Probability')
#plt.fill_between(uscales, wlower, wupper, alpha=0.3)
#plt.errorbar(uscales, weak_scales, xerr = w_std*2)
plt.plot(uscales,
         np.array(strong_scales),
         color='r',
         label='Highest probability')
#plt.fill_between(uscales, slower, supper, color='r', alpha=0.3)
#plt.plot(uscales, mean_scales, color='g', label = 'Average distribution')
plt.xlabel('Scales (km)')
"""----------------------------------------------------------------------------
    Z Test
Pressupõe normalidade

"""
# Split the frame into malignant (M) and benign (B) diagnoses.
diagnostico_m = df.query("diagnosis == 'M'")
diagnostico_b = df.query("diagnosis == 'B'")


# Run the one-sample z-test for the mean (comparing against each group's mean)
ztest(diagnostico_m['mean_radius'], value = diagnostico_m['mean_radius'].mean())
ztest(diagnostico_m['mean_radius'], value = diagnostico_b['mean_radius'].mean())


# z-based confidence interval for each group's mean radius
zconfint(diagnostico_m['mean_radius'])
zconfint(diagnostico_b['mean_radius'])


"""----------------------------------------------------------------------------
        T Test
"""
diagnostico_m = df.query("diagnosis == 'M'")
diagnostico_b = df.query("diagnosis == 'B'")

# Wrap each sample in DescrStatsW to get t-based statistics
resultados_m = DescrStatsW(diagnostico_m['mean_radius'])
resultados_b = DescrStatsW(diagnostico_b['mean_radius'])

# t-based confidence interval for the malignant group's mean
resultados_m.tconfint_mean()
예제 #13
0
# Achieved significance level of the Adventure normality test (computed above).
print(f'p-value Aventura: {p_av}')
if p_av < p_v:  # Null hypothesis under test: the distributions are normal.
    print(
        "Rejeitamos a Hipotese nula para p_av, portanto não é uma distribuição normal"
    )
else:
    print("É uma distribuição normal")

# In[101]:

# solution: normality test, or justification for not using it
from scipy.stats import ranksums
from statsmodels.stats.weightstats import zconfint

# z confidence intervals for the mean rating of each genre (films with votes).
print("Intervalo de confiança para Horror = ",
      zconfint(votos_por_genero_por_filme.query("Horror > 0").Horror))
print("Intervalo de confiança para Aventura = ",
      zconfint(votos_por_genero_por_filme.query("Adventure > 0").Adventure))

# In[102]:

from statsmodels.stats.weightstats import ztest

# Ratings restricted to films that actually have votes in each genre.
notas_horror = votos_por_genero_por_filme.query("Horror > 0").Horror
notas_aventura = votos_por_genero_por_filme.query("Adventure > 0").Adventure

# solution with the desired test: Wilcoxon rank-sum comparison of the genres
print(ranksums(notas_horror, notas_aventura))

# ### Solution (explain your conclusion):
#
예제 #14
0
        # Minima for this id at scale s, restricted to tmin <= -50
        # (presumably degrees C -- units not shown; TODO confirm).
        sumup_weak.append(tmin[(ids == i) & (scales_all == s) & (tmin <=-50)])


    for i in strong:
       # print(scales_all[(ids == i)])
        sumup_strong.append( tmin[(ids == i) & (scales_all == s) & (tmin <=-50)])

    for i in uids:
       # print(scales_all[(ids == i)])
        sumup_mean.append( tmin[(ids == i) & (scales_all == s) & (tmin <=-50)])

    # Per-scale mean of the pooled minima for each group.
    weak_scales.append(np.nanmean(np.concatenate(sumup_weak)))
    strong_scales.append(np.nanmean(np.concatenate(sumup_strong)))
    mean_scales.append(np.nanmean(np.concatenate(sumup_mean)))

    # z-based confidence bounds for the weak and strong groups.
    # NOTE(review): `stats` here must provide zconfint (statsmodels
    # weightstats, not scipy.stats) -- confirm the import at file top.
    wupper.append(stats.zconfint(np.concatenate(sumup_weak))[1])
    wlower.append(stats.zconfint(np.concatenate(sumup_weak))[0])
    supper.append(stats.zconfint(np.concatenate(sumup_strong))[1])
    slower.append(stats.zconfint(np.concatenate(sumup_strong))[0])



f = plt.figure()
plt.plot(uscales, np.array(weak_scales), label = 'Lowest Probability')
#plt.fill_between(uscales, wlower, wupper, alpha=0.3)
#plt.errorbar(uscales, weak_scales, xerr = w_std*2)
plt.plot(uscales, np.array(strong_scales), color='r', label = 'Highest probability')
#plt.fill_between(uscales, slower, supper, color='r', alpha=0.3)
#plt.plot(uscales, mean_scales, color='g', label = 'Average distribution')
plt.xlabel('Scales (km)')
plt.ylabel('Tmean(power max)')
예제 #15
0
# Survival data for patients with lymphocytic lymphoma (from the sign-test
# video): residual lifetime in weeks since the start of observation.
# The asterisk marks top-censoring: the study lasted 7 years, so the residual
# lifetime of the one patient who survived to the end is unknown.
#%%
import numpy as np
from scipy import stats
from statsmodels.stats.weightstats import zconfint

life_times = np.array([49, 58, 75, 110, 112, 132, 151, 276, 281, 362])  #∗
# BUG FIX: converted Python 2 `print` statements to print() calls -- the
# rest of this file targets Python 3, where the old form is a SyntaxError.
print("95%% confidence interval for the life time: [%f, %f]" % zconfint(
    life_times))
# Only one observation is censored, so H0: medX = 200 can be tested with the
# signed-rank test: treating the last patient's survival time as exactly 362
# does not affect that observation's rank.
# Test against the two-sided alternative and report the achieved significance
# level rounded to four decimal places.
#%%
medX = 200
print("Wilcoxon criterion pvalue result: %.4f" % np.round(
    stats.wilcoxon(life_times - medX).pvalue, 4))

#%%
# A study of the effect of logging on the biodiversity of Borneo forests
# collected tree-species counts in 12 unlogged forests:
no_cut_kinds = np.array([22, 22, 15, 13, 19, 19, 18, 20, 21, 13, 13, 15])
# ...and in 9 logged forests:
cut_kinds = np.array([17, 18, 18, 15, 12, 4, 14, 15, 10])
# Test equality of the mean species counts against the one-sided alternative
# of reduced biodiversity in logged forests, using a rank test; report the
# achieved significance level rounded to four decimal places.
#%%
print("Mann-Whitney criterion pvalue result: %.4f" % np.round(
    stats.mannwhitneyu(no_cut_kinds, cut_kinds, alternative="greater").pvalue,
    4))
예제 #16
0
# z-based confidence interval (95%)

from statsmodels.stats.weightstats import zconfint

x = [1, 2, 1, 1, 1]
zint = zconfint(x, alpha=0.05, alternative='two-sided', ddof=1.0) # (0.8080072030919891, 1.5919927969080108)


# t-based confidence interval for the mean

import math
import statistics

from statsmodels.stats.weightstats import _tconfint_generic

x = [1, 2, 0, 3, 1, 1, 2, 4, 5, 6]
n = len(x)
# BUG FIX: plain Python lists have no .mean()/.std(); use the statistics
# module (statistics.stdev is the ddof=1 sample standard deviation).
mean = statistics.mean(x)
sigma = statistics.stdev(x) / math.sqrt(n)
_tconfint_generic(mean, sigma, n - 1, 0.05, 'two-sided')  # (1.0994, 3.9006)) - 95% confidence interval for the mean


# Confidence interval for a proportion

from statsmodels.stats.proportion import proportion_confint

# BUG FIX: the original was missing the comma before `method` (SyntaxError).
normal_interval = proportion_confint(n_positive, n_all, alpha=0.05, method='normal')  # 95% confident interval


# Sample size for an interval of the given width

from statsmodels.stats.proportion import samplesize_confint_proportion

n_samples = samplesize_confint_proportion(random_sample.mean(), half_length=0.01, alpha=0.05) # 95% confident interval
df.ftypes
# NOTE(review): DataFrame.ftypes was removed in pandas 1.0 -- df.dtypes is
# the modern equivalent; confirm the pinned pandas version before changing.

# Describe the data
df.describe()
"""----------------------------------------------------------------------------
    TESTE Z
Hzero = As medias sao iguais (mu_1 = mu_2)
"""
# Split by diagnosis: malignant (M) vs benign (B).
diagnostico_m = df.query("diagnosis == 'M'")
diagnostico_b = df.query("diagnosis == 'B'")

# Two-sample z-test comparing the group means
ztest(diagnostico_m["mean_radius"], diagnostico_b["mean_radius"])

# Confidence interval for the difference between the means
zconfint(diagnostico_m["mean_radius"], diagnostico_b["mean_radius"])
"""----------------------------------------------------------------------------
    TESTE T
Hzero = As medias sao iguais (mu_1 = mu_2)
"""
diagnostico_m = df.query("diagnosis == 'M'")
diagnostico_b = df.query("diagnosis == 'B'")

# Two-sample z-test comparing the group means (repeated for this section)
ztest(diagnostico_m["mean_radius"], diagnostico_b["mean_radius"])

# t-based comparison of the two groups via DescrStatsW/CompareMeans
descr_stats_m = DescrStatsW(diagnostico_m["mean_radius"])
descr_stats_b = DescrStatsW(diagnostico_b["mean_radius"])
resultado = descr_stats_m.get_compare(descr_stats_b)
print(resultado.summary(use_t=True))
예제 #18
0
                 kde_kws={'cumulative': True}
                 )
# Cumulative runtime distribution for the TMDB 5000 dataset.
ax.set(xlabel="Tempo de duração", ylabel="% de filmes")
ax.set_title("Tempo de duração em filmes no TMDB 5000")

# 80th percentile of the positive, non-null runtimes.
tmdb.query("runtime>0").runtime.dropna().quantile(0.8)

"""# Movielens: média dos filmes com pelo menos 10 votos"""

print("Média dos filmes com pelo menos 10 votos", nota_media_dos_filmes_com_pelo_menos_10_votos.mean())

import matplotlib.pyplot as plt
import numpy as np

# Fixed seed so the shuffled running-mean plot below is reproducible.
np.random.seed(75243)
temp = nota_media_dos_filmes_com_pelo_menos_10_votos.sample(frac=1)

# Mean of each growing prefix of the shuffled sample (running mean).
medias = [temp[0:i].mean() for i in range(1, len(temp))]

plt.plot(medias)

from statsmodels.stats.weightstats import zconfint

# z-based 95% confidence interval for the mean rating.
zconfint(nota_media_dos_filmes_com_pelo_menos_10_votos)

from statsmodels.stats.weightstats import DescrStatsW

# t-based confidence interval via DescrStatsW, for comparison.
descr_todos_com_10_votos = DescrStatsW(nota_media_dos_filmes_com_pelo_menos_10_votos)
descr_todos_com_10_votos.tconfint_mean()