Example #1
def MC_simulation(times):
    sim = []
    for i in range(times):
        #print(f'\n-- {i+1} SIMULATION -- {i+1} SIMULATION -- {i+1} SIMULATION --\n')
        sim.append(simulation(i + 1))

    mean_a_days, std_a_days = stats_array_by_case(sim, n_days, times, 0)
    mean_he_days, std_he_days = stats_array_by_case(sim, n_days, times, 1)
    mean_l_days, std_l_days = stats_array_by_case(sim, n_days, times, 2)
    mean_de_days, std_de_days = stats_array_by_case(sim, n_days, times, 3)
    mean_re_days, std_re_days = stats_array_by_case(sim, n_days, times, 4)

    stats_graph(mean_a_days, mean_he_days, mean_l_days, mean_de_days,
                mean_re_days,
                'Monte-Carlo results after ' + str(times) + ' simulations')

    # t-interval estimates (the population variance is unknown)
    day_check = int(
        input('Enter the day number for which you want to see statistics: '))

    print(f'\nActual Active case mean on day {day_check}: ',
          mean_a_days[day_check])
    print(
        f'Active case mean 95% confidence interval on day {day_check}: ',
        _tconfint_generic(mean_a_days[day_check], std_a_days[day_check],
                          times - 1, 0.05, 'two-sided'))

    print(f'\nActual Health case mean on day {day_check}: ',
          mean_he_days[day_check])
    print(
        f'Health case mean 95% confidence interval on day {day_check}: ',
        _tconfint_generic(mean_he_days[day_check], std_he_days[day_check],
                          times - 1, 0.05, 'two-sided'))

    print(f'\nActual Latent case mean on day {day_check}: ',
          mean_l_days[day_check])
    print(
        f'Latent case mean 95% confidence interval on day {day_check}: ',
        _tconfint_generic(mean_l_days[day_check], std_l_days[day_check],
                          times - 1, 0.05, 'two-sided'))

    print(f'\nActual Death case mean on day {day_check}: ',
          mean_de_days[day_check])
    print(
        f'Death case mean 95% confidence interval on day {day_check}: ',
        _tconfint_generic(mean_de_days[day_check], std_de_days[day_check],
                          times - 1, 0.05, 'two-sided'))

    print(f'\nActual Recovered case mean on day {day_check}: ',
          mean_re_days[day_check])
    print(
        f'Recovered case mean 95% confidence interval on day {day_check}: ',
        _tconfint_generic(mean_re_days[day_check], std_re_days[day_check],
                          times - 1, 0.05, 'two-sided'))
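
For reference, _tconfint_generic(mean, std_mean, dof, alpha, 'two-sided') is the classic t-interval mean +/- t(1 - alpha/2, dof) * std_mean. A minimal sketch (illustrative data, not from the simulation above) checking it against scipy:

import numpy as np
from scipy import stats
from statsmodels.stats.weightstats import _tconfint_generic

x = np.array([5.1, 4.8, 5.6, 5.0, 4.9, 5.3])
mean = x.mean()
std_mean = x.std(ddof=1) / np.sqrt(len(x))  # standard error of the mean

lo, hi = _tconfint_generic(mean, std_mean, len(x) - 1, 0.05, 'two-sided')
lo_sp, hi_sp = stats.t.interval(0.95, df=len(x) - 1, loc=mean, scale=std_mean)
assert np.allclose([lo, hi], [lo_sp, hi_sp])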
Example #2
def kfold_on_model(model, model_name, x_train, y_train, augmentation=0, vgg_prep=0, batch_norm=0):
    auc_scores = list()
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=3)
    for i, (train, test) in enumerate(kfold.split(x_train, y_train)):
        X_t = x_train[train]
        Y_t = y_train[train]
        X_v = x_train[test]
        Y_v = y_train[test]
        # class-balance check: count the positive examples in each fold
        n_pos_train = sum(1 for each in Y_t if each == 1)
        n_pos_val = sum(1 for each in Y_v if each == 1)
        (pred, learning_time), time_on_single = model(X_t, Y_t, X_v, Y_v, i, model_name, augmentation,
                                                      vgg_prep, batch_norm)
        fpr, tpr, thresholds = roc_curve(Y_v, pred)
        auc_on_model = roc_auc_score(Y_v, pred)
        auc_scores.append(auc_on_model)
        add_stats_to_csv(model_name, auc_on_model, time_on_single, learning_time, i)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.plot(fpr, tpr)
        plt.savefig(str(i) + model_name + "_roc_curve.png")
        plt.close()
    auc_scores = np.array(auc_scores)
    mean_std = auc_scores.std(ddof=1) / sqrt(len(auc_scores))
    beg, end = _tconfint_generic(auc_scores.mean(), mean_std, len(auc_scores) - 1, 0.05, 'two-sided')
    df = pd.read_csv('Confidence_intervals.csv')
    new_item = pd.DataFrame([[model_name, beg, end]],
                            columns=['Name', 'Start_of_interval', 'End_of_interval'])
    df = pd.concat([df, new_item])
    df.to_csv("Confidence_intervals.csv", index=False)
Example #3
def confidence_interval(actual_price, predicted_price):
    reg_error = (actual_price - predicted_price)

    err_mean = reg_error.mean()
    err_mean_std = reg_error.std(ddof=1) / np.sqrt(len(reg_error))

    return _tconfint_generic(err_mean, err_mean_std,
                             len(reg_error) - 1, 0.05, 'two-sided')
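
A hypothetical usage sketch with toy arrays (in the original context actual_price and predicted_price are array-like price vectors):

import numpy as np

actual = np.array([200.0, 150.0, 320.0, 275.0])
predicted = np.array([190.0, 160.0, 300.0, 280.0])
print(confidence_interval(actual, predicted))  # 95% t-interval for the mean prediction error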
Example #4
    def _add_point_estimate_ci(self, df: DataFrame):
        df[CI_LOWER], df[CI_UPPER] = _tconfint_generic(
            mean=df[POINT_ESTIMATE],
            std_mean=np.sqrt(df[VARIANCE] / df[self._denominator]),
            dof=df[self._denominator] - 1,
            alpha=1 - self._interval_size,
            alternative=TWO_SIDED)
        return df
Example #5
def confidence_interval(actual_price, predicted_price):
    reg_error = (actual_price - predicted_price)

    # confidence interval for the model's errors
    err_mean = reg_error.mean()
    err_mean_std = reg_error.std(ddof=1) / np.sqrt(len(reg_error))

    return _tconfint_generic(err_mean, err_mean_std,
                             len(reg_error) - 1, 0.05, 'two-sided')
Example #6
    def conf_int(self, value=None, alpha=0.05, alternative="two-sided"):
        """
        Confidence interval for probability that sample 1 has larger values

        Confidence interval is for the shifted probability

            P(x1 > x2) + 0.5 * P(x1 = x2) - value

        Parameters
        ----------
        value : float
            Value, default 0, shifts the confidence interval,
            e.g. ``value=0.5`` centers the confidence interval at zero.
        alpha : float
            Significance level for the confidence interval, coverage is
            ``1-alpha``
        alternative : str
            The alternative hypothesis, H1, has to be one of the following

               * 'two-sided' : H1: ``prob - value`` not equal to 0.
               * 'larger' :   H1: ``prob - value > 0``
               * 'smaller' :  H1: ``prob - value < 0``

        Returns
        -------
        lower : float or ndarray
            Lower confidence limit. This is -inf for the one-sided alternative
            "smaller".
        upper : float or ndarray
            Upper confidence limit. This is inf for the one-sided alternative
            "larger".

        """

        p0 = value
        if p0 is None:
            p0 = 0
        diff = self.prob1 - p0
        std_diff = np.sqrt(self.var / self.nobs)

        if self.use_t is False:
            return _zconfint_generic(diff, std_diff, alpha, alternative)
        else:
            return _tconfint_generic(diff, std_diff, self.df, alpha,
                                     alternative)
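
This conf_int is a method on a rank-comparison result; a hedged usage sketch via rank_compare_2indep (available in statsmodels >= 0.12; the data here is invented):

import numpy as np
from statsmodels.stats.nonparametric import rank_compare_2indep

rng = np.random.default_rng(0)
x1 = rng.normal(0.3, 1.0, size=50)
x2 = rng.normal(0.0, 1.0, size=50)

res = rank_compare_2indep(x1, x2)
print(res.conf_int(alpha=0.05))   # CI for P(x1 > x2) + 0.5 * P(x1 = x2)
print(res.conf_int(value=0.5))    # shifted so that 0 means no stochastic difference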
Example #7
def clean_from_rare_dummies(ser, prefix, typ="train", clean=True, count_na=False, other_name="other", alpha=0.03, train_feature_names=()):
    dummies = pd.get_dummies(ser, prefix=prefix, dummy_na=count_na)
    if clean:  # if rare categories should be cleaned out
        if typ == "train":
            # on the train set, drop categories whose share falls below the lower
            # bound of a t-interval around the mean category share
            col_mean = dummies.mean(axis=0)
            minimal_share_of_category = _tconfint_generic(np.mean(col_mean), np.std(col_mean, ddof=1) / np.sqrt(len(col_mean)), len(col_mean) - 1, alpha, 'two-sided')[0]
            colnames_to_save = dummies.columns[col_mean >= minimal_share_of_category]
            colnames_to_drop = dummies.columns[col_mean < minimal_share_of_category]
        else:
            # when dummies follow the train set, keep only the features that were in train
            colnames_to_save = dummies.columns[dummies.columns.isin(train_feature_names)]
            colnames_to_drop = dummies.columns[~dummies.columns.isin(train_feature_names)]
        print("=====FEATURE " + prefix + "=====")
        print("total_categories:" + str(dummies.shape[1]))
        print("categories_to_save:" + str(len(colnames_to_save)))
        print("categories_to_drop:" + str(len(colnames_to_drop)))
        print("list_to_drop:", list(colnames_to_drop))
        if len(colnames_to_drop) > 0:
            dummies[prefix + "_" + other_name] = dummies[colnames_to_drop].sum(axis=1)
            return pd.concat([dummies[colnames_to_save], dummies[prefix + "_" + other_name]], axis=1)
        else:
            return dummies[colnames_to_save]
    else:
        return dummies
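
A hypothetical usage sketch on a toy column (level names invented): nine common levels and one rare one, whose share falls below the t-bound and is folded into cat_other:

import numpy as np
import pandas as pd
from statsmodels.stats.weightstats import _tconfint_generic

levels = [chr(ord('a') + i) for i in range(9)]                  # nine common levels, 54 rows each
ser = pd.Series(np.repeat(levels, 54).tolist() + ['rare'] * 14)

train_dummies = clean_from_rare_dummies(ser, prefix='cat')
print(train_dummies.columns.tolist())  # 'cat_rare' is replaced by 'cat_other'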
Example #8
# check how many passengers in the sample paid for their ride with cash.
paid_cash = trips.payment_type == 2
print("Sum of passengers who paid cash: ", paid_cash.sum())

# build a 99% confidence interval for the proportion of cash payers.
confidence_interval = proportion_confint(paid_cash.sum(), paid_cash.size, alpha=0.01)
print("Confidence interval:", confidence_interval)

trips_distance = trips["trip_distance"]
# estimate the average trip distance in miles
avg_distance = trips_distance.mean()
print("Avg trip distance:", avg_distance)

# sample standard deviation of the trip distance
dist_std_deviation = trips_distance.std(ddof=1)
print("Std dev", dist_std_deviation)

# calculate 95% confidence interval for the mean trip distance.
sqrt_n = np.sqrt(trips_distance.size)
s = trips_distance.std(ddof=1) / sqrt_n
print("Std dev of estimator", s)

mean_confidence_interval = _tconfint_generic(trips_distance.mean(), s, trips_distance.size - 1, 0.05, "two-sided")
print("Mean confidence interval:", mean_confidence_interval)

Example #9
from math import sqrt
import pandas as pd
import numpy as np
from statsmodels.stats.weightstats import _zconfint_generic, _tconfint_generic

df = pd.read_csv('water.txt', delimiter='\t')
std = df['mortality'].std(ddof=1) / sqrt(df['mortality'].count())
mean = df['mortality'].mean()

print('Mortality 95%% interval: %s' % str(
    _tconfint_generic(mean, std, df['mortality'].shape[0] - 1, 0.05,
                      'two-sided')))

water_data_south = df[df.location == 'South']
mort_mean_south = water_data_south['mortality'].mean()

mort_mean_south_std = water_data_south['mortality'].std(ddof=1) / np.sqrt(
    water_data_south['mortality'].shape[0])
print('Mortality south 95%% interval: %s' % str(
    _tconfint_generic(mort_mean_south, mort_mean_south_std,
                      water_data_south['mortality'].shape[0] - 1, 0.05,
                      'two-sided')))

from scipy import stats

# minimal sample size for a +/-0.1 margin of error around the mean,
# assuming unit variance: n = (z_{0.975} / 0.1)^2, rounded up
print(np.ceil((stats.norm.ppf(1 - 0.05 / 2) / 0.1)**2))
Example #10
def confidential_bounds(frame):
    std = frame.std(ddof=1)
    mean = frame.mean()
    count = frame.shape[0]
    return _tconfint_generic(mean, std / np.sqrt(count), count - 1, 0.05,
                             "two-sided")
Example #11
    def _ci(self, row, alpha_column: str) -> Tuple[float, float]:
        return _tconfint_generic(mean=row[DIFFERENCE],
                                 std_mean=row[STD_ERR],
                                 dof=self._dof(row),
                                 alpha=row[alpha_column],
                                 alternative=row[PREFERENCE])
Example #12
# z-confidence interval for the mean

from statsmodels.stats.weightstats import zconfint

x = [1, 2, 1, 1, 1]
zint = zconfint(x, alpha=0.05, alternative='two-sided', ddof=1.0)  # (0.8080072030919891, 1.5919927969080108)


# t-confidence interval for the mean

import math
import numpy as np
from statsmodels.stats.weightstats import _tconfint_generic

x = np.array([1, 2, 0, 3, 1, 1, 2, 4, 5, 6])
n = len(x)
mean = x.mean()
sigma = x.std(ddof=1) / math.sqrt(n)
_tconfint_generic(mean, sigma, n - 1, 0.05, 'two-sided')  # (1.0994, 3.9006): 95% confidence interval for the mean


# Confidence interval for a proportion

from statsmodels.stats.proportion import proportion_confint

normal_interval = proportion_confint(n_positive, n_all, alpha=0.05, method='normal')  # 95% confidence interval


# Sample size for an interval of a given width

from statsmodels.stats.proportion import samplesize_confint_proportion

n_samples = samplesize_confint_proportion(random_sample.mean(), half_length=0.01, alpha=0.05)  # 95% confidence interval
n_samples = int(np.ceil(n_samples))  # interval of width 0.02
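
samplesize_confint_proportion inverts the normal-approximation half-width, n = z(1 - alpha/2)^2 * p * (1 - p) / half_length^2; a sketch with the worst-case p = 0.5 (an assumed value, not from the snippet above):

import numpy as np
from scipy import stats
from statsmodels.stats.proportion import samplesize_confint_proportion

p, half_length, alpha = 0.5, 0.01, 0.05
z = stats.norm.ppf(1 - alpha / 2)
n_manual = z**2 * p * (1 - p) / half_length**2
n_sm = samplesize_confint_proportion(p, half_length=half_length, alpha=alpha)
assert np.isclose(n_manual, n_sm)   # both about 9604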
Example #13
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 16 15:32:25 2018

@author: kazantseva
"""

import pandas as pd
from statsmodels.stats.weightstats import _zconfint_generic, _tconfint_generic

data = pd.read_csv('water.txt', sep='\t')

mean_mortality = data['mortality'].mean()
std_mortality = data['mortality'].std(ddof=1)

interval = _tconfint_generic(mean_mortality, std_mortality / len(data)**0.5,
                             len(data) - 1, 0.05, 'two-sided')

south = data[data['location'] == 'South']
north = data[data['location'] == 'North']

mean_mortality_s = south['mortality'].mean()
std_mortality_s = south['mortality'].std(ddof=1)

interval_s = _tconfint_generic(mean_mortality_s, std_mortality_s / len(south)**0.5,
                               len(south) - 1, 0.05, 'two-sided')

mean_mortality_n = north['mortality'].mean()
std_mortality_n = north['mortality'].std(ddof=1)

interval_n = _tconfint_generic(mean_mortality_n, std_mortality_n / len(north)**0.5,
                               len(north) - 1, 0.05, 'two-sided')

mean_hardness_s = south['hardness'].mean()
std_hardness_s = south['hardness'].std(ddof=1)

interval_s_h = _tconfint_generic(mean_hardness_s, std_hardness_s / len(south)**0.5,
                                 len(south) - 1, 0.05, 'two-sided')
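
The four intervals above can also be computed through one small helper; a sketch reusing data, south and north as defined above:

def t_interval(series, alpha=0.05):
    # two-sided t-interval for the mean of a pandas Series
    se = series.std(ddof=1) / len(series) ** 0.5
    return _tconfint_generic(series.mean(), se, len(series) - 1, alpha, 'two-sided')

print(t_interval(data['mortality']))    # same as interval
print(t_interval(south['mortality']))   # same as interval_s
print(t_interval(north['mortality']))   # same as interval_n
print(t_interval(south['hardness']))    # same as interval_s_h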