def MC_simulation(times):
    # run `times` independent simulations (simulation, stats_array_by_case,
    # stats_graph and n_days are defined elsewhere in this script)
    sim = []
    for id in range(times):
        # print(f'\n-- {id+1} SIMULATION -- {id+1} SIMULATION -- {id+1} SIMULATION --\n')
        sim.append(simulation(id + 1))

    # per-day mean and standard error for each case type
    mean_a_days, std_a_days = stats_array_by_case(sim, n_days, times, 0)
    mean_he_days, std_he_days = stats_array_by_case(sim, n_days, times, 1)
    mean_l_days, std_l_days = stats_array_by_case(sim, n_days, times, 2)
    mean_de_days, std_de_days = stats_array_by_case(sim, n_days, times, 3)
    mean_re_days, std_re_days = stats_array_by_case(sim, n_days, times, 4)
    stats_graph(mean_a_days, mean_he_days, mean_l_days, mean_de_days, mean_re_days,
                'Monte-Carlo results after ' + str(times) + ' simulations')

    # t-interval estimates (the population variance is unknown)
    day_check = int(input('Enter the day number for which you want to see statistics: '))
    print(f'\nActual Active case mean on day {day_check}: ', mean_a_days[day_check])
    print(f'Active case mean 95% confidence interval on day {day_check}: ',
          _tconfint_generic(mean_a_days[day_check], std_a_days[day_check],
                            times - 1, 0.05, 'two-sided'))

    print(f'\nActual Health case mean on day {day_check}: ', mean_he_days[day_check])
    print(f'Health case mean 95% confidence interval on day {day_check}: ',
          _tconfint_generic(mean_he_days[day_check], std_he_days[day_check],
                            times - 1, 0.05, 'two-sided'))

    print(f'\nActual Latent case mean on day {day_check}: ', mean_l_days[day_check])
    print(f'Latent case mean 95% confidence interval on day {day_check}: ',
          _tconfint_generic(mean_l_days[day_check], std_l_days[day_check],
                            times - 1, 0.05, 'two-sided'))

    print(f'\nActual Death case mean on day {day_check}: ', mean_de_days[day_check])
    print(f'Death case mean 95% confidence interval on day {day_check}: ',
          _tconfint_generic(mean_de_days[day_check], std_de_days[day_check],
                            times - 1, 0.05, 'two-sided'))

    print(f'\nActual Recover case mean on day {day_check}: ', mean_re_days[day_check])
    print(f'Recover case mean 95% confidence interval on day {day_check}: ',
          _tconfint_generic(mean_re_days[day_check], std_re_days[day_check],
                            times - 1, 0.05, 'two-sided'))
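# The per-day interval logic above, isolated as a self-contained sketch (the
# simulation helpers live elsewhere in the script, so the per-simulation counts
# here are randomly generated stand-ins):
import numpy as np
from statsmodels.stats.weightstats import _tconfint_generic

active_on_day = np.random.default_rng(0).poisson(40, size=100)  # 100 simulated runs
m = active_on_day.mean()
se = active_on_day.std(ddof=1) / np.sqrt(len(active_on_day))
print(_tconfint_generic(m, se, len(active_on_day) - 1, 0.05, 'two-sided'))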
def kfold_on_model(model, model_name, x_train, y_train,
                   augmentation=0, vgg_prep=0, batch_norm=0):
    auc_scores = list()
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=3)
    for i, (train, test) in enumerate(kfold.split(x_train, y_train)):
        X_t = x_train[train]
        Y_t = y_train[train]
        X_v = x_train[test]
        Y_v = y_train[test]

        # count positive labels in each split (the counts are not used further)
        count = 0
        for each in Y_t:
            if each == 1:
                count += 1
        count = 0
        for each in Y_v:
            if each == 1:
                count += 1

        (pred, learning_time), time_on_single = model(X_t, Y_t, X_v, Y_v, i, model_name,
                                                      augmentation, vgg_prep, batch_norm)
        fpr, tpr, thresholds = roc_curve(Y_v, pred)
        auc_on_model = roc_auc_score(Y_v, pred)
        auc_scores.append(auc_on_model)
        add_stats_to_csv(model_name, auc_on_model, time_on_single, learning_time, i)

        # per-fold ROC curve
        plt.plot([0, 1], [0, 1], 'k--')
        plt.plot(fpr, tpr)
        plt.savefig(str(i) + model_name + "_roc_curve.png")
        plt.close()

    # 95% t-interval for the mean AUC across folds
    auc_scores = np.array(auc_scores)
    mean_std = auc_scores.std(ddof=1) / sqrt(len(auc_scores))
    beg, end = _tconfint_generic(auc_scores.mean(), mean_std,
                                 len(auc_scores) - 1, 0.05, 'two-sided')
    df = pd.read_csv('Confidence_intervals.csv')
    new_item = pd.DataFrame([[model_name, beg, end]],
                            columns=['Name', 'Start_of_interval', 'End_of_interval'])
    df = pd.concat([df, new_item])
    df.to_csv("Confidence_intervals.csv", index=False)
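# The CI-over-folds step in isolation, with five invented per-fold AUC scores
# (a sketch, not the output of the pipeline above):
from math import sqrt

import numpy as np
from statsmodels.stats.weightstats import _tconfint_generic

auc_scores = np.array([0.91, 0.88, 0.93, 0.90, 0.89])  # one AUC per fold
se = auc_scores.std(ddof=1) / sqrt(len(auc_scores))
print(_tconfint_generic(auc_scores.mean(), se, len(auc_scores) - 1, 0.05, 'two-sided'))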
def confidence_interval(actual_price, predicted_price):
    # 95% t-interval for the mean error of the model's predictions
    reg_error = (actual_price - predicted_price)
    err_mean = reg_error.mean()
    err_mean_std = reg_error.std(ddof=1) / np.sqrt(len(reg_error))
    return _tconfint_generic(err_mean, err_mean_std,
                             len(reg_error) - 1, 0.05, 'two-sided')
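# A minimal usage sketch for confidence_interval, with invented prices:
import numpy as np
from statsmodels.stats.weightstats import _tconfint_generic

actual = np.array([100.0, 120.0, 90.0, 110.0, 105.0])
predicted = np.array([98.0, 125.0, 88.0, 108.0, 100.0])
print(confidence_interval(actual, predicted))  # 95% CI for the mean error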
def _add_point_estimate_ci(self, df: DataFrame):
    df[CI_LOWER], df[CI_UPPER] = _tconfint_generic(
        mean=df[POINT_ESTIMATE],
        std_mean=np.sqrt(df[VARIANCE] / df[self._denominator]),
        dof=df[self._denominator] - 1,
        alpha=1 - self._interval_size,
        alternative=TWO_SIDED)
    return df
def conf_int(self, value=None, alpha=0.05, alternative="two-sided"):
    """
    Confidence interval for the probability that sample 1 has larger values.

    The confidence interval is for the shifted probability

        P(x1 > x2) + 0.5 * P(x1 = x2) - value

    Parameters
    ----------
    value : float
        Value, default 0, shifts the confidence interval,
        e.g. ``value=0.5`` centers the confidence interval at zero.
    alpha : float
        Significance level for the confidence interval, coverage is
        ``1 - alpha``.
    alternative : str
        The alternative hypothesis, H1, has to be one of the following

        * 'two-sided' : H1: ``prob - value`` not equal to 0.
        * 'larger' :    H1: ``prob - value > 0``
        * 'smaller' :   H1: ``prob - value < 0``

    Returns
    -------
    lower : float or ndarray
        Lower confidence limit. This is -inf for the one-sided
        alternative "smaller".
    upper : float or ndarray
        Upper confidence limit. This is inf for the one-sided
        alternative "larger".
    """
    p0 = value
    if p0 is None:
        p0 = 0
    diff = self.prob1 - p0
    std_diff = np.sqrt(self.var / self.nobs)
    if self.use_t is False:
        return _zconfint_generic(diff, std_diff, alpha, alternative)
    else:
        return _tconfint_generic(diff, std_diff, self.df, alpha, alternative)
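# The method above reads self.prob1, self.var, self.nobs, self.df and self.use_t,
# which matches statsmodels' RankCompareResult; a usage sketch, assuming
# statsmodels >= 0.12 where rank_compare_2indep is available:
import numpy as np
from statsmodels.stats.nonparametric import rank_compare_2indep

x1 = np.array([3, 5, 6, 7, 9])
x2 = np.array([2, 4, 4, 5, 6])
res = rank_compare_2indep(x1, x2)
print(res.conf_int(alpha=0.05))  # CI for P(x1 > x2) + 0.5 * P(x1 = x2)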
def clean_from_rare_dummies(ser, prefix, typ="train", clean=True, count_na=False,
                            other_name="other", alpha=0.03, train_feature_names=[]):
    dummies = pd.get_dummies(ser, prefix=prefix, dummy_na=count_na)
    if clean:  # merge rare dummy categories into one "other" column
        col_mean = dummies.mean(axis=0)  # share of each category
        if typ == "train":
            # lower bound of a t-interval for the mean category share;
            # categories with a smaller share are treated as rare
            minimal_share_of_category = _tconfint_generic(
                np.mean(col_mean),
                np.std(col_mean, ddof=1) / np.sqrt(len(col_mean)),
                len(col_mean) - 1, alpha, 'two-sided')[0]
            colnames_to_save = dummies.columns[col_mean >= minimal_share_of_category]
            colnames_to_drop = dummies.columns[col_mean < minimal_share_of_category]
        else:
            # when building dummies for test data, keep exactly the features
            # that were present in the train set
            colnames_to_save = dummies.columns[dummies.columns.isin(train_feature_names)]
            colnames_to_drop = dummies.columns[~dummies.columns.isin(train_feature_names)]
        print("=====FEATURE " + prefix + "=====")
        print("total_categories:" + str(len(col_mean)))
        print("categories_to_save:" + str(len(colnames_to_save)))
        print("categories_to_drop:" + str(len(colnames_to_drop)))
        print("list_to_drop:", list(colnames_to_drop))
        if len(colnames_to_drop) > 0:
            dummies[prefix + "_" + other_name] = dummies[colnames_to_drop].sum(axis=1)
            return pd.concat([dummies[colnames_to_save],
                              dummies[prefix + "_" + other_name]], axis=1)
        else:
            return dummies[colnames_to_save]
    else:
        return dummies
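# A toy run of clean_from_rare_dummies (data invented so that exactly one
# category is rare enough to be folded into the "other" column):
import numpy as np
import pandas as pd
from statsmodels.stats.weightstats import _tconfint_generic

cats = pd.Series(list('abcdefghi') * 11 + ['z'])  # 'z' appears once in 100 rows
train_dummies = clean_from_rare_dummies(cats, 'cat')
print(train_dummies.columns.tolist())  # 'cat_z' is merged into 'cat_other'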
# check how many passengers in the sample paid for their ride with cash
paid_cash = trips.payment_type == 2
print("Number of passengers who paid cash:", paid_cash.sum())

# build a 99% confidence interval for the proportion of cash payers
confidence_interval = proportion_confint(paid_cash.sum(), paid_cash.size, alpha=0.01)
print("Confidence interval:", confidence_interval)

trips_distance = trips["trip_distance"]

# estimate the average trip distance in miles
avg_distance = trips_distance.mean()
print("Avg trip distance:", avg_distance)

# sample standard deviation of the trip distance
dist_std_deviation = trips_distance.std(ddof=1)
print("Std dev:", dist_std_deviation)

# calculate a 95% confidence interval for the mean trip distance
s = trips_distance.std(ddof=1) / np.sqrt(trips_distance.size)  # standard error of the mean
print("Std dev of estimator:", s)
mean_confidence_interval = _tconfint_generic(trips_distance.mean(), s,
                                             trips_distance.size - 1, 0.05, "two-sided")
print("Mean confidence interval:", mean_confidence_interval)
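# Optional sanity check (an addition, assuming scipy is installed): scipy's
# two-sided t-interval should reproduce the _tconfint_generic result above.
from scipy import stats

n = trips_distance.size
print("scipy cross-check:",
      stats.t.interval(0.95, n - 1, loc=trips_distance.mean(), scale=s))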
from math import sqrt

import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.stats.weightstats import _tconfint_generic

df = pd.read_csv('water.txt', delimiter='\t')

# 95% t-interval for mean mortality over all towns
std = df['mortality'].std(ddof=1) / sqrt(df['mortality'].count())
mean = df['mortality'].mean()
print('Mortality 95%% interval: %s' % str(
    _tconfint_generic(mean, std, df['mortality'].shape[0] - 1, 0.05, 'two-sided')))

# the same interval for the southern towns only
water_data_south = df[df.location == 'South']
mort_mean_south = water_data_south['mortality'].mean()
mort_mean_south_std = water_data_south['mortality'].std(ddof=1) / np.sqrt(
    water_data_south['mortality'].shape[0])
print('Mortality south 95%% interval: %s' % str(
    _tconfint_generic(mort_mean_south, mort_mean_south_std,
                      water_data_south['mortality'].shape[0] - 1, 0.05, 'two-sided')))

# sample size needed for a 95% z-interval of half-width 0.1 (in units of sigma):
# n = (z_{0.975} / 0.1)^2, rounded up
print(np.ceil((stats.norm.ppf(1 - 0.05 / 2) / 0.1) ** 2))
def confidential_bounds(frame):
    std = frame.std(ddof=1)
    mean = frame.mean()
    count = frame.shape[0]
    return _tconfint_generic(mean, std / np.sqrt(count), count - 1, 0.05, "two-sided")
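# A quick usage sketch for confidential_bounds, with an invented pandas Series:
import numpy as np
import pandas as pd
from statsmodels.stats.weightstats import _tconfint_generic

sample = pd.Series([12.1, 9.8, 11.4, 10.9, 13.0, 10.2])
print(confidential_bounds(sample))  # two-sided 95% t-interval for the mean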
def _ci(self, row, alpha_column: str) -> Tuple[float, float]:
    return _tconfint_generic(mean=row[DIFFERENCE],
                             std_mean=row[STD_ERR],
                             dof=self._dof(row),
                             alpha=row[alpha_column],
                             alternative=row[PREFERENCE])
# z-confidence interval for the mean
from statsmodels.stats.weightstats import zconfint

x = [1, 2, 1, 1, 1]
zint = zconfint(x, alpha=0.05, alternative='two-sided', ddof=1.0)
# (0.8080072030919891, 1.5919927969080108)

# t-confidence interval for the mean
import math

import numpy as np
from statsmodels.stats.weightstats import _tconfint_generic

x = np.array([1, 2, 0, 3, 1, 1, 2, 4, 5, 6])  # ndarray, so .mean()/.std() work
n = len(x)
mean = x.mean()
sigma = x.std(ddof=1) / math.sqrt(n)
_tconfint_generic(mean, sigma, n - 1, 0.05, 'two-sided')
# (1.0994, 3.9006) -- 95% confidence interval for the mean

# confidence interval for a proportion
# (n_positive and n_all are the number of successes and the sample size)
from statsmodels.stats.proportion import proportion_confint

normal_interval = proportion_confint(n_positive, n_all, alpha=0.05,
                                     method='normal')  # 95% confidence interval

# sample size needed for an interval of a given width
from statsmodels.stats.proportion import samplesize_confint_proportion

n_samples = samplesize_confint_proportion(random_sample.mean(),
                                          half_length=0.01, alpha=0.05)
n_samples = int(np.ceil(n_samples))  # gives an interval of width 0.02
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 16 15:32:25 2018

@author: kazantseva
"""
import pandas as pd
from statsmodels.stats.weightstats import _tconfint_generic

data = pd.read_csv('water.txt', sep='\t')

# 95% t-interval for mean mortality over the whole sample
mean_mortality = data['mortality'].mean()
std_mortality = data['mortality'].std(ddof=1)
interval = _tconfint_generic(mean_mortality, std_mortality / len(data) ** 0.5,
                             len(data) - 1, 0.05, 'two-sided')

# the same interval for the southern and northern towns separately
south = data[data['location'] == 'South']
north = data[data['location'] == 'North']

interval_s = _tconfint_generic(south['mortality'].mean(),
                               south['mortality'].std(ddof=1) / len(south) ** 0.5,
                               len(south) - 1, 0.05, 'two-sided')

interval_n = _tconfint_generic(north['mortality'].mean(),
                               north['mortality'].std(ddof=1) / len(north) ** 0.5,
                               len(north) - 1, 0.05, 'two-sided')

# 95% t-interval for mean water hardness in the South
interval_s_h = _tconfint_generic(south['hardness'].mean(),
                                 south['hardness'].std(ddof=1) / len(south) ** 0.5,
                                 len(south) - 1, 0.05, 'two-sided')
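# Printing the four intervals side by side makes the South/North comparison
# explicit (a small addition; the variables come from the script above):
for label, ci in [('all towns', interval), ('South', interval_s),
                  ('North', interval_n), ('South hardness', interval_s_h)]:
    print('%s: (%.2f, %.2f)' % (label, ci[0], ci[1]))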