def var_ratio_ci_est(data1, data2, alpha):
    """Return a two-sided confidence interval for a ratio of two variances.

    The point estimate ``variance(data1) / variance(data2)`` is divided by the
    ``1 - alpha/2`` and ``alpha/2`` quantiles of the F distribution with
    ``(len(data1) - 1, len(data2) - 1)`` degrees of freedom.

    Args:
        data1: Sample whose variance is the numerator of the ratio.
        data2: Sample whose variance is the denominator of the ratio.
        alpha: Total tail probability, split evenly between both tails.

    Returns:
        Tuple ``(lower, upper)`` of interval bounds (not rounded).
    """
    dfn = len(data1) - 1
    dfd = len(data2) - 1
    # f.ppf takes a left-tail probability, so alpha/2 yields the small quantile.
    small_quantile = f.ppf(alpha / 2, dfn, dfd)
    large_quantile = f.ppf(1 - alpha / 2, dfn, dfd)
    ratio = variance(data1) / variance(data2)
    # Dividing by the larger quantile gives the lower bound and vice versa.
    return ratio / large_quantile, ratio / small_quantile
def t_test(data1, data2=None, tail='both', mu=0, equal=True):
    """Run a one-sample or two-sample t-test.

    Args:
        data1: First sample.
        data2: Optional second sample. When ``None`` the mean of ``data1`` is
            tested against ``mu``; otherwise ``mean(data1) - mean(data2)`` is
            tested against ``mu``.
        tail: ``'both'``, ``'left'`` or ``'right'``.
        mu: Hypothesised mean (one sample) or mean difference (two samples).
        equal: Two-sample case only. ``True`` pools the two sample variances;
            ``False`` uses the unpooled standard error and the
            Welch-Satterthwaite degrees of freedom.

    Returns:
        Tuple ``(t_val, df, p)``.

    Raises:
        AssertionError: If ``tail`` is not one of the three accepted values.
    """
    assert tail in ['both', 'left', 'right'], \
        'tail should be one of "both","left","right"'

    if data2 is None:
        # One-sample test.
        size = len(data1)
        std_err = std(data1) / sqrt(size)
        t_val = (mean(data1) - mu) / std_err
        df = size - 1
    else:
        # Two-sample test.
        size1, size2 = len(data1), len(data2)
        diff = mean(data1) - mean(data2)
        var1, var2 = variance(data1), variance(data2)
        if equal:
            pooled_sd = sqrt(
                ((size1 - 1) * var1 + (size2 - 1) * var2) / (size1 + size2 - 2)
            )
            t_val = (diff - mu) / (pooled_sd * sqrt(1 / size1 + 1 / size2))
            df = size1 + size2 - 2
        else:
            part1 = var1 / size1
            part2 = var2 / size2
            t_val = (diff - mu) / sqrt(part1 + part2)
            df = (part1 + part2) ** 2 / (
                part1 ** 2 / (size1 - 1) + part2 ** 2 / (size2 - 1)
            )

    if tail == "left":
        p = t.cdf(t_val, df)
    elif tail == "right":
        p = 1 - t.cdf(t_val, df)
    else:
        p = 2 * (1 - t.cdf(abs(t_val), df))
    return t_val, df, p
def var_ratio_ci_est(data1, data2, alpha):
    """Confidence interval for the ratio of two unknown population variances.

    Args:
        data1: Sample whose variance is the numerator of the ratio.
        data2: Sample whose variance is the denominator of the ratio.
        alpha: Total tail probability, split evenly between both tails.

    Returns:
        Tuple ``(lower, upper)``, each bound rounded to 3 decimals.
    """
    dfn, dfd = len(data1) - 1, len(data2) - 1
    ratio = variance(data1) / variance(data2)
    # f.ppf takes a left-tail probability: alpha/2 gives the smaller quantile.
    lower = ratio / f.ppf(1 - alpha / 2, dfn, dfd)
    upper = ratio / f.ppf(alpha / 2, dfn, dfd)
    return round(lower, 3), round(upper, 3)
def var_ci_est(data, alpha):
    """Interval estimate of the variance when the population mean is unknown.

    Args:
        data: The sample.
        alpha: Total tail probability, split evenly between both tails.

    Returns:
        Tuple ``(lower, upper)``, each bound rounded to 2 decimals.
    """
    df = len(data) - 1
    scaled_var = df * variance(data)
    # chi2.ppf takes a left-tail probability: the 1 - alpha/2 quantile is the
    # larger one and therefore produces the lower bound.
    lower = scaled_var / chi2.ppf(1 - alpha / 2, df)
    upper = scaled_var / chi2.ppf(alpha / 2, df)
    return round(lower, 2), round(upper, 2)
def f_test(data1, data2, tail="both", ratio=1):
    """F-test of the ratio of two sample variances against ``ratio``.

    Args:
        data1: Sample whose variance is the numerator.
        data2: Sample whose variance is the denominator.
        tail: ``'both'``, ``'left'`` or ``'right'``.
        ratio: Hypothesised variance ratio; the statistic is divided by it.

    Returns:
        Tuple ``(f_val, df1, df2, p)``.

    Raises:
        AssertionError: If ``tail`` is not one of the three accepted values.
    """
    assert tail in ['both', 'left', 'right'], \
        'tail should be one of "both","left","right"'
    df1 = len(data1) - 1
    df2 = len(data2) - 1
    f_val = variance(data1) / variance(data2) / ratio

    left_area = f.cdf(f_val, df1, df2)
    if tail == "left":
        p = left_area
    elif tail == "right":
        p = 1 - left_area
    else:
        # Two-sided: double the smaller of the two tail areas.
        p = 2 * min(1 - left_area, left_area)
    return f_val, df1, df2, p
def simple_linear_reg(x, y): assert len(x) == len(y) n = len(x) assert n > 1 mean_x = mean(x) mean_y = mean(y) beta1 = covariance(x, y)/variance(x) beta0 = mean_y-beta1*mean_x y_hat = [beta0+beta1*e for e in x] ss_residual = sum((e1-e2)**2 for e1, e2 in zip(y, y_hat)) se_model = sqrt(ss_residual/(n-2)) t_value = beta1/(se_model/sqrt((n-1)*variance(x))) p = 2*(1-t.cdf(abs(t_value), n-2)) return beta0, beta1, t_value, n-2, p
def t_test(data1, data2=None, tail='both', mu=0.0, equal=True):
    """Run a one-sample or two-sample t-test.

    Args:
        data1: First sample.
        data2: Optional second sample. When ``None`` the mean of ``data1`` is
            tested against ``mu``; otherwise ``mean(data1) - mean(data2)`` is
            tested against ``mu``.
        tail: ``'both'``, ``'left'`` or ``'right'``.
        mu: Hypothesised mean (one sample) or mean difference (two samples).
        equal: Two-sample case only. ``True`` pools the two sample variances;
            ``False`` uses the unpooled standard error and the
            Welch-Satterthwaite degrees of freedom.

    Returns:
        Tuple ``(t_val, df, p)`` with ``t_val`` and ``df`` rounded to 2
        decimals and ``p`` left unrounded.

    Raises:
        AssertionError: If ``tail`` is not one of the three accepted values.
    """
    assert tail in ['both', 'left', 'right'], \
        'tail should be one of "both", "left", "right"'

    if data2 is None:
        # One population.
        mean_val = mean(data1)
        se = std(data1) / np.sqrt(len(data1))
        t_val = (mean_val - mu) / se
        df = len(data1) - 1
    else:
        # Two populations.
        n1 = len(data1)
        n2 = len(data2)
        mean_diff = mean(data1) - mean(data2)
        sample1_var = variance(data1)
        sample2_var = variance(data2)
        if equal:
            # Equal variances: pooled standard deviation.
            sw = np.sqrt(
                ((n1 - 1) * sample1_var + (n2 - 1) * sample2_var)
                / (n1 + n2 - 2)
            )
            # Fixed: the scale factor is sqrt(1/n1 + 1/n2); the original used
            # 1/n2 twice, which is wrong whenever the sample sizes differ.
            t_val = (mean_diff - mu) / (sw * np.sqrt(1 / n1 + 1 / n2))
            df = n1 + n2 - 2
        else:
            # Unequal variances.
            se = np.sqrt(sample1_var / n1 + sample2_var / n2)
            t_val = (mean_diff - mu) / se
            df = (sample1_var / n1 + sample2_var / n2) ** 2 / (
                (sample1_var / n1) ** 2 / (n1 - 1)
                + (sample2_var / n2) ** 2 / (n2 - 1)
            )

    if tail == 'both':
        # Two-tailed test.
        p = 2 * (1 - t.cdf(abs(t_val), df))
    elif tail == 'left':
        # Left-tailed test.
        p = t.cdf(t_val, df)
    else:
        # Right-tailed test.
        p = 1 - t.cdf(t_val, df)
    return round(t_val, 2), round(df, 2), p
def mean_diff_ci_t_est(data1, data2, alpha, equal=True):
    """Two-sided t confidence interval for a difference of two means.

    Args:
        data1: First sample.
        data2: Second sample.
        alpha: Total tail probability, split evenly between both tails.
        equal: ``True`` pools the sample variances with ``n1 + n2 - 2``
            degrees of freedom; ``False`` uses the unpooled standard error
            with Welch-Satterthwaite degrees of freedom.

    Returns:
        Tuple ``(lower, upper)`` around ``mean(data1) - mean(data2)``.
    """
    size1, size2 = len(data1), len(data2)
    centre = mean(data1) - mean(data2)
    var1, var2 = variance(data1), variance(data2)

    if equal:
        df = size1 + size2 - 2
        pooled_sd = sqrt(((size1 - 1) * var1 + (size2 - 1) * var2) / df)
        std_err = pooled_sd * sqrt(1 / size1 + 1 / size2)
    else:
        part1 = var1 / size1
        part2 = var2 / size2
        df = (part1 + part2) ** 2 / (
            part1 ** 2 / (size1 - 1) + part2 ** 2 / (size2 - 1)
        )
        std_err = sqrt(part1 + part2)

    # t.ppf(alpha/2) is the (negative) left-tail quantile; abs() flips it.
    margin = std_err * abs(t.ppf(alpha / 2, df))
    return centre - margin, centre + margin
def mean_diff_ci_t_est(data1, data2, alpha, equal=True):
    """Confidence interval for a mean difference, population variances unknown.

    Args:
        data1: First sample.
        data2: Second sample.
        alpha: Total tail probability, split evenly between both tails.
        equal: Whether the two unknown population variances are treated as
            equal (pooled) or unequal (Welch-Satterthwaite df).

    Returns:
        Tuple ``(lower, upper)``, each bound rounded to 2 decimals.
    """
    size1, size2 = len(data1), len(data2)
    centre = mean(data1) - mean(data2)
    var1, var2 = variance(data1), variance(data2)

    if equal:
        # Both variances unknown but equal: pool them.
        df = size1 + size2 - 2
        pooled_sd = np.sqrt(((size1 - 1) * var1 + (size2 - 1) * var2) / df)
        std_err = pooled_sd * np.sqrt(1 / size1 + 1 / size2)
    else:
        # Both variances unknown and unequal.
        part1 = var1 / size1
        part2 = var2 / size2
        df = (part1 + part2) ** 2 / (
            part1 ** 2 / (size1 - 1) + part2 ** 2 / (size2 - 1)
        )
        std_err = np.sqrt(part1 + part2)

    margin = std_err * abs(t.ppf(alpha / 2, df))
    return round(centre - margin, 2), round(centre + margin, 2)
def var_ci_est(data, alpha):
    """Confidence interval for the variance when the population mean is unknown.

    Args:
        data: The sample.
        alpha: Total tail probability, split evenly between both tails.

    Returns:
        Tuple ``(lower, upper)`` of interval bounds (not rounded).
    """
    df = len(data) - 1  # sample size minus one
    scaled_var = df * variance(data)  # (n - 1) * sample variance
    # chi2.ppf takes the area to the LEFT of the quantile (textbook
    # derivations often use the right-tail area instead), so alpha/2 gives
    # the small quantile and 1 - alpha/2 the large one.
    small_quantile = chi2.ppf(alpha / 2, df)
    large_quantile = chi2.ppf(1 - alpha / 2, df)
    return scaled_var / large_quantile, scaled_var / small_quantile
def f_test(data1, data2, tail='both', ratio=1):
    """F-test comparing the variances of two populations.

    Args:
        data1: Sample whose variance is the numerator.
        data2: Sample whose variance is the denominator.
        tail: ``'both'``, ``'left'`` or ``'right'``.
        ratio: Hypothesised variance ratio; the statistic is divided by it.

    Returns:
        Tuple ``(f_val, df1, df2, p)`` with ``f_val`` rounded to 4 decimals
        and ``p`` rounded to 5 decimals.

    Raises:
        AssertionError: If ``tail`` is not one of the three accepted values.
    """
    assert tail in ['both', 'left', 'right'], \
        'tail should be one of "both", "left", "right"'
    dfn = len(data1) - 1
    dfd = len(data2) - 1
    f_val = variance(data1) / variance(data2) / ratio

    left_area = f.cdf(f_val, dfn, dfd)
    if tail == 'left':
        # Left-tailed test.
        p = left_area
    elif tail == 'right':
        # Right-tailed test.
        p = 1 - left_area
    else:
        # Two-tailed test: double the smaller tail area.
        p = 2 * min(1 - left_area, left_area)
    return round(f_val, 4), dfn, dfd, round(p, 5)
def chi2_test(data, tail="both", sigma2=1):
    """Chi-square test of a sample variance against ``sigma2``.

    Args:
        data: The sample.
        tail: ``'both'``, ``'left'`` or ``'right'``.
        sigma2: Hypothesised variance.

    Returns:
        Tuple ``(chi2_val, df, p)`` with ``df = len(data) - 1``.

    Raises:
        AssertionError: If ``tail`` is not one of the three accepted values.
    """
    assert tail in ['both', 'left', 'right'], \
        'tail should be one of "both","left","right"'
    df = len(data) - 1
    chi2_val = df * variance(data) / sigma2

    left_area = chi2.cdf(chi2_val, df)
    if tail == "left":
        p = left_area
    elif tail == "right":
        p = 1 - left_area
    else:
        # Two-sided: double the smaller of the two tail areas.
        p = 2 * min(1 - left_area, left_area)
    return chi2_val, df, p
def chi2_test(data, tail='both', sigma2=1):
    """Chi-square variance test for a single population.

    Args:
        data: The sample.
        tail: ``'both'``, ``'left'`` or ``'right'``.
        sigma2: Hypothesised variance.

    Returns:
        Tuple ``(chi2_val, df, p)`` with ``chi2_val`` rounded to 2 decimals,
        ``df = len(data) - 1`` and ``p`` left unrounded.

    Raises:
        AssertionError: If ``tail`` is not one of the three accepted values.
    """
    assert tail in ['both', 'left', 'right'], \
        'tail should be one of "both", "left", "right"'
    df = len(data) - 1
    statistic = df * variance(data) / sigma2

    left_area = chi2.cdf(statistic, df)
    if tail == 'left':
        # Left-tailed test.
        p = left_area
    elif tail == 'right':
        # Right-tailed test.
        p = 1 - left_area
    else:
        # Two-tailed test: double the smaller tail area.
        p = 2 * min(1 - left_area, left_area)
    return round(statistic, 2), df, p
def var_ci_est(data, alpha):
    """Return a two-sided chi-square confidence interval for the variance.

    Args:
        data: The sample.
        alpha: Total tail probability, split evenly between both tails.

    Returns:
        Tuple ``(lower, upper)`` of interval bounds (not rounded).
    """
    df = len(data) - 1
    numerator = df * variance(data)
    # chi2.ppf takes a left-tail probability; the larger quantile
    # (1 - alpha/2) yields the lower bound and the smaller one the upper.
    tail_probs = (1 - alpha / 2, alpha / 2)
    lower, upper = (numerator / chi2.ppf(prob, df) for prob in tail_probs)
    return lower, upper
def var_one_sided_upper_ci_est(data, alpha):
    """Return a one-sided confidence interval with an upper variance bound.

    Args:
        data: The sample.
        alpha: Tail probability, placed entirely in one tail.

    Returns:
        Tuple ``(-inf, upper)``. The lower end is reported as ``-inf``.
    """
    df = len(data) - 1
    # chi2.ppf takes the left-tail area; the upper bound divides by the
    # lower quantile.
    low_quantile = chi2.ppf(alpha, df)
    upper = df * variance(data) / low_quantile
    return -inf, upper
def var_one_sided_lower_ci_est(data, alpha):
    """Return a one-sided confidence interval with a lower variance bound.

    Args:
        data: The sample.
        alpha: Tail probability, placed entirely in one tail.

    Returns:
        Tuple ``(lower, inf)``.
    """
    df = len(data) - 1
    # chi2.ppf takes the left-tail area; the lower bound divides by the
    # upper quantile.
    high_quantile = chi2.ppf(1 - alpha, df)
    lower = df * variance(data) / high_quantile
    return lower, inf
# 测试频率 print(frequency(data)) # 测试众数 print(mode(data)) # 测试中位数 print(median(data)) # 测试均值 print(mean(data)) # 测试极差 print(rng(data)) # 测试四分位数 print(quartile(data)) # 测试方差 print(variance(data)) # 测试标准差 print(std(data))
import random
from playStats.descriptive_stats import mean, variance
import matplotlib.pyplot as plt

if __name__ == "__main__":
    # Plot the sample mean and sample variance of standard-normal draws
    # against the sample size.
    sizes = list(range(20, 10001, 50))
    means = []
    variances = []
    for size in sizes:
        draws = [random.gauss(0.0, 1.0) for _ in range(size)]
        means.append(mean(draws))
        variances.append(variance(draws))

    for series in (means, variances):
        plt.plot(sizes, series)
    plt.show()
# Demo: histogram of squared standard-normal draws (chi-square, one degree of
# freedom). The commented-out alternatives switch the demo to other
# distributions.
# NOTE(review): `plt` is not imported in this snippet; confirm that
# matplotlib.pyplot is imported elsewhere in the file.
import random
from playStats.descriptive_stats import mean
from playStats.descriptive_stats import variance

chi2 = []
for i in range(50000):
    #x = random.random() # returns a float in the half-open interval [0.0, 1.0)
    x1 = random.normalvariate(0, 1) # draw from a normal distribution with mean 0 and sigma 1
    x2 = random.normalvariate(0, 1)
    x3 = random.normalvariate(0, 1)
    x4 = random.normalvariate(0, 1)
    x5 = random.normalvariate(0, 1)
    x6 = random.normalvariate(0, 1)
    x7 = random.normalvariate(0, 1)
    x8 = random.normalvariate(0, 1)
    chi2.append(x1**2) # demo: chi-square distribution with one degree of freedom
    #chi2.append(x1**2+x2**2+x3**2+x4**2+x5**2+x6**2+x7**2+x8**2) # demo: chi-square distribution with several degrees of freedom
    #chi2.append(random.normalvariate(0,1)) # demo: normal distribution
    #chi2.append(x) # demo: uniform distribution
print(variance(chi2)) # print the variance of the collected values
#plt.figure(num= "不同自由度卡方分布图") # figure title: "chi-square distributions with different degrees of freedom"
plt.hist(chi2, bins=30)
plt.show()
'''
x = np.linspace(-5,5,1000)
f = 1/np.sqrt(2*np.pi)*np.exp(-x**2/2)
plt.plot(x,f)
plt.show()
'''
总体均值未知,求方差的置信空间 data为传入的样本; alpha为需要传入的置信水平的值 """ n = len(data) #求样本容量 s2 = variance(data) #求样本方差 chi2_lower_value = chi2.ppf( alpha / 2, n - 1) #求坐标左侧Z面积,没错你没看错,因为数学证明的过程中是以右侧为基准的,但是scipy是以左侧为基准的 chi2_upper_value = chi2.ppf(1 - alpha / 2, n - 1) #求坐标右侧Z面积 return (n - 1) * s2 / chi2_upper_value, (n - 1) * s2 / chi2_lower_value if __name__ == '__main__': salary_18 = [1484, 785, 1598, 1366, 1716, 1020, 1716, 785, 3113, 1601] #18岁月收入数据 salary_35 = [902, 4508, 3809, 3923, 4276, 2065, 1601, 553, 3345, 2182] #35岁月收入数据 print(mean(salary_18)) #平均月收入的点估计 print(mean_ci_est(salary_18, 0.05)) #平均月收入的区间估计 print(mean(salary_35)) #平均月收入的点估计 print(mean_ci_est(salary_35, 0.05)) #平均月收入的区间估计 print() print(std(salary_18)) #整体方差的点估计开根 print(variance(salary_18)) #整体方差的点估计(样本方差) print(var_ci_est(salary_18, 0.05)) #区间估计 print(std(salary_35)) #整体方差的点估计开根 print(variance(salary_35)) #整体方差的点估计(样本方差) print(var_ci_est(salary_35, 0.05)) #区间估计 print()
import random
from playStats.descriptive_stats import mean
from playStats.descriptive_stats import variance
import matplotlib.pyplot as plt

if __name__ == '__main__':
    # Plot the sample mean and sample variance of standard-normal draws
    # against the sample size, with horizontal reference lines at 0 and 1.
    sizes = list(range(20, 10001, 50))
    means = []
    variances = []
    for size in sizes:
        draws = [random.gauss(0.0, 1.0) for _ in range(size)]
        means.append(mean(draws))
        variances.append(variance(draws))

    # Each series is followed by its reference line, in the original order.
    for series, level, colour in ((means, 0, 'r'), (variances, 1, 'b')):
        plt.plot(sizes, series)
        plt.axhline(level, c=colour)
    plt.show()