예제 #1
0
def t_test(data1, data2=None, tail='both', mu=0, equal=True):
    """Run a one-sample or two-sample t-test.

    With ``data1`` only, the sample mean is compared against ``mu``.  With
    ``data2`` as well, the difference of the two sample means is compared
    against ``mu``, using the pooled-variance statistic when ``equal`` is
    true and the unequal-variance statistic (with its fractional degrees
    of freedom) otherwise.

    ``tail`` selects a two-sided ('both'), lower ('left') or upper
    ('right') p-value.  Returns the tuple ``(t_val, df, p)``.
    """
    assert tail in ['both', 'left',
                    'right'], 'tail should be one of "both","left","right"'

    if data2 is not None:
        size1, size2 = len(data1), len(data2)
        delta = mean(data1) - mean(data2)
        var1 = variance(data1)
        var2 = variance(data2)
        if not equal:
            # Unequal variances: each sample contributes var / n.
            part1 = var1 / size1
            part2 = var2 / size2
            stat = (delta - mu) / sqrt(part1 + part2)
            dof = (part1 + part2)**2 / (
                part1**2 / (size1 - 1) + part2**2 / (size2 - 1))
        else:
            # Equal variances: pool both samples into one estimate.
            dof = size1 + size2 - 2
            pooled_sd = sqrt(((size1 - 1) * var1 + (size2 - 1) * var2) / dof)
            stat = (delta - mu) / (pooled_sd * sqrt(1 / size1 + 1 / size2))
    else:
        size = len(data1)
        centre = mean(data1)
        std_err = std(data1) / sqrt(size)
        stat = (centre - mu) / std_err
        dof = size - 1

    if tail == "left":
        p_value = t.cdf(stat, dof)
    else:
        two_sided = tail == "both"
        upper = 1 - t.cdf(abs(stat) if two_sided else stat, dof)
        p_value = 2 * upper if two_sided else upper
    return stat, dof, p_value
예제 #2
0
def mean_diff_ci_z_est(data1,data2,alpha,sigma1,sigma2):
    """Confidence interval for a difference of means with known sigmas.

    Args:
        data1, data2: the two samples.
        alpha: significance level; ``norm.ppf(alpha/2)`` supplies the
            critical value, so the interval has confidence ``1 - alpha``.
        sigma1, sigma2: known population standard deviations.

    Returns:
        ``(lower, upper)`` bounds for ``mean(data1) - mean(data2)``.
    """
    n1 = len(data1)
    n2 = len(data2)
    mean_diff = mean(data1) - mean(data2)
    z_value = abs(norm.ppf(alpha/2))
    # The standard error needs the variances (sigma squared), matching the
    # other z-based helpers that take standard deviations as ``sigma``.
    margin = sqrt(sigma1**2 / n1 + sigma2**2 / n2) * z_value
    return mean_diff - margin, mean_diff + margin
def anova_twoway(data):
    """Two-way analysis of variance for a 2x2 design.

    ``data`` is converted to a 2-D array whose rows are the treatment
    cells; the indexing below (``group_means[3]``, ``reshape(2, 2)``)
    requires exactly four rows, and every row has the same number of
    observations because the sizes are taken from the column count.
    Rows 0-1 and rows 2-3 form the two levels of the first factor; rows
    0/2 and rows 1/3 form the two levels of the second factor.

    Returns a 4x5 ``float32`` array with one row each for the first
    factor, the second factor, their interaction, and the residuals;
    the columns are degrees of freedom, sum of squares, mean square,
    F statistic and p-value.  The residual row holds ``None`` for F and
    p before the cast (presumably NaN afterwards - TODO confirm).

    NOTE(review): ``sum`` and ``mean`` are not defined in this block; if
    ``sum`` is the builtin, ``n`` is a length-1 array rather than a
    scalar - confirm which one is in scope.
    """
    r, s = 2, 2
    data = np.array(data)
    group_szs = np.tile(np.size(data, axis=1), (np.size(data, axis=0), 1))
    n = sum(group_szs)  # total number of observations

    # Means: per cell, overall (size-weighted), and per level of each factor
    group_means = np.mean(data, axis=1)
    group_mean = group_means.dot(group_szs) / n
    group_i_means = np.array([mean(group_means[:2]), mean(group_means[2:])])
    group_j_means = np.array([(group_means[0] + group_means[2]) / 2,
                              (group_means[1] + group_means[3]) / 2])

    # Main effect of each level of factor i and factor j
    group_i_effect = group_i_means - group_mean
    group_j_effect = group_j_means - group_mean
    # Interaction effect: cell mean minus grand mean minus both main effects
    group_ij_effect = (group_means.reshape(2, 2) - np.tile(
        group_mean,
        (2, 2))) - np.tile(group_i_effect,
                           (2, 1)).T - np.tile(group_j_effect, (2, 1))

    # Total variation (computed here but not used further in this function)
    sst = np.sum((data - group_mean)**2)
    # Variation attributed to the first factor
    ss_method = ((group_i_means - group_mean)**2).dot(
        [np.sum(group_szs[:2]), np.sum(group_szs[2:])])
    # Variation attributed to the second factor
    ss_reward = ((group_j_means - group_mean)**2).dot([
        np.sum([group_szs[0], group_szs[2]]),
        np.sum([group_szs[1], group_szs[3]])
    ])
    # Variation attributed to the interaction of the two factors
    ss_mr = (group_ij_effect.reshape(1, 4)**2).dot(group_szs)
    # Variation left over within the cells (residual)
    ss_error = np.sum((data - group_means.reshape(-1, 1))**2)

    # Residual mean square
    ms_error = ss_error / (n - r * s)
    # First factor: mean square, F statistic, p-value
    ms_method = ss_method / (r - 1)
    f_ms_method = ms_method / ms_error
    p_ms_method = 1 - f.cdf(f_ms_method, r - 1, n - r * s)
    # Second factor: mean square, F statistic, p-value
    # (uses r - 1 as the degrees of freedom; r == s == 2 here)
    ms_reward = ss_reward / (r - 1)
    f_ms_reward = ms_reward / ms_error
    p_ms_reward = 1 - f.cdf(f_ms_reward, r - 1, n - r * s)
    # Interaction: mean square, F statistic, p-value
    # (again r - 1 degrees of freedom, which is 1 for this 2x2 layout)
    ms_mr = ss_mr / (r - 1)
    f_ms_mr = ms_mr / ms_error
    p_ms_mr = 1 - f.cdf(f_ms_mr, r - 1, n - r * s)

    # Assemble the rows of the output table
    method = [r - 1, ss_method, ms_method, f_ms_method, p_ms_method]
    reward = [r - 1, ss_reward, ms_reward, f_ms_reward, p_ms_reward]
    mr = [r - 1, ss_mr, ms_mr, f_ms_mr, p_ms_mr]
    residuals = [n - r * s, ss_error, ms_error, None, None]

    return np.array([method, reward, mr, residuals]).astype(np.float32)
def z_test(data1, data2=None, tail="both", mu=0.0, sigma1=1.0, sigma2=None):
    """Run a one-sample or two-sample z-test with known sigmas.

    With ``data1`` only, the sample mean is compared against ``mu`` using
    ``sigma1``.  With ``data2`` as well, the difference of the sample
    means is compared against ``mu`` and ``sigma2`` must be given.

    Returns ``(z, p)`` where ``z`` is rounded to two decimals and ``p``
    is the p-value for the requested ``tail``.
    """
    assert tail in ['both', 'left',
                    'right'], 'tail should be one of "both", "left", "right"'

    if data2 is not None:
        # Two populations
        assert sigma2 is not None
        centre = mean(data1) - mean(data2)
        std_err = np.sqrt(sigma1**2 / len(data1) + sigma2**2 / len(data2))
    else:
        # Single population
        centre = mean(data1)
        std_err = sigma1 / np.sqrt(len(data1))
    z_val = (centre - mu) / std_err

    if tail == 'left':
        p = norm.cdf(z_val)
    elif tail == 'both':
        p = 2 * (1 - norm.cdf(abs(z_val)))
    else:
        p = 1 - norm.cdf(z_val)

    return round(z_val, 2), p
예제 #5
0
def mean_diff_ci_z_est(data1, data2, alpha, sigma1, sigma2):
    """Confidence interval for a difference of means, sigmas known.

    Uses the normal critical value ``|norm.ppf(alpha/2)|`` and the
    standard error ``sqrt(sigma1**2/n1 + sigma2**2/n2)``.  Both bounds
    are rounded to two decimals and returned as ``(lower, upper)``.
    """
    size1, size2 = len(data1), len(data2)
    centre = mean(data1) - mean(data2)
    critical = abs(norm.ppf(alpha/2))
    std_err = np.sqrt(sigma1**2/size1 + sigma2**2/size2)
    lower = round(centre - std_err * critical, 2)
    upper = round(centre + std_err * critical, 2)
    return lower, upper
예제 #6
0
def mean_one_sided_lower_ci_est(data,alpha,sigma = None):
    """One-sided confidence interval for the mean, bounded below.

    When ``sigma`` is ``None`` the bound uses ``std(data)`` with a t
    critical value on ``n - 1`` degrees of freedom; otherwise it uses
    ``sigma`` with a standard-normal critical value.

    Returns ``(lower, inf)``.
    """
    size = len(data)
    if sigma is not None:
        # Known sigma: standard normal quantile
        critical = abs(norm.ppf(alpha))
        return mean(data) - sigma/sqrt(size) * critical,inf

    # Unknown sigma: t quantile with the sample's own spread
    critical = abs(t.ppf(alpha,size-1))
    spread = std(data)
    return  mean(data) - spread / sqrt(size) * critical,inf
예제 #7
0
def variance_bias(data):
    """Return the biased variance of ``data`` (squared deviations over n).

    Returns ``None`` when ``data`` is ``None`` or has fewer than two
    elements.
    """
    if data is None:
        return None
    count = len(data)
    if count <= 1:
        return None
    centre = mean(data)
    total = 0
    for value in data:
        total += (value - centre)**2
    return total / count
예제 #8
0
def anova_oneway(data):
    """One-way analysis of variance over a sequence of groups.

    ``data`` is a sequence of groups (each a sequence of numbers).  The
    between-group and within-group mean squares are formed from the
    group means, sizes and the size-weighted grand mean.

    Returns ``(f_value, dfg, dfe, p)``.
    """
    k = len(data)
    assert k > 1

    means = []
    sizes = []
    for grp in data:
        means.append(mean(grp))
        sizes.append(len(grp))
    n = sum(sizes)
    assert n > k

    # Grand mean as the size-weighted average of the group means
    weighted_total = 0
    for grp_mean, grp_size in zip(means, sizes):
        weighted_total += grp_mean * grp_size
    grand = weighted_total / n

    # Total sum of squares, accumulated group by group
    total_ss = 0
    for grp in data:
        within = 0
        for obs in grp:
            within += (obs - grand)**2
        total_ss += within

    # Between-group sum of squares
    between_ss = 0
    for grp_mean, grp_size in zip(means, sizes):
        between_ss += (grp_mean - grand)**2 * grp_size

    error_ss = total_ss - between_ss

    dfg, dfe = k - 1, n - k
    f_value = (between_ss / dfg) / (error_ss / dfe)
    p = 1 - f.cdf(f_value, dfg, dfe)

    return f_value, dfg, dfe, p
def anova_oneway(data):
    """One-way analysis of variance.

    Args:
        data: a sequence of groups, each a sequence of numbers.  Groups
            may have different sizes.

    Returns:
        ``(f_value, dfg, dfe, p)`` where ``f_value`` is rounded to two
        decimals, ``dfg``/``dfe`` are the between-group and within-group
        degrees of freedom and ``p`` is the upper-tail probability of
        the F distribution.

    Raises:
        AssertionError: if there are fewer than two groups, or the total
            number of observations does not exceed the number of groups.
    """
    k = len(data)  # number of groups
    assert k > 1, '数据量得大于1'

    group_means = [mean(group) for group in data]
    group_szs = [len(group) for group in data]
    n = sum(group_szs)  # total number of observations across all groups
    assert n > k

    grand_mean = sum(
        group_mean * group_sz
        for group_mean, group_sz in zip(group_means, group_szs)) / n

    # Work group by group instead of building one 2-D array from ``data``:
    # a rectangular array only exists when every group has the same size.
    ssg = sum(group_sz * (group_mean - grand_mean)**2
              for group_mean, group_sz in zip(group_means, group_szs))
    sse = sum(
        np.sum((np.asarray(group, dtype=float) - group_mean)**2)
        for group, group_mean in zip(data, group_means))

    dfg = k - 1
    dfe = n - k

    msg = ssg / dfg
    mse = sse / dfe

    f_value = msg / mse
    # The survival function keeps precision for very small p-values.
    p = f.sf(f_value, dfg, dfe)

    return round(f_value, 2), dfg, dfe, p
예제 #10
0
def variance_bias(data):
    """Biased variance: sum of squared deviations divided by n.

    Returns ``None`` when ``data`` has fewer than two elements.
    """
    count=len(data)
    if count>1:
        centre=mean(data)
        squared=[(e-centre)**2 for e in data]
        return sum(squared)/count
    return None
예제 #11
0
def sample(num_of_samples,sample_sz):
    '''
    Draw ``num_of_samples`` samples of ``sample_sz`` values each from the
    uniform distribution on [0, 1] and return the list of sample means.
    '''
    return [
        mean([random.uniform(0.0,1.0) for _ in range(sample_sz)])
        for _ in range(num_of_samples)
    ]
예제 #12
0
def t_test(data1, data2=None, tail='both', mu=0.0, equal=True):
    """Run a one-sample or two-sample t-test.

    Args:
        data1: first sample.
        data2: optional second sample; when given, the difference of the
            two sample means is tested against ``mu``.
        tail: 'both', 'left' or 'right' - which tail the p-value covers.
        mu: hypothesised mean (one sample) or mean difference (two samples).
        equal: for two samples, use the pooled-variance statistic when
            true and the unequal-variance statistic otherwise.

    Returns:
        ``(t_val, df, p)`` with ``t_val`` and ``df`` rounded to two decimals.

    Raises:
        AssertionError: if ``tail`` is not one of the accepted values.
    """
    assert tail in ['both', 'left',
                    'right'], 'tail should be one of "both", "left", "right"'

    if data2 is None:
        # Single population
        n1 = len(data1)
        se = std(data1) / np.sqrt(n1)
        t_val = (mean(data1) - mu) / se
        df = n1 - 1
    else:
        # Two populations
        n1 = len(data1)
        n2 = len(data2)
        mean_diff = mean(data1) - mean(data2)
        sample1_var = variance(data1)
        sample2_var = variance(data2)
        if equal:
            # Equal variances: pooled standard deviation.
            df = n1 + n2 - 2
            sw = np.sqrt(
                ((n1 - 1) * sample1_var + (n2 - 1) * sample2_var) / df)
            # Each sample contributes its own 1/n term; using n2 twice
            # gives a wrong statistic whenever the sizes differ.
            se = sw * np.sqrt(1 / n1 + 1 / n2)
        else:
            # Unequal variances: fractional degrees of freedom.
            part1 = sample1_var / n1
            part2 = sample2_var / n2
            se = np.sqrt(part1 + part2)
            df = (part1 + part2)**2 / (
                part1**2 / (n1 - 1) + part2**2 / (n2 - 1))
        t_val = (mean_diff - mu) / se

    # The survival function keeps precision for very small p-values.
    if tail == 'both':
        p = 2 * t.sf(abs(t_val), df)
    elif tail == 'left':
        p = t.cdf(t_val, df)
    else:
        p = t.sf(t_val, df)

    return round(t_val, 2), round(df, 2), p
예제 #13
0
def simple_linear_reg(x, y):
    assert len(x) == len(y)
    n = len(x)
    assert n > 1

    mean_x = mean(x)
    mean_y = mean(y)

    beta1 = covariance(x, y)/variance(x)
    beta0 = mean_y-beta1*mean_x

    y_hat = [beta0+beta1*e for e in x]
    ss_residual = sum((e1-e2)**2 for e1, e2 in zip(y, y_hat))
    se_model = sqrt(ss_residual/(n-2))

    t_value = beta1/(se_model/sqrt((n-1)*variance(x)))
    p = 2*(1-t.cdf(abs(t_value), n-2))

    return beta0, beta1, t_value, n-2, p
예제 #14
0
def mean_diff_ci_t_est(data1, data2, alpha, equal=True):
    """Confidence interval for a difference of means, variances unknown.

    With ``equal`` true the two samples are pooled and the t critical
    value uses ``n1 + n2 - 2`` degrees of freedom; otherwise each sample
    keeps its own variance and the degrees of freedom are fractional.
    Both bounds are rounded to two decimals and returned as
    ``(lower, upper)``.
    """
    size1, size2 = len(data1), len(data2)
    centre = mean(data1) - mean(data2)
    var1 = variance(data1)
    var2 = variance(data2)

    if not equal:
        # Variances unknown and not assumed equal
        dof = (var1/size1 + var2/size2)**2 / (
            (var1/size1)**2 / (size1-1) + (var2/size2)**2 / (size2-1))
        critical = abs(t.ppf(alpha/2, dof))
        margin = np.sqrt(var1/size1 + var2/size2) * critical
    else:
        # Variances unknown but assumed equal
        pooled_sd = np.sqrt(
            ((size1-1)*var1 + (size2-1)*var2) / (size1+size2-2))
        critical = abs(t.ppf(alpha/2, size1+size2-2))
        margin = pooled_sd*np.sqrt(1/size1+1/size2)*critical

    return round(centre - margin, 2), round(centre + margin, 2)
예제 #15
0
def mean_diff_ci_t_est(data1, data2, alpha, equal=True):
    """Confidence interval for a difference of means, variances unknown.

    ``equal`` chooses between the pooled-variance interval and the
    unequal-variance interval with fractional degrees of freedom.
    Returns the unrounded ``(lower, upper)`` bounds.
    """
    size1 = len(data1)
    size2 = len(data2)
    centre = mean(data1)-mean(data2)

    var1 = variance(data1)
    var2 = variance(data2)

    if not equal:
        part1 = var1/size1
        part2 = var2/size2
        dof = (part1+part2)**2 / (part1**2/(size1-1)+part2**2/(size2-1))
        critical = abs(t.ppf(alpha/2, dof))
        margin = sqrt(part1+part2)*critical
        return centre - margin, centre + margin

    dof = size1+size2-2
    pooled_sd = sqrt(((size1-1)*var1+(size2-1)*var2)/dof)
    critical = abs(t.ppf(alpha/2, dof))
    margin = pooled_sd*sqrt(1/size1+1/size2) * critical
    return centre - margin, centre + margin
예제 #16
0
def z_test(data1, data2=None, tail="both", mu=0, sigma1=1, sigma2=None):
    """Run a one-sample or two-sample z-test with known sigmas.

    With ``data2`` given, ``sigma2`` is required and the difference of
    the sample means is tested against ``mu``.  Returns the unrounded
    ``(z_val, p)`` for the requested ``tail``.
    """
    assert tail in ["both", "left",
                    "right"], 'tail should be one of "both","left","right"'

    if data2 is not None:
        assert sigma2 is not None
        centre = mean(data1)-mean(data2)
        std_err = sqrt(sigma1**2/len(data1)+sigma2**2/len(data2))
    else:
        centre = mean(data1)
        std_err = sigma1/sqrt(len(data1))
    z_val = (centre-mu)/std_err

    if tail == "left":
        p = norm.cdf(z_val)
    elif tail == "both":
        p = 2*(1-norm.cdf(abs(z_val)))
    else:
        p = 1-norm.cdf(z_val)
    return z_val, p
예제 #17
0
def mean_ci_est(data, alpha, sigma=None):  # confidence interval
    """Two-sided confidence interval for the mean.

    Uses ``std(data)`` with a t critical value when ``sigma`` is ``None``
    and ``sigma`` with a normal critical value otherwise.  Returns
    ``(lower, upper)``.
    """
    size = len(data)
    centre = mean(data)

    if sigma is not None:
        std_err = sigma/sqrt(size)
        # ppf returns the lower quantile, hence the abs()
        critical = abs(norm.ppf(alpha / 2))
    else:
        std_err = std(data)/sqrt(size)
        critical = abs(t.ppf(alpha/2,size-1))
    return centre - std_err * critical, centre + std_err * critical
예제 #18
0
def mean_ci_est(data, alpha, sigma=None):
    """Two-sided confidence interval for the mean.

    ``sigma=None`` selects the t-based interval built from ``std(data)``;
    a given ``sigma`` selects the normal-based interval.  Returns
    ``(lower, upper)``.
    """
    size = len(data)
    centre = mean(data)

    if sigma is not None:
        # Known sigma
        std_err = sigma/sqrt(size)
        margin = std_err * abs(norm.ppf(alpha/2))
        return centre - margin, centre + margin

    # Unknown sigma
    std_err = std(data)/sqrt(size)
    margin = std_err * abs(t.ppf(alpha/2, size-1))
    return centre - margin, centre + margin
예제 #19
0
def mean_ci_est(data, alpha, sigma=None):
    """Interval estimate of the mean, rounded to two decimals.

    ``sigma=None`` uses ``std(data)`` and a t critical value on ``n - 1``
    degrees of freedom; otherwise ``sigma`` and a normal critical value
    are used.  Returns ``(lower, upper)``.
    """
    size = len(data)
    centre = mean(data)

    if sigma is None:
        # Unknown sigma
        scale = std(data) / np.sqrt(size)
        critical = abs(t.ppf(alpha/2, size-1))
    else:
        # Known sigma
        scale = sigma / np.sqrt(size)
        critical = abs(norm.ppf(alpha/2))

    lower = round(centre - scale * critical, 2)
    upper = round(centre + scale * critical, 2)
    return lower, upper
def mean_ci_est(data, alpha, sigma=None):
    """Two-sided confidence interval for the mean.

    Args:
        data: the sample.
        alpha: significance level; the critical value is taken at
            ``alpha / 2``, so the interval has confidence ``1 - alpha``.
        sigma: known population standard deviation, or ``None`` to fall
            back on ``std(data)`` with a t critical value.

    Returns:
        ``(lower, upper)`` bounds of the interval.
    """
    n = len(data)  # sample size
    sample_mean = mean(data)

    if sigma is None:
        # Unknown sigma: t critical value on n - 1 degrees of freedom.
        se = std(data) / sqrt(n)
        critical = abs(t.ppf(alpha / 2, n - 1))
    else:
        # Known sigma.  The divisor is sqrt(n); dividing by the bare
        # ``sqrt`` function raises TypeError.
        se = sigma / sqrt(n)
        # ppf returns the lower-tail quantile, hence the abs().
        critical = abs(norm.ppf(alpha / 2))
    return sample_mean - se * critical, sample_mean + se * critical
예제 #21
0
def variance_bias(data):
    """Biased variance (the divisor is n).

    Returns ``None`` when ``data`` has fewer than two elements.
    """
    count=len(data)
    if count<=1:
        return None
    centre=mean(data)
    total=0
    for e in data:
        total+=(e-centre)**2
    return total/count

def sample(num_of_samples,sample_sz,var):
    '''
    Draw ``num_of_samples`` samples of ``sample_sz`` uniform(0, 1) values
    each and return the list of ``var(sample)`` results.
    '''
    return [
        var([random.uniform(0.0,1.0) for _ in range(sample_sz)])
        for _ in range(num_of_samples)
    ]

if __name__ == '__main__':
    # For the uniform distribution used by ``sample`` (random.uniform on
    # [0, 1]) the population variance is (b - a)^2 / 12 = 1/12.
    # First pass: biased estimator; second pass: ``variance``.
    for label, estimator in (("bias: ", variance_bias), ("unbias: ", variance)):
        estimates = sample(1000, 40, estimator)
        average = mean(estimates)
        plt.hist(estimates, bins="auto", rwidth=0.8)
        plt.axvline(x=average, c='black')   # mean of the simulated estimates
        plt.axvline(x=1/12, c='red')        # theoretical population variance
        print(label, average, 1/12)
        plt.show()
        总体均值未知,求方差的置信空间
        data为传入的样本; alpha为需要传入的置信水平的值
    """
    n = len(data)  #求样本容量
    s2 = variance(data)  #求样本方差

    chi2_lower_value = chi2.ppf(
        alpha / 2, n - 1)  #求坐标左侧Z面积,没错你没看错,因为数学证明的过程中是以右侧为基准的,但是scipy是以左侧为基准的
    chi2_upper_value = chi2.ppf(1 - alpha / 2, n - 1)  #求坐标右侧Z面积
    return (n - 1) * s2 / chi2_upper_value, (n - 1) * s2 / chi2_lower_value


if __name__ == '__main__':
    # Monthly income data for the 18-year-old and 35-year-old groups
    salary_18 = [1484, 785, 1598, 1366, 1716, 1020, 1716, 785, 3113, 1601]
    salary_35 = [902, 4508, 3809, 3923, 4276, 2065, 1601, 553, 3345, 2182]

    # Point estimate and interval estimate of the mean monthly income
    for salaries in (salary_18, salary_35):
        print(mean(salaries))
        print(mean_ci_est(salaries, 0.05))
    print()

    # std(), variance() and the interval estimate of the variance
    for salaries in (salary_18, salary_35):
        print(std(salaries))
        print(variance(salaries))
        print(var_ci_est(salaries, 0.05))
    print()
예제 #23
0
def variance_bias(data):
    """Biased variance: mean of the squared deviations from ``mean(data)``.

    Returns ``None`` for ``None`` input or fewer than two elements.
    """
    if data is not None and len(data) > 1:
        centre = mean(data)
        deviations = [(e - centre)**2 for e in data]
        return sum(deviations) / len(data)
    return None


def sample(num_of_samples, sample_sz, var):
    """Return ``var(sample)`` for ``num_of_samples`` uniform(0, 1) samples.

    Each sample holds ``sample_sz`` values drawn with ``random.uniform``.
    """
    def draw():
        return [random.uniform(0.0, 1.0) for _ in range(sample_sz)]

    return [var(draw()) for _ in range(num_of_samples)]


if __name__ == '__main__':
    # Simulate both estimators first, then draw them side by side.
    data1 = sample(1000, 40, variance_bias)
    data2 = sample(1000, 40, variance)

    for position, estimates in ((121, data1), (122, data2)):
        plt.subplot(position)
        plt.hist(estimates, bins="auto", rwidth=0.8)
        plt.axvline(x=mean(estimates), c='y')  # mean of the estimates
        plt.axvline(x=1 / 12, c='r')           # reference line at 1/12
    plt.show()
예제 #24
0
from playStats.descriptive_stats import std

if __name__ == '__main__':
    data = [2, 2, 2, 1, 1, 1, 3, 3]

    # Exercise each descriptive statistic in turn and print its result:
    # frequency, mode, median, mean, range, quartiles, variance, std.
    checks = (frequency, mode, median, mean, rng, quartile, variance, std)
    for check in checks:
        print(check(data))


예제 #25
0
def sample(num_of_samples, sample_sz):
    """Return the means of ``num_of_samples`` uniform(0, 1) samples.

    Each sample consists of ``sample_sz`` values from ``random.uniform``.
    """
    means = (
        mean([random.uniform(0.0,1.0) for _ in range(sample_sz)])
        for _ in range(num_of_samples)
    )
    return list(means)
예제 #26
0
import random
import matplotlib.pyplot as plt
from playStats.descriptive_stats import mean

def sample(num_of_samples, sample_sz):
    """Simulate sample means from the uniform distribution on [0, 1].

    Draws ``sample_sz`` values per sample, ``num_of_samples`` times, and
    returns the list of per-sample means.
    """
    def one_mean(_):
        return mean([random.uniform(0.0,1.0) for _ in range(sample_sz)])

    return list(map(one_mean, range(num_of_samples)))

if __name__ == "__main__":
    # Histogram of 1000 sample means (40 values per sample), with a
    # vertical line at the mean of those sample means.
    sample_means = sample(1000, 40)
    plt.hist(sample_means, bins = 'auto', rwidth = 0.8)
    centre = mean(sample_means)
    plt.axvline(x=centre, c = 'red')
    plt.show()
예제 #27
0
def variance_bias(data):
    """Biased variance: squared deviations from the mean, divided by n.

    Returns ``None`` when ``data`` has fewer than two elements.
    """
    count = len(data)
    if count <= 1:
        return None
    centre = mean(data)
    return sum(map(lambda e: (e - centre)**2, data)) / count


def sample(num_of_samples, sample_sz, var):
    """Apply ``var`` to ``num_of_samples`` uniform(0, 1) samples.

    Each sample holds ``sample_sz`` values; the results are returned as a
    list in the order the samples were drawn.
    """
    return list(
        map(lambda _: var([random.uniform(0.0, 1.0)
                           for _ in range(sample_sz)]),
            range(num_of_samples)))


if __name__ == "__main__":
    # Compare the average of each estimator over 1000 simulated samples
    # (40 values each) against the reference value 1/12.
    runs = (("bias :", variance_bias), ("unbias :", variance))
    for label, estimator in runs:
        estimates = sample(1000, 40, estimator)
        average = mean(estimates)
        plt.hist(estimates, bins="auto", rwidth=0.8)
        plt.axvline(x=average, c='black')
        plt.axvline(x=1 / 12, c='red')
        print(label, average, 1 / 12)
        plt.show()
예제 #28
0
import random
from playStats.descriptive_stats import mean, variance
import matplotlib.pyplot as plt

if __name__ == "__main__":
    # For growing sample sizes, draw standard-normal samples and track
    # the sample mean and the sample variance.
    indices = list(range(20, 10001, 50))
    sample_means = []
    sample_vars = []
    for sz in indices:
        draws = [random.gauss(0.0, 1.0) for _ in range(sz)]
        sample_means.append(mean(draws))
        sample_vars.append(variance(draws))
    plt.plot(indices, sample_means)
    plt.plot(indices, sample_vars)
    plt.show()
예제 #29
0
import random
from playStats.descriptive_stats import mean
from playStats.descriptive_stats import variance
import matplotlib.pyplot as plt

if __name__ == '__main__':

    # One (mean, variance) pair per sample size, from gauss(0, 1) draws.
    indices = list(range(20, 10001, 50))
    stats = []
    for sample_sz in indices:
        draws = [random.gauss(0.0, 1.0) for _ in range(sample_sz)]
        stats.append((mean(draws), variance(draws)))
    data_mean = [pair[0] for pair in stats]
    data_varvance = [pair[1] for pair in stats]

    # Sample means with a reference line at 0
    plt.plot(indices, data_mean)
    plt.axhline(0, c='r')

    # Sample variances with a reference line at 1
    plt.plot(indices, data_varvance)
    plt.axhline(1, c='b')

    plt.show()
예제 #30
0
    if n <= 1:
        return None
    mean_value = mean(data)
    return sum((e - mean_value)**2 for e in data) / n


def sample(num_of_samples, sample_sz, var):
    """Return ``var`` evaluated on repeated uniform(0, 1) samples.

    ``num_of_samples`` samples of ``sample_sz`` values each are drawn
    with ``random.uniform`` and passed to ``var`` one at a time.
    """
    results = []
    for _ in range(num_of_samples):
        draws = [random.uniform(0.0, 1.0) for _ in range(sample_sz)]
        results += [var(draws)]
    return results


if __name__ == "__main__":
    # Variance of the uniform(0, 1) population: (1 - 0)**2 / 12.
    # The biased estimator runs first, then ``variance``.
    for label, estimator in (("bias: ", variance_bias),
                             ("unbias: ", variance)):
        estimates = sample(1000, 40, estimator)
        average = mean(estimates)
        plt.hist(estimates, bins="auto", rwidth=0.8)
        plt.axvline(x=average, c="black")
        plt.axvline(x=1 / 12, c="red")
        print(label, average, 1 / 12)  # simulated value vs. theoretical value
        plt.show()