示例#1
0
 def test_evaluate_equal_dim_and_num_lt(self):
     """Evaluate a four-sample estimator at three of its own sample points."""
     samples = np.arange(3, 10, 2)
     query = np.arange(3, 8, 2)
     expected = [0.08797252, 0.11774109, 0.11774109]
     density = mlab.GaussianKDE(samples).evaluate(query)
     np.testing.assert_array_almost_equal(density, expected, 7)
示例#2
0
 def test_wrong_bw_method(self):
     """Check that an unrecognised ``bw_method`` string raises ValueError."""
     np.random.seed(8765678)
     sample_count = 50
     samples = np.random.randn(sample_count)
     with pytest.raises(ValueError):
         mlab.GaussianKDE(samples, bw_method="invalid")
示例#3
0
 def test_kde_integer_input(self):
     """Regression test for #1181: build and evaluate on an integer array."""
     samples = np.arange(5)
     expected = [
         0.13480721, 0.18222869, 0.19514935, 0.18222869, 0.13480721,
     ]
     estimator = mlab.GaussianKDE(samples)
     np.testing.assert_array_almost_equal(
         estimator(samples), expected, decimal=6)
示例#4
0
 def test_evaluate_dim_and_num(self):
     """Evaluate the estimator at one point supplied as a length-1 array."""
     samples = np.arange(3, 10, 2)
     single_point = np.array([3])
     expected = [0.08797252]
     estimator = mlab.GaussianKDE(samples)
     np.testing.assert_array_almost_equal(
         estimator.evaluate(single_point), expected, 7)
示例#5
0
    def fit_density(self, X):
        """Fit a density model to ``X`` and store it in ``self.model``.

        Two model kinds are handled:

        * ``'linear_density'``: a linear interpolator through the
          (value, occurrence count) pairs of ``X``, extrapolating outside the
          observed range.
        * ``'kde'``: a Gaussian kernel density estimate using Scott's rule.

        If neither condition holds, ``self.model`` is left untouched.

        NOTE(review): the first branch tests the bare name
        ``frequency_interpolation_type`` while the second tests
        ``self.interpolation_type`` -- confirm that the two different
        sources are intentional.
        """
        if frequency_interpolation_type == 'linear_density':
            # Count once: besides avoiding duplicate work, this keeps keys
            # and values paired even when X can only be iterated a single
            # time.
            counts = Counter(X)
            self.model = scipy.interpolate.interp1d(list(counts.keys()),
                                                    list(counts.values()),
                                                    fill_value="extrapolate")

        if self.interpolation_type == 'kde':
            self.model = mlab.GaussianKDE(X, 'scott')
示例#6
0
    def test_scalar_covariance_dataset(self):
        """A scalar ``bw_method`` must come back from ``covariance_factor``."""
        np.random.seed(8765678)
        sample_count = 50
        dataset = [np.random.randn(sample_count) for _ in range(5)]

        estimator = mlab.GaussianKDE(dataset, bw_method=0.5)
        factor = estimator.covariance_factor()
        assert factor == 0.5
示例#7
0
    def test_callable_singledim_dataset(self):
        """Check the 'silverman' covariance factor for a 1-D array of draws."""
        np.random.seed(8765678)
        sample_count = 50
        samples = np.random.randn(sample_count)

        expected_factor = 0.48438841363348911
        estimator = mlab.GaussianKDE(samples, bw_method='silverman')
        assert_almost_equal(estimator.covariance_factor(), expected_factor, 7)
示例#8
0
    def test_callable_covariance_dataset(self):
        """A callable ``bw_method`` supplies the covariance factor itself."""
        np.random.seed(8765678)
        sample_count = 50
        dataset = [np.random.randn(sample_count) for _ in range(5)]

        def constant_bandwidth(_estimator):
            # Ignores its argument and always answers with the same factor.
            return 0.55

        estimator = mlab.GaussianKDE(dataset, bw_method=constant_bandwidth)
        factor = estimator.covariance_factor()
        assert factor == 0.55
示例#9
0
    def test_kde_bandwidth_method(self):
        """Three ways of choosing the bandwidth must give the same density.

        The estimator is built with the default bandwidth, with the 'scott'
        method name, and with a scalar equal to the factor chosen by the
        default estimator; the test expects all three to evaluate to the same
        values on a common grid.
        """
        np.random.seed(8765678)
        n_basesample = 50
        xn = np.random.randn(n_basesample)

        # Default
        gkde = mlab.GaussianKDE(xn)
        # Supply the method by name
        gkde2 = mlab.GaussianKDE(xn, 'scott')
        # Supply a scalar: the factor the default estimator ended up with
        gkde3 = mlab.GaussianKDE(xn, bw_method=gkde.factor)

        xs = np.linspace(-7, 7, 51)
        kdepdf = gkde.evaluate(xs)
        kdepdf2 = gkde2.evaluate(xs)
        # Compare element-wise.  ``a.all() == b.all()`` only compares two
        # booleans and would pass for completely different densities.
        np.testing.assert_array_almost_equal(kdepdf, kdepdf2)
        kdepdf3 = gkde3.evaluate(xs)
        np.testing.assert_array_almost_equal(kdepdf, kdepdf3)
示例#10
0
def pWH(attributes, players, players_group_name, save_name):
    '''
    Plot each attribute against height and weight for one group of players.

    The figure has one row per attribute and two columns (height, weight).
    Each panel shows the mean attribute value per height/weight as a band of
    thin bars whose opacity follows a Gaussian KDE of that height/weight over
    ``players``.  The figure is saved as
    ``./graph/<save_name> of <players_group_name>.jpg``.

    attributes: column names to plot: list
    players: players of a specific group: DataFrame
    players_group_name: group name used in the titles and file name: str
    save_name: leading part of the saved file name: str
    '''
    items = attributes
    ply = players
    pln = players_group_name
    itemn = save_name
    assert isinstance(items, list)
    assert isinstance(ply, pd.DataFrame)
    assert isinstance(pln, str)
    assert isinstance(itemn, str)
    # Average only the requested columns.  A bare ``groupby(...).mean()``
    # averages every column of the frame and fails on non-numeric ones with
    # recent pandas versions.
    wanted = list(dict.fromkeys(items))
    x = [
        ply.groupby('Hight (cm)')[wanted].mean(),
        ply.groupby('Weight')[wanted].mean()
    ]
    l = len(items)
    n_points = 500  # number of x positions; one bar is drawn per interval
    fig, axs = subplots(l, 2, figsize=(12, 6 * l))
    for j, item in enumerate(items):
        for i in range(2):
            x1 = x[i].index.values
            y1 = x[i][item].values
            xv = np.linspace(min(x1), max(x1), n_points)
            fl = interpolate.interp1d(x1, y1)
            xname = x[i].index.name
            dx = ply[xname]
            normal = mlab.GaussianKDE(dx)(xv)
            # Lift the density by half its peak so that no bar becomes fully
            # transparent.
            normal = normal + max(normal) / 2
            # Opacity per bar, computed once instead of once per rectangle.
            alphas = normal / max(normal)
            heights = fl(xv)
            # ``subplots`` hands back a 1-D array when there is a single row.
            ax = axs[j, i] if l > 1 else axs[i]
            d = xv[1] - xv[0]
            for k in range(n_points - 1):
                ax.add_patch(
                    patches.Rectangle((xv[k], 0),
                                      d,
                                      heights[k],
                                      color='b',
                                      linewidth=0,
                                      alpha=alphas[k]))
            ax.set_xlabel(xname + ' (lb)' * (xname == 'Weight'),
                          fontsize='x-large')
            ax.set_ylabel(item, fontsize='x-large')
            # The column name carries a typo; show the proper word in titles.
            if xname == 'Hight (cm)':
                xname = 'Height'
            ax.set_title(item + ' vs ' + xname + ' for ' + pln,
                         fontsize='x-large')
            ax.grid(True)
            ax.set_ylim([0, 100])
            ax.set_xlim([min(xv), max(xv)])
    fig.savefig('./graph/' + itemn + ' of ' + pln + '.jpg')
示例#11
0
 def test_evaluate_inv_dim(self):
     """
     Evaluate a one-dimensional estimator at points with swapped axes.

     The dataset is a flat array, yet the points are passed as three rows of
     one value each ([[1], [2], [3]]); the call is expected to raise
     ValueError.
     """
     np.random.seed(8765678)
     sample_count = 50
     samples = np.random.randn(sample_count)
     estimator = mlab.GaussianKDE(samples)
     column_points = [[1], [2], [3]]
     with pytest.raises(ValueError):
         estimator.evaluate(column_points)
示例#12
0
 def test_evaluate_diff_dim(self):
     """
     Evaluate at more points (five) than the dataset has samples (four).
     """
     samples = np.arange(3, 10, 2)
     query = np.arange(3, 12, 2)
     expected = [
         0.08797252, 0.11774109, 0.11774109, 0.08797252, 0.0370153
     ]
     estimator = mlab.GaussianKDE(samples)
     np.testing.assert_array_almost_equal(
         estimator.evaluate(query), expected, 7)
示例#13
0
    def test_gaussian_kde_covariance_caching(self):
        """Evaluate a 'scott' estimator on a 5-point grid against stored values."""
        samples = np.array([-7, -5, 1, 4, 5], dtype=float)
        grid = np.linspace(-10, 10, num=5)
        # Reference values produced with scipy 0.10, before some changes to
        # gaussian_kde; they were never checked against an external source.
        expected = [
            0.02463386, 0.04689208, 0.05395444, 0.05337754, 0.01664475,
        ]

        # Request the default bandwidth explicitly.
        estimator = mlab.GaussianKDE(samples, 'scott')
        evaluated = estimator(grid)

        np.testing.assert_array_almost_equal(expected, evaluated, decimal=7)
示例#14
0
def draw_wage_of_top(top_number, max_of_x):
    '''
    Draw the wage distribution of the first ``top_number`` players per position.

    A stacked histogram of the 'Wage(K)' column is drawn for strikers,
    midfielders, defenders and goalkeepers, overlaid with one normal curve
    per position (fitted to that position's mean and standard deviation and
    scaled to player counts).  The data comes from the module-level
    ``strike_player``, ``midfield_player``, ``defend_player`` and
    ``goalkeep_player`` tables; "top" presumably relies on those tables
    being pre-sorted -- verify against the code that builds them.

    top_number: number of rows taken from each position table: int
    max_of_x: upper limit of the x axis: int
    '''
    h = top_number
    m = max_of_x
    assert isinstance(h, int)
    assert isinstance(m, int)
    item = 'Wage(K)'
    spi = strike_player.head(h)[item]
    mpi = midfield_player.head(h)[item]
    dpi = defend_player.head(h)[item]
    gpi = goalkeep_player.head(h)[item]
    # Common bin edges and curve grid spanning the wages of all four groups.
    x = pd.concat([dpi, spi, mpi, gpi])
    bins = np.linspace(x.min(), x.max(), 10)
    db = bins[1] - bins[0]
    x1 = np.linspace(x.min(), x.max(), 100)
    # The series order must match the labels: the last one is the
    # goalkeepers (it used to be the midfielders a second time).
    plt.hist([spi, mpi, dpi, gpi],
             bins=bins,
             rwidth=0.8,
             edgecolor='k',
             stacked=True,
             label=['Striker', 'Midfielder', 'Defender', 'GoalKeeper'],
             alpha=0.8)
    curves = (
        ('Striker', spi, 'b'),
        ('Midfielder', mpi, 'yellow'),
        ('Defender', dpi, 'lime'),
        ('GoalKeeper', gpi, 'red'),
    )
    for label, wages, colour in curves:
        # density * players * bin width = expected number of players per bin
        expected_counts = norm.pdf(x1, wages.mean(), wages.std()) * h * db
        plt.plot(x1, expected_counts, label=label, linewidth=3, color=colour)
    plt.grid(True)
    plt.xlabel(item, fontsize='x-large')
    plt.ylabel('Number of players', fontsize='x-large')
    plt.legend(loc='best')
    plt.title('Distribution of Wage' + ' of top' + str(h) + ' players',
              fontsize='x-large')
    plt.xlim([0, m])
示例#15
0
def plot_height_weight_BMI(attribute, players):
    '''
    Plot one attribute against height, weight and BMI.

    Three panels are drawn side by side.  Each shows the mean attribute value
    per height/weight/BMI as a band of thin bars whose opacity follows a
    Gaussian KDE of that quantity over ``players``.  The figure is saved as
    ``./graph/<attribute>.jpg``.

    attribute: name of the column to plot: str
    players: specific players: DataFrame
    '''
    item = attribute
    ply = players
    # Validate before the arguments are used.
    assert isinstance(item, str)
    assert isinstance(ply, pd.DataFrame)
    # Average only the requested column.  A bare ``groupby(...).mean()``
    # averages every column of the frame and fails on non-numeric ones with
    # recent pandas versions.
    x = [
        ply.groupby('Hight (cm)')[[item]].mean(),
        ply.groupby('Weight')[[item]].mean(),
        ply.groupby('BMI')[[item]].mean()
    ]
    n_points = 100  # number of x positions; one bar is drawn per interval
    fig, axs = subplots(1, 3, figsize=(20, 5))
    for i in range(3):
        x1 = x[i].index.values
        y1 = x[i][item].values
        xv = np.linspace(min(x1), max(x1), n_points)
        fl = interpolate.interp1d(x1, y1)
        xname = x[i].index.name
        dx = ply[xname]
        normal = mlab.GaussianKDE(dx)(xv)
        # Lift the density by half its peak so that no bar becomes fully
        # transparent.
        normal = normal + max(normal) / 2
        # Opacity per bar, computed once instead of once per rectangle.
        alphas = normal / max(normal)
        heights = fl(xv)
        ax = axs[i]
        d = xv[1] - xv[0]
        for j in range(n_points - 1):
            ax.add_patch(
                patches.Rectangle((xv[j], 0),
                                  d,
                                  heights[j],
                                  color='b',
                                  linewidth=0,
                                  alpha=alphas[j]))
        ax.set_xlabel(xname + ' (lb)' * (xname == 'Weight'),
                      fontsize='x-large')
        ax.set_ylabel(item, fontsize='x-large')
        # The column name carries a typo; show the proper word in the title.
        if xname == 'Hight (cm)':
            xname = 'Height'
        ax.set_title(item + ' vs ' + xname, fontsize='x-large')
        ax.grid(True)
        ax.set_ylim([0, 100])
        ax.set_xlim([min(xv), max(xv)])
    fig.savefig('./graph/' + item + '.jpg')
示例#16
0
def test_4():
    """Plot a score histogram with a fitted normal curve and a KDE curve.

    Reads one score column from a spreadsheet at a fixed path, prints the
    number of non-missing scores, draws a density-normalised histogram and
    overlays a normal curve (sample mean and standard deviation) and a
    Gaussian kernel density estimate, then shows the figure.
    """
    # Load the data
    data = pd.read_excel(
        'C:/Users/zhaozehui/PycharmProjects/DataAnalysis/venv/data/hengxiang/test1.xls'
    )

    datas = data['语文']
    print(datas.count())
    # Histogram of the scores
    plt.hist(
        datas,  # data to plot
        bins=np.arange(datas.min(), datas.max(), 3),  # bin edges 3 apart
        density=True,  # normalise to a density (``normed`` was removed)
        color='steelblue',  # fill colour
        align='left',
        edgecolor='k',  # bar outline colour
        label='直方图'  # legend label of the histogram
    )

    plt.title('班级成绩分布直方图')
    plt.xlabel('成绩')
    plt.ylabel('人数')

    # Shared x grid for both curves
    x1 = np.linspace(datas.min(), datas.max(), 1000)
    # Normal density written out with numpy; ``mlab.normpdf`` no longer
    # exists in current Matplotlib releases.
    mu = datas.mean()
    sigma = datas.std()
    normal = np.exp(-0.5 * ((x1 - mu) / sigma) ** 2) / (
        sigma * np.sqrt(2 * np.pi))
    line1, = plt.plot(x1, normal, 'r-', linewidth=2)

    # Kernel density curve
    kde = mlab.GaussianKDE(datas)
    line2, = plt.plot(x1, kde(x1), 'g-', linewidth=2)

    # Hide the ticks on the top and right edges; booleans are required, the
    # string 'off' is not treated as False by current Matplotlib.
    plt.tick_params(top=False, right=False)

    # Legend for the two curves
    plt.legend([line1, line2], ['正态分布曲线', '核密度曲线'], loc='best')

    # Show the figure
    plt.show()
示例#17
0
def single_attribute_distribution(attribute, unit=''):
    '''
    Plot how a single attribute is distributed over the players.

    Draws a stacked histogram of the attribute split by position, overlays a
    kernel density curve and a normal curve (both scaled to player counts)
    and saves the figure under ``./graph/``.

    attribute(str): name of the column to plot
    unit(str): text appended to the x-axis label
    '''
    assert isinstance(attribute, str)
    assert isinstance(unit, str)
    label = attribute
    values = player_attributes[label]
    lowest, highest = values.min(), values.max()
    edges = np.linspace(lowest, highest, 10)
    grid = np.linspace(lowest, highest, 100)
    bin_width = edges[1] - edges[0]
    # density * number of players * bin width = expected players per bin
    gaussian_counts = (norm.pdf(grid, values.mean(), values.std())
                       * values.count() * bin_width)
    density = mlab.GaussianKDE(values)
    per_position = [
        defend_player[label],
        midfield_player[label],
        strike_player[label],
        goalkeep_player[label],
    ]
    plt.hist(per_position,
             bins=edges,
             rwidth=0.8,
             edgecolor='k',
             stacked=True,
             label=['Defender', 'Midfielder', 'Striker', 'GoalKeeper'])
    plt.plot(grid,
             density(grid) * values.count() * bin_width,
             linewidth=3,
             label='Kernel density')
    plt.plot(grid, gaussian_counts, label='Normal distribution', linewidth=3)
    plt.grid(True)
    plt.xlabel(label + unit, fontsize='x-large')
    plt.ylabel('Number of players', fontsize='x-large')
    plt.legend(loc='best')
    # The height column name carries a typo; use the proper word from here on.
    shown_name = 'Height' if label == 'Hight (cm)' else label
    heading = 'Distribution of ' + shown_name
    plt.title(heading, fontsize='x-large')
    plt.savefig('./graph/' + heading + '.jpg')
示例#18
0
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab

# Use a font that contains CJK glyphs and keep the minus sign renderable.
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False
titanic = pd.read_csv('birthrate.csv')
# Rows without a 2008 value cannot be plotted.
titanic.dropna(subset=['2008'], inplace=True)

plt.style.use('ggplot')
# Density-normalised histogram with bin edges 3 apart.  ``density`` replaces
# the ``normed`` keyword, which current Matplotlib no longer accepts.
plt.hist(titanic['2008'],
         bins=np.arange(titanic['2008'].min(), titanic['2008'].max(), 3),
         density=True,
         color='steelblue',
         edgecolor='k')
plt.title('2008出生直方图和密度图')
plt.xlabel('出生率(‰)')
plt.ylabel('频率(‰)')
# Kernel density curve over the observed range.
kde = mlab.GaussianKDE(titanic['2008'])
x2 = np.linspace(titanic['2008'].min(), titanic['2008'].max(), 1000)
line2 = plt.plot(x2, kde(x2), 'g-', linewidth=2)
# Hide the ticks on the top and right edges; booleans are required, the
# string 'off' is not treated as False by current Matplotlib.
plt.tick_params(top=False, right=False)
plt.show()
示例#19
0
 def test_scott_multidim_dataset(self):
     """Scott's rule on a 3x3 grid of consecutive integers raises LinAlgError."""
     dataset = np.arange(1, 10).reshape(3, 3)
     with pytest.raises(np.linalg.LinAlgError):
         mlab.GaussianKDE(dataset, "scott")
示例#20
0
 def test_no_data(self):
     """Building a GaussianKDE from an empty dataset raises ValueError."""
     empty_dataset = []
     with pytest.raises(ValueError):
         mlab.GaussianKDE(empty_dataset)
示例#21
0
 def _kde_method(X, coords):
     """Evaluate a Gaussian KDE of ``X`` at ``coords``.

     When every entry of ``X`` equals the first one, no estimator is built;
     the result is 1.0 where ``coords`` equals that value and 0.0 elsewhere.
     """
     first = X[0]
     if not np.all(first == X):
         return mlab.GaussianKDE(X, bw_method).evaluate(coords)
     return (first == coords).astype(float)
示例#22
0
    resid,  # 绘图数据
    bins=100,  # 指定直方图条的个数
    normed=True,  # 设置为频率直方图
    color='steelblue',  # 指定填充色
    edgecolor='k')  # 指定直方图的边界色

# Axis label and title
plt.title('残差直方图')
plt.ylabel('密度值')
# Data for the normal curve.  The density is written out with numpy because
# ``mlab.normpdf`` no longer exists in current Matplotlib releases.
x1 = np.linspace(resid.min(), resid.max(), 1000)
resid_mu = resid.mean()
resid_sigma = resid.std()
normal = np.exp(-0.5 * ((x1 - resid_mu) / resid_sigma) ** 2) / (
    resid_sigma * np.sqrt(2 * np.pi))
# Draw the normal curve
plt.plot(x1, normal, 'r-', linewidth=2, label='正态分布曲线')
# Data for the kernel density curve
kde = mlab.GaussianKDE(resid)
x2 = np.linspace(resid.min(), resid.max(), 1000)
# Draw the kernel density curve
plt.plot(x2, kde(x2), 'k-', linewidth=2, label='核密度曲线')
# Hide the ticks on the top and right edges; booleans are required, the
# string 'off' is not treated as False by current Matplotlib.
plt.tick_params(top=False, right=False)
# Show the legend
plt.legend(loc='best')
# Show the figure
plt.show()
#<><>><><><><><><><><><><><><><><><>
# Normality check of the residuals (P-P plot and Q-Q plot)
pp_qq_plot = sm.ProbPlot(resid)
pp_qq_plot.ppplot(line='45')
plt.title('P-P图')
pp_qq_plot.qqplot(line='q')
示例#23
0
    if k % 2 ==0:
        # 25没有数据会有异常
        try:
            data_dic[k-1] = data_dic[k-1] + g_list
        except KeyError as e:
            data_dic[int(str(e))]=g_list
    # 如果为奇数插入字典
    else:
        data_dic[k] = g_list
# Header line of the stem-and-leaf output
print('黄彩思1704010135')
# Histogram bin edges: the dictionary keys in iteration order, closed by a
# final edge at 57.  (The per-key string that used to be assembled here was
# only consumed by a print call that is disabled, so it is no longer built.)
x = list(data_dic)
x.append(57)

# ``density`` replaces the ``normed`` keyword, which current Matplotlib no
# longer accepts.
plt.hist(last_data, bins=x, edgecolor='k', density=True)
# Kernel density curve over the observed range
kde = mlab.GaussianKDE(last_data)
x2 = np.linspace(min(last_data), max(last_data), 1000)
plt.plot(x2, kde(x2), 'g-', linewidth=2)
plt.xlabel('出生率')
plt.ylabel('频率')
plt.show()
示例#24
0
    ax.set_ylabel('Probability density')
    ax.set_xlabel('Similarity(The higher the more similar)')
    ax.set_title('Histogram of Similarity')
    plt.legend()
    plt.show()
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.hist(same, bins=bi, normed=True, alpha=0.8, label="Same person")
    ax.hist(diff, bins=bi, normed=True, alpha=0.8, label="Different person")
    ax.set_ylabel('Probability density')
    ax.set_xlabel('Similarity(The higher the more similar)')
    ax.set_title('Histogram of Similarity')
    x1 = np.linspace(min(diff), max(diff), 1000)
    normal = mlab.normpdf(x1, np.mean(diff), np.std(diff))
    line1, = plt.plot(x1, normal, 'r-', linewidth=2)
    kde = mlab.GaussianKDE(diff)
    x2 = np.linspace(min(diff), max(diff), 1000)
    line2, = plt.plot(x2, kde(x2), 'g-', linewidth=2)
    plt.legend(
        [line1, line2],
        ['normal', 'gussiankde'],
    )
    x3 = np.linspace(min(same), max(same), 1000)
    normal = mlab.normpdf(x3, np.mean(same), np.std(same))
    line3, = plt.plot(x3, normal, 'r-', linewidth=2)
    kde = mlab.GaussianKDE(same)
    x4 = np.linspace(min(same), max(same), 1000)
    line4, = plt.plot(x4, kde(x4), 'g-', linewidth=2)
    plt.legend([line1, line2], ['normal', 'gussiankde'], loc="best")
    plt.show()
示例#25
0
 def test_evaluate_point_dim_not_one(self):
     """Points given as two stacked rows must raise for a flat dataset."""
     samples = np.arange(3, 10, 2)
     two_row_points = [np.arange(3, 10, 2) for _ in range(2)]
     estimator = mlab.GaussianKDE(samples)
     with pytest.raises(ValueError):
         estimator.evaluate(two_row_points)
示例#26
0
 def test_scott_singledim_dataset(self):
     """Check the 'scott' covariance factor for a five-element flat array."""
     samples = np.array([-7, -5, 1, 4, 5])
     expected_factor = 0.72477966367769553
     estimator = mlab.GaussianKDE(samples, "scott")
     assert_almost_equal(estimator.covariance_factor(), expected_factor, 7)
示例#27
0
 def _kde_method(X, coords):
     """Evaluate a Gaussian KDE of ``X`` at ``coords``, tolerating constant data."""
     reference = X[0]
     is_constant = np.all(reference == X)
     if is_constant:
         # Degenerate input: every entry is the same value, so skip the
         # estimator and mark the coordinates that equal that value.
         matches = reference == coords
         return matches.astype(float)
     estimator = mlab.GaussianKDE(X, bw_method)
     return estimator.evaluate(coords)
示例#28
0
 def test_scalar_empty_dataset(self):
     """An empty dataset raises ValueError even with a scalar ``bw_method``."""
     empty_dataset = []
     with pytest.raises(ValueError):
         mlab.GaussianKDE(empty_dataset, bw_method=5)
示例#29
0
 def test_single_dataset_element(self):
     """A dataset holding just one value raises ValueError on construction."""
     lone_sample = [42]
     with pytest.raises(ValueError):
         mlab.GaussianKDE(lone_sample)
示例#30
0
 def test_silverman_singledim_dataset(self):
     """Check the 'silverman' covariance factor for a five-element flat array."""
     samples = np.array([-7, -5, 1, 4, 5])
     expected_factor = 0.76770389927475502
     estimator = mlab.GaussianKDE(samples, "silverman")
     assert_almost_equal(estimator.covariance_factor(), expected_factor, 7)