def test_evaluate_equal_dim_and_num_lt(self):
    """Evaluate a 1-D KDE at fewer points than the dataset contains."""
    dataset = np.arange(3, 10, 2)
    points = np.arange(3, 8, 2)
    expected = [0.08797252, 0.11774109, 0.11774109]

    density = mlab.GaussianKDE(dataset).evaluate(points)

    np.testing.assert_array_almost_equal(density, expected, 7)
def test_wrong_bw_method(self):
    """An unrecognised ``bw_method`` string must raise ValueError."""
    np.random.seed(8765678)
    sample = np.random.randn(50)

    with pytest.raises(ValueError):
        mlab.GaussianKDE(sample, bw_method="invalid")
def test_kde_integer_input(self):
    """Regression test for #1181: build and evaluate a KDE on integer data."""
    samples = np.arange(5)
    expected = [0.13480721, 0.18222869, 0.19514935, 0.18222869,
                0.13480721]

    estimator = mlab.GaussianKDE(samples)

    np.testing.assert_array_almost_equal(
        estimator(samples), expected, decimal=6)
def test_evaluate_dim_and_num(self):
    """Evaluate the KDE at a single point passed as a length-one array."""
    dataset = np.arange(3, 10, 2)
    single_point = np.array([3])
    expected = [0.08797252]

    density = mlab.GaussianKDE(dataset).evaluate(single_point)

    np.testing.assert_array_almost_equal(density, expected, 7)
def fit_density(self, X):
    """Fit ``self.model`` to the sample ``X``.

    Two independent checks are made, in this order; when both match, the
    KDE model assigned last is the one that remains on ``self.model``:

    * ``'linear_density'``: a linear interpolator over the value counts of
      ``X`` (distinct value -> number of occurrences), extrapolating
      outside the observed range.
    * ``'kde'``: a Gaussian kernel density estimate using Scott's rule.

    If neither check matches, ``self.model`` is left untouched.
    """
    # NOTE(review): this reads a bare ``frequency_interpolation_type`` name,
    # whereas the check below reads ``self.interpolation_type`` -- confirm
    # the name is defined at module level and that the difference is
    # intentional.
    if frequency_interpolation_type == 'linear_density':
        # Count once so keys and values are guaranteed to come from the
        # same pass over X (the data was previously counted twice).
        counts = Counter(X)
        self.model = scipy.interpolate.interp1d(
            list(counts.keys()),
            list(counts.values()),
            fill_value="extrapolate")
    if self.interpolation_type == 'kde':
        self.model = mlab.GaussianKDE(X, 'scott')
def test_scalar_covariance_dataset(self):
    """A scalar ``bw_method`` is reported back as the covariance factor."""
    np.random.seed(8765678)
    rows = [np.random.randn(50) for _ in range(5)]

    estimator = mlab.GaussianKDE(rows, bw_method=0.5)

    assert estimator.covariance_factor() == 0.5
def test_callable_singledim_dataset(self):
    """Check the 'silverman' covariance factor for a one-dimensional sample."""
    np.random.seed(8765678)
    sample = np.random.randn(50)
    expected_factor = 0.48438841363348911

    estimator = mlab.GaussianKDE(sample, bw_method='silverman')

    assert_almost_equal(estimator.covariance_factor(), expected_factor, 7)
def test_callable_covariance_dataset(self):
    """A callable ``bw_method`` provides the covariance factor (five-row data)."""
    np.random.seed(8765678)
    rows = [np.random.randn(50) for _ in range(5)]

    # Constant-valued callable; its argument is ignored.
    constant_factor = lambda _unused: 0.55  # noqa: E731

    estimator = mlab.GaussianKDE(rows, bw_method=constant_factor)

    assert estimator.covariance_factor() == 0.55
def test_kde_bandwidth_method(self):
    """Three ways of specifying the same bandwidth must give the same density.

    The estimator is built with the default bandwidth, with the ``'scott'``
    method name, and with the default estimator's own ``factor`` passed as a
    scalar; all three are evaluated on the same grid and compared.
    """
    np.random.seed(8765678)
    n_basesample = 50
    xn = np.random.randn(n_basesample)

    # Default
    gkde = mlab.GaussianKDE(xn)
    # Supply the method by name
    gkde2 = mlab.GaussianKDE(xn, 'scott')
    # Supply a scalar
    gkde3 = mlab.GaussianKDE(xn, bw_method=gkde.factor)

    xs = np.linspace(-7, 7, 51)
    kdepdf = gkde.evaluate(xs)
    kdepdf2 = gkde2.evaluate(xs)
    kdepdf3 = gkde3.evaluate(xs)

    # Compare the arrays element-wise.  ``a.all() == b.all()`` only compared
    # two booleans and passed for any pair of all-non-zero arrays.
    np.testing.assert_array_almost_equal(kdepdf, kdepdf2)
    np.testing.assert_array_almost_equal(kdepdf, kdepdf3)
def pWH(attributes, players, players_group_name, save_name):
    '''
    Plot each attribute against height and weight and save the figure.

    One row of two axes is drawn per attribute (left: height, right:
    weight).  Each axis shows the group-mean attribute value as a filled
    profile whose opacity follows a Gaussian KDE of the players' height or
    weight.  The figure is written to
    ``./graph/<save_name> of <players_group_name>.jpg``.

    attributes: attributes to plot:list
    players: players of specific group:DataFrame
    players_group_name: group name of players:str
    save_name: name of saved graph:str
    '''
    items = attributes
    ply = players
    pln = players_group_name
    itemn = save_name
    assert isinstance(items, list)
    assert isinstance(ply, pd.DataFrame)
    assert isinstance(pln, str)
    assert isinstance(itemn, str)

    # Mean of every column per distinct height / weight value.
    x = [ply.groupby('Hight (cm)').mean(), ply.groupby('Weight').mean()]
    l = len(items)
    n_samples = 500  # points per curve; n_samples - 1 rectangles are drawn
    fig, axs = subplots(l, 2, figsize=(12, 6 * l))
    for j, item in enumerate(items):
        for i in range(2):
            x1 = x[i].index.values
            y1 = x[i][item].values
            xv = np.linspace(min(x1), max(x1), n_samples)
            fl = interpolate.interp1d(x1, y1)
            xname = x[i].index.name
            dx = ply[xname]

            # KDE lifted by half its peak so the sparsest regions keep a
            # visible opacity (alpha never drops below 1/3).
            normal = mlab.GaussianKDE(dx)(xv)
            normal = normal + max(normal) / 2
            # Normalise once instead of recomputing the peak per rectangle.
            alphas = normal / max(normal)

            # ``axs`` is indexed by row and column only when there is more
            # than one attribute row.
            if l > 1:
                ax = axs[j, i]
            else:
                ax = axs[i]

            d = xv[1] - xv[0]
            for k in range(n_samples - 1):
                ax.add_patch(
                    patches.Rectangle((xv[k], 0),
                                      d,
                                      fl(xv[k]),
                                      color='b',
                                      linewidth=0,
                                      alpha=alphas[k]))
            ax.set_xlabel(xname + ' (lb)' * (xname == 'Weight'),
                          fontsize='x-large')
            ax.set_ylabel(item, fontsize='x-large')
            # The column name is misspelled in the data; fix it for display.
            if xname == 'Hight (cm)':
                xname = 'Height'
            ax.set_title(item + ' vs ' + xname + ' for ' + pln,
                         fontsize='x-large')
            ax.grid(True)
            ax.set_ylim([0, 100])
            ax.set_xlim([min(xv), max(xv)])
    fig.savefig('./graph/' + itemn + ' of ' + pln + '.jpg')
def test_evaluate_inv_dim(self):
    """
    Evaluate with the point dimensions inverted.

    For a one-dimensional dataset such as [3, 2, 4], passing points shaped
    like [[3], [2], [4]] must raise ValueError.
    """
    np.random.seed(8765678)
    sample = np.random.randn(50)
    estimator = mlab.GaussianKDE(sample)
    column_points = [[1], [2], [3]]

    with pytest.raises(ValueError):
        estimator.evaluate(column_points)
def test_evaluate_diff_dim(self):
    """
    Evaluate a KDE built from four samples at five points, i.e. with a
    different number of points than dataset entries.
    """
    dataset = np.arange(3, 10, 2)
    points = np.arange(3, 12, 2)
    expected = [
        0.08797252, 0.11774109, 0.11774109, 0.08797252, 0.0370153
    ]

    density = mlab.GaussianKDE(dataset).evaluate(points)

    np.testing.assert_array_almost_equal(density, expected, 7)
def test_gaussian_kde_covariance_caching(self):
    """Compare a 'scott' KDE on a five-point grid with stored reference values."""
    samples = np.array([-7, -5, 1, 4, 5], dtype=float)
    grid = np.linspace(-10, 10, num=5)

    # Reference values produced by scipy 0.10, before some changes to
    # gaussian_kde.  They were not checked against an external reference.
    y_expected = [0.02463386, 0.04689208, 0.05395444, 0.05337754,
                  0.01664475]

    # Request Scott's rule explicitly.
    estimator = mlab.GaussianKDE(samples, 'scott')
    density = estimator(grid)

    np.testing.assert_array_almost_equal(y_expected, density, decimal=7)
def draw_wage_of_top(top_number, max_of_x):
    '''
    Draw the wage distribution of the top players of each position.

    Takes the first ``top_number`` rows of the striker, midfielder,
    defender and goalkeeper tables, draws a stacked histogram of their
    'Wage(K)' values and overlays one fitted normal curve per position,
    scaled to player counts.

    top_number: number of players taken from each position:int
    max_of_x: upper limit of the x axis:int
    '''
    h = top_number
    m = max_of_x
    assert isinstance(h, int)
    assert isinstance(m, int)

    item = 'Wage(K)'
    dph = defend_player.head(h)
    sph = strike_player.head(h)
    mph = midfield_player.head(h)
    gph = goalkeep_player.head(h)
    dpi = dph[item]
    spi = sph[item]
    mpi = mph[item]
    gpi = gph[item]

    # Shared bins and curve grid spanning the wages of all four groups.
    x = pd.concat([dph, sph, mph, gph])[item]
    bins = np.linspace(x.min(), x.max(), 10)
    db = bins[1] - bins[0]
    x1 = np.linspace(x.min(), x.max(), 100)

    # pdf * group size * bin width converts a density into expected counts
    # per bin, so the curves are comparable with the histogram.
    normals = norm.pdf(x1, spi.mean(), spi.std()) * h * db
    normalm = norm.pdf(x1, mpi.mean(), mpi.std()) * h * db
    normald = norm.pdf(x1, dpi.mean(), dpi.std()) * h * db
    normalg = norm.pdf(x1, gpi.mean(), gpi.std()) * h * db

    # The fourth series must be the goalkeepers; the midfielders used to be
    # passed twice, so the 'GoalKeeper' bars showed midfielder wages.
    plt.hist([spi, mpi, dpi, gpi],
             bins=bins,
             rwidth=0.8,
             edgecolor='k',
             stacked=True,
             label=['Striker', 'Midfielder', 'Defender', 'GoalKeeper'],
             alpha=0.8)
    plt.plot(x1, normals, label='Striker', linewidth=3, color='b')
    plt.plot(x1, normalm, label='Midfielder', linewidth=3, color='yellow')
    plt.plot(x1, normald, label='Defender', linewidth=3, color='lime')
    plt.plot(x1, normalg, label='GoalKeeper', linewidth=3, color='red')
    plt.grid(True)
    plt.xlabel(item, fontsize='x-large')
    plt.ylabel('Number of players', fontsize='x-large')
    plt.legend(loc='best')
    plt.title('Distribution of Wage' + ' of top' + str(h) + ' players',
              fontsize='x-large')
    plt.xlim([0, m])
def plot_height_weight_BMI(attribute, players):
    '''
    Plot one attribute against height, weight and BMI and save the figure.

    Three axes are drawn side by side.  Each shows the group-mean attribute
    value as a filled profile whose opacity follows a Gaussian KDE of the
    players' height, weight or BMI.  The figure is written to
    ``./graph/<attribute>.jpg``.

    attribute: attribute to plot:str
    players: specific players:DataFrame
    '''
    item = attribute
    ply = players
    # Validate before the arguments are used for grouping.
    assert isinstance(item, str)
    assert isinstance(ply, pd.DataFrame)

    # Mean of every column per distinct height / weight / BMI value.
    x = [
        ply.groupby('Hight (cm)').mean(),
        ply.groupby('Weight').mean(),
        ply.groupby('BMI').mean()
    ]
    n_samples = 100  # points per curve; n_samples - 1 rectangles are drawn
    fig, axs = subplots(1, 3, figsize=(20, 5))
    for i in range(3):
        x1 = x[i].index.values
        y1 = x[i][item].values
        xv = np.linspace(min(x1), max(x1), n_samples)
        fl = interpolate.interp1d(x1, y1)
        xname = x[i].index.name
        dx = ply[xname]

        # KDE lifted by half its peak so the sparsest regions keep a
        # visible opacity (alpha never drops below 1/3).
        normal = mlab.GaussianKDE(dx)(xv)
        normal = normal + max(normal) / 2
        # Normalise once instead of recomputing the peak per rectangle.
        alphas = normal / max(normal)

        ax = axs[i]
        d = xv[1] - xv[0]
        for j in range(n_samples - 1):
            ax.add_patch(
                patches.Rectangle((xv[j], 0),
                                  d,
                                  fl(xv[j]),
                                  color='b',
                                  linewidth=0,
                                  alpha=alphas[j]))
        ax.set_xlabel(xname + ' (lb)' * (xname == 'Weight'),
                      fontsize='x-large')
        ax.set_ylabel(item, fontsize='x-large')
        # The column name is misspelled in the data; fix it for display.
        if xname == 'Hight (cm)':
            xname = 'Height'
        ax.set_title(item + ' vs ' + xname, fontsize='x-large')
        ax.grid(True)
        ax.set_ylim([0, 100])
        ax.set_xlim([min(xv), max(xv)])
    fig.savefig('./graph/' + item + '.jpg')
def test_4():
    """Plot a density histogram of the '语文' column with two fitted curves.

    Reads the scores from a hard-coded Excel file, prints the column's
    ``count()``, draws a density-normalised histogram, overlays a normal
    curve (sample mean / standard deviation) and a Gaussian kernel density
    estimate, and shows the figure.
    """
    # Load the data
    data = pd.read_excel(
        'C:/Users/zhaozehui/PycharmProjects/DataAnalysis/venv/data/hengxiang/test1.xls'
    )
    datas = data['语文']
    print(datas.count())
    low, high = datas.min(), datas.max()

    # Histogram of the scores
    plt.hist(
        datas,  # data to plot
        bins=np.arange(low, high, 3),  # bin edges spaced 3 apart
        # ``normed`` was removed in Matplotlib 3.1; ``density`` replaces it.
        density=True,
        color='steelblue',  # fill colour
        align='left',
        edgecolor='k',  # bar edge colour
        label='直方图'  # legend label for the histogram
    )
    plt.title('班级成绩分布直方图')
    plt.xlabel('成绩')
    plt.ylabel('人数')

    # Normal curve.  ``mlab.normpdf`` was removed in Matplotlib 3.1, so the
    # Gaussian density is evaluated directly.
    x1 = np.linspace(low, high, 1000)
    mu, sigma = datas.mean(), datas.std()
    normal = (np.exp(-0.5 * ((x1 - mu) / sigma) ** 2)
              / (sigma * np.sqrt(2 * np.pi)))
    line1, = plt.plot(x1, normal, 'r-', linewidth=2)

    # Kernel density curve
    kde = mlab.GaussianKDE(datas)
    x2 = np.linspace(low, high, 1000)
    line2, = plt.plot(x2, kde(x2), 'g-', linewidth=2)

    # Hide the ticks on the top and right edges.  Booleans are required:
    # the string 'off' is truthy and no longer means "disabled".
    plt.tick_params(top=False, right=False)
    # Legend for the two curves
    plt.legend([line1, line2], ['正态分布曲线', '核密度曲线'], loc='best')
    # Show the figure
    plt.show()
def single_attribute_distribution(attribute, unit=''):
    '''
    Draw and save the distribution of a single player attribute.

    A stacked histogram (one series per position) is overlaid with a kernel
    density curve and a fitted normal curve, both scaled to player counts.
    The figure is saved as ``./graph/Distribution of <attribute>.jpg``,
    with 'Hight (cm)' shown as 'Height' in the title and file name.

    attribute(str): the attribute to plot
    unit(str): text appended to the attribute name on the x-axis label
    '''
    assert isinstance(attribute, str)
    assert isinstance(unit, str)

    values = player_attributes[attribute]
    lowest, highest = values.min(), values.max()
    bins = np.linspace(lowest, highest, 10)
    grid = np.linspace(lowest, highest, 100)
    n_players = values.count()
    bin_width = bins[1] - bins[0]

    # density * player count * bin width gives expected players per bin.
    normal_curve = (norm.pdf(grid, values.mean(), values.std())
                    * n_players * bin_width)
    estimator = mlab.GaussianKDE(values)
    kde_curve = estimator(grid) * n_players * bin_width

    per_position = [
        group[attribute]
        for group in (defend_player, midfield_player, strike_player,
                      goalkeep_player)
    ]
    plt.hist(per_position,
             bins=bins,
             rwidth=0.8,
             edgecolor='k',
             stacked=True,
             label=['Defender', 'Midfielder', 'Striker', 'GoalKeeper'])
    plt.plot(grid, kde_curve, linewidth=3, label='Kernel density')
    plt.plot(grid, normal_curve, label='Normal distribution', linewidth=3)
    plt.grid(True)
    plt.xlabel(attribute + unit, fontsize='x-large')
    plt.ylabel('Number of players', fontsize='x-large')
    plt.legend(loc='best')

    # The column name is misspelled in the data; fix it for display.
    shown_name = 'Height' if attribute == 'Hight (cm)' else attribute
    plt.title('Distribution of ' + shown_name, fontsize='x-large')
    plt.savefig('./graph/' + 'Distribution of ' + shown_name + '.jpg')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab

# Script: density histogram of the '2008' column of birthrate.csv with a
# Gaussian kernel density curve drawn on top.

# Use a font that contains CJK glyphs and keep the minus sign renderable.
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False

# Load the data and drop rows that have no 2008 value.
titanic = pd.read_csv('birthrate.csv')
titanic.dropna(subset=['2008'], inplace=True)

plt.style.use('ggplot')
plt.hist(titanic['2008'],
         # bin edges spaced 3 apart over the observed range
         bins=np.arange(titanic['2008'].min(), titanic['2008'].max(), 3),
         # ``normed`` was removed in Matplotlib 3.1; ``density`` replaces it.
         density=True,
         color='steelblue',
         edgecolor='k')
plt.title('2008出生直方图和密度图')
plt.xlabel('出生率(‰)')
plt.ylabel('频率(‰)')

# Kernel density curve over the same range.
kde = mlab.GaussianKDE(titanic['2008'])
x2 = np.linspace(titanic['2008'].min(), titanic['2008'].max(), 1000)
line2 = plt.plot(x2, kde(x2), 'g-', linewidth=2)

# Hide the ticks on the top and right edges.  Booleans are required: the
# string 'off' is truthy and no longer means "disabled".
plt.tick_params(top=False, right=False)
plt.show()
def test_scott_multidim_dataset(self):
    """Building a 'scott' KDE from this 3x3 integer grid raises LinAlgError."""
    grid = np.array([[1, 2, 3],
                     [4, 5, 6],
                     [7, 8, 9]])

    with pytest.raises(np.linalg.LinAlgError):
        mlab.GaussianKDE(grid, "scott")
def test_no_data(self):
    """Constructing GaussianKDE from an empty list must raise ValueError."""
    empty_dataset = []

    with pytest.raises(ValueError):
        mlab.GaussianKDE(empty_dataset)
def _kde_method(X, coords): if np.all(X[0] == X): return (X[0] == coords).astype(float) kde = mlab.GaussianKDE(X, bw_method) return kde.evaluate(coords)
resid, # 绘图数据 bins=100, # 指定直方图条的个数 normed=True, # 设置为频率直方图 color='steelblue', # 指定填充色 edgecolor='k') # 指定直方图的边界色 # 设置坐标轴标签和标题 plt.title('残差直方图') plt.ylabel('密度值') # 生成正态曲线的数据 x1 = np.linspace(resid.min(), resid.max(), 1000) normal = mlab.normpdf(x1, resid.mean(), resid.std()) # 绘制正态分布曲线 plt.plot(x1, normal, 'r-', linewidth=2, label='正态分布曲线') # 生成核密度曲线的数据 kde = mlab.GaussianKDE(resid) x2 = np.linspace(resid.min(), resid.max(), 1000) # 绘制核密度曲线 plt.plot(x2, kde(x2), 'k-', linewidth=2, label='核密度曲线') # 去除图形顶部边界和右边界的刻度 plt.tick_params(top='off', right='off') # 显示图例 plt.legend(loc='best') # 显示图形 plt.show() #<><>><><><><><><><><><><><><><><><> # 残差的正态性检验(PP图和QQ图法) pp_qq_plot = sm.ProbPlot(resid) pp_qq_plot.ppplot(line='45') plt.title('P-P图') pp_qq_plot.qqplot(line='q')
if k % 2 ==0: # 25没有数据会有异常 try: data_dic[k-1] = data_dic[k-1] + g_list except KeyError as e: data_dic[int(str(e))]=g_list # 如果为奇数插入字典 else: data_dic[k] = g_list # 格式化输出成茎叶图 print('黄彩思1704010135') # 遍历字典 x = [] for k,v in data_dic.items(): a = '' # 循环输出列表数据并拼接成字符串 for i in v: a = a + ' ' + str(i) # 格式化输出 # print(str(k).rjust(5), a) x.append(k) x.append(57) plt.hist(last_data, bins=x, edgecolor='k', normed=True) kde = mlab.GaussianKDE(last_data) x2 = np.linspace(min(last_data), max(last_data), 1000) plt.plot(x2, kde(x2), 'g-', linewidth=2) plt.xlabel('出生率') plt.ylabel('频率') plt.show()
ax.set_ylabel('Probability density')
ax.set_xlabel('Similarity(The higher the more similar)')
ax.set_title('Histogram of Similarity')
plt.legend()
plt.show()

# Second figure: the same two density histograms, each overlaid with a
# fitted normal curve (red) and a Gaussian kernel density estimate (green).
fig = plt.figure()
ax = fig.add_subplot(111)
# ``normed`` was removed in Matplotlib 3.1; ``density`` replaces it.
ax.hist(same, bins=bi, density=True, alpha=0.8, label="Same person")
ax.hist(diff, bins=bi, density=True, alpha=0.8, label="Different person")
ax.set_ylabel('Probability density')
ax.set_xlabel('Similarity(The higher the more similar)')
ax.set_title('Histogram of Similarity')

# Curves for the "different person" scores.  ``mlab.normpdf`` was removed
# in Matplotlib 3.1, so the Gaussian density is evaluated directly.
x1 = np.linspace(min(diff), max(diff), 1000)
mu, sigma = np.mean(diff), np.std(diff)
normal = (np.exp(-0.5 * ((x1 - mu) / sigma) ** 2)
          / (sigma * np.sqrt(2 * np.pi)))
line1, = plt.plot(x1, normal, 'r-', linewidth=2)
kde = mlab.GaussianKDE(diff)
x2 = np.linspace(min(diff), max(diff), 1000)
line2, = plt.plot(x2, kde(x2), 'g-', linewidth=2)

# Curves for the "same person" scores.
x3 = np.linspace(min(same), max(same), 1000)
mu, sigma = np.mean(same), np.std(same)
normal = (np.exp(-0.5 * ((x3 - mu) / sigma) ** 2)
          / (sigma * np.sqrt(2 * np.pi)))
line3, = plt.plot(x3, normal, 'r-', linewidth=2)
kde = mlab.GaussianKDE(same)
x4 = np.linspace(min(same), max(same), 1000)
line4, = plt.plot(x4, kde(x4), 'g-', linewidth=2)

# One legend entry per curve type; both score groups share the colours.
# (An earlier identical legend call was dropped: this one replaced it.)
plt.legend([line1, line2], ['normal', 'gussiankde'], loc="best")
plt.show()
def test_evaluate_point_dim_not_one(self):
    """Evaluating a 1-D KDE at two-row points must raise ValueError."""
    dataset = np.arange(3, 10, 2)
    two_row_points = [np.arange(3, 10, 2) for _ in range(2)]
    estimator = mlab.GaussianKDE(dataset)

    with pytest.raises(ValueError):
        estimator.evaluate(two_row_points)
def test_scott_singledim_dataset(self):
    """Check the 'scott' covariance factor for a five-element 1-D array."""
    samples = np.array([-7, -5, 1, 4, 5])
    expected_factor = 0.72477966367769553

    estimator = mlab.GaussianKDE(samples, "scott")

    assert_almost_equal(estimator.covariance_factor(), expected_factor, 7)
def _kde_method(X, coords): # fallback gracefully if the vector contains only one value if np.all(X[0] == X): return (X[0] == coords).astype(float) kde = mlab.GaussianKDE(X, bw_method) return kde.evaluate(coords)
def test_scalar_empty_dataset(self):
    """An empty dataset must raise ValueError even with a scalar ``bw_method``."""
    empty_dataset = []

    with pytest.raises(ValueError):
        mlab.GaussianKDE(empty_dataset, bw_method=5)
def test_single_dataset_element(self):
    """A dataset holding a single element must raise ValueError."""
    lone_sample = [42]

    with pytest.raises(ValueError):
        mlab.GaussianKDE(lone_sample)
def test_silverman_singledim_dataset(self):
    """Check the 'silverman' covariance factor for a five-element 1-D array."""
    samples = np.array([-7, -5, 1, 4, 5])
    expected_factor = 0.76770389927475502

    estimator = mlab.GaussianKDE(samples, "silverman")

    assert_almost_equal(estimator.covariance_factor(), expected_factor, 7)