def test_uni_kde():
    """Compare the custom sequential KDE against statsmodels' Gaussian KDE.

    Draws a bimodal Gaussian-mixture sample, evaluates both estimators on
    the statsmodels support grid, and asserts that the normalized RMS
    error between the two densities is below 1e-2.
    """
    # Seed so the sampled mixture (and hence the RMS check) is
    # reproducible; matches the seeding convention of the other KDE tests.
    np.random.seed(12345)
    samples = mixture_rvs([.25, .75], size=10000,
                          dist=[stats.norm, stats.norm],
                          kwargs=(dict(loc=-1, scale=.5),
                                  dict(loc=1, scale=.5)))
    bandwidth = approx_bandwidth(samples)
    # Run statsmodel for reference
    kde = sm.nonparametric.KDEUnivariate(samples)
    kde.fit(kernel="gau")
    # Reuse statsmodel support for our test
    support = kde.support
    # Run custom KDE
    pdf = np.zeros_like(support)
    uni_kde_seq(support, samples, bandwidth, pdf)
    # Check value
    expect = kde.density
    got = pdf
    rms = calc_rms(expect, got, norm=True)
    assert rms < 1e-2, "RMS error too high: {0}".format(rms)
def test_uni_kde():
    """Check the custom sequential KDE against the statsmodels reference.

    Both density estimates are evaluated on the statsmodels support grid
    and must agree to within a normalized RMS error of 1e-2.
    """
    data = mixture_rvs(
        [.25, .75],
        size=10000,
        dist=[stats.norm, stats.norm],
        kwargs=(dict(loc=-1, scale=.5), dict(loc=1, scale=.5)),
    )
    bw = approx_bandwidth(data)

    # Reference density from statsmodels.
    reference = sm.nonparametric.KDEUnivariate(data)
    reference.fit(kernel="gau")
    grid = reference.support

    # Custom KDE evaluated on the same grid.
    estimate = np.zeros_like(grid)
    uni_kde_seq(grid, data, bw, estimate)

    error = calc_rms(reference.density, estimate, norm=True)
    assert error < 1e-2, "RMS error too high: {0}".format(error)
def test_hsa_uni_kde_ver2():
    """Validate hsa_uni_kde_ver2 against the statsmodels Gaussian KDE.

    Uses a fixed seed, evaluates both estimators on the statsmodels
    support grid, and requires a normalized RMS error below 1e-2.
    """
    np.random.seed(12345)
    data = mixture_rvs(
        [0.25, 0.75],
        size=10000,
        dist=[stats.norm, stats.norm],
        kwargs=(dict(loc=-1, scale=0.5), dict(loc=1, scale=0.5)),
    )
    bw = approx_bandwidth(data)

    # Reference density from statsmodels (exact, non-FFT evaluation).
    reference = sm.nonparametric.KDEUnivariate(data)
    reference.fit(kernel="gau", fft=False)
    grid = reference.support

    # Custom HSA kernel evaluated on the statsmodels grid.
    estimate = np.zeros_like(grid)
    hsa_uni_kde_ver2(grid, data, bw, estimate)

    error = calc_rms(reference.density, estimate, norm=True)
    print("RMS", error)
    assert error < 1e-2, "RMS error too high: {0}".format(error)
def bimodal_samples(n):
    """Draw ``n`` samples from a bimodal mixture of two normals.

    The script in this function is taken from
    https://www.statsmodels.org/stable/examples/notebooks/generated/kernel_density.html
    """
    # Location, scale and weight for the two component distributions.
    loc_a, scale_a, w_a = -1, .4, .3
    loc_b, scale_b, w_b = 1, .5, .7

    # Sample from the weighted mixture of the two normals.
    return mixture_rvs(
        prob=[w_a, w_b],
        size=n,
        dist=[stats.norm, stats.norm],
        kwargs=(dict(loc=loc_a, scale=scale_a),
                dict(loc=loc_b, scale=scale_b)),
    )
def plot_uni_test():
    """Plot the custom sequential KDE next to a histogram and the
    statsmodels reference, writing the bokeh output to ``kde.html``.
    """
    data = mixture_rvs(
        [.25, .75],
        size=10000,
        dist=[stats.norm, stats.norm],
        kwargs=(dict(loc=-1, scale=.5), dict(loc=1, scale=.5)),
    )
    bw = approx_bandwidth(data)
    print('bandwidth', bw)
    print('size', data.size)

    grid = build_support(data, bw)
    density = np.zeros_like(grid)
    uni_kde_seq(grid, data, bw, density)

    # Plotting
    output_file("kde.html")

    fig_hist = figure(title="Hist")
    counts, edges = np.histogram(data, bins=50, density=True)
    fig_hist.quad(top=counts, bottom=0, left=edges[:-1], right=edges[1:])

    fig_kde = figure(title="KDE")
    fig_kde.line(grid, density)
    fig_kde.circle(x=grid, y=density, size=5)

    fig_sm = figure(title="KDE-SM")
    reference = sm.nonparametric.KDEUnivariate(data)
    reference.fit(kernel="gau")
    fig_sm.line(reference.support, reference.density)

    print(data.size)
    print(len(reference.support), len(reference.density))
    print(reference.density.sum())

    show(column(fig_hist, fig_kde, fig_sm))
def plot_uni_test():
    """Plot the custom sequential KDE alongside a histogram and the
    statsmodels reference, saving the bokeh output to ``kde.html``.
    """
    # bokeh.io.vplot was deprecated in bokeh 0.12 and later removed;
    # column() from bokeh.layouts is the supported replacement.  Imported
    # locally so the module header does not need to change.
    from bokeh.layouts import column

    samples = mixture_rvs([.25, .75], size=10000,
                          dist=[stats.norm, stats.norm],
                          kwargs=(dict(loc=-1, scale=.5),
                                  dict(loc=1, scale=.5)))
    bandwidth = approx_bandwidth(samples)
    print('bandwidth', bandwidth)
    print('size', samples.size)
    support = build_support(samples, bandwidth)
    pdf = np.zeros_like(support)
    uni_kde_seq(support, samples, bandwidth, pdf)

    # Plotting
    output_file("kde.html")

    p1 = figure(title="Hist")
    hist, edges = np.histogram(samples, bins=50, density=True)
    p1.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:])

    p2 = figure(title="KDE")
    p2.line(support, pdf)
    p2.circle(x=support, y=pdf, size=5)

    p3 = figure(title="KDE-SM")
    kde = sm.nonparametric.KDEUnivariate(samples)
    kde.fit(kernel="gau")
    p3.line(kde.support, kde.density)

    print(samples.size)
    print(len(kde.support), len(kde.density))
    print(kde.density.sum())

    # Stack the three figures vertically (equivalent of the old vplot).
    show(column(p1, p2, p3))
def driver(imp_dict, retry=3, size=10000):
    """Benchmark each KDE implementation in ``imp_dict``.

    Every implementation is invoked ``retry`` times on the same mixture
    sample and timed; each result is checked against the statsmodels
    reference density.  Returns an OrderedDict mapping implementation
    name to its list of elapsed times.
    """
    print("Running univariate kde benchmark on size = {size}".format(
        size=size))
    data = mixture_rvs([.25, .75], size=size,
                       dist=[stats.norm, stats.norm],
                       kwargs=(dict(loc=-1, scale=.5),
                               dict(loc=1, scale=.5)))
    bw = approx_bandwidth(data)

    # Reference density from statsmodels (exact, non-FFT evaluation).
    kde = sm.nonparametric.KDEUnivariate(data)
    kde.fit(kernel="gau", fft=False)
    support = kde.support
    expect = kde.density

    timing = OrderedDict()

    # Time every implementation, re-checking accuracy on each trial.
    for name, imp in imp_dict.items():
        print("Running {name}".format(name=name))
        trial_times = []
        for t in range(retry):
            print(" trial = {t}".format(t=t), end=' ... ')
            got, elapsed = imp(support, data, bw, timer)
            print("elapsed =", elapsed, end=' | ')
            trial_times.append(elapsed)
            rms = calc_rms(expect, got, norm=True)
            print("RMS =", rms)
            if rms > 0.01:
                print("*** warning, RMS is too high")
        timing[name] = trial_times
    return timing
def driver(imp_dict, retry=3, size=10000):
    """Benchmark univariate KDE implementations against statsmodels.

    Parameters
    ----------
    imp_dict : mapping of str -> callable
        Each callable is invoked as ``imp(support, samples, bandwidth,
        timer)`` and must return ``(density, elapsed)``.
    retry : int
        Number of timed trials per implementation.
    size : int
        Number of samples drawn from the Gaussian mixture.

    Returns
    -------
    OrderedDict
        Maps implementation name to its list of elapsed times.
    """
    print("Running univariate kde benchmark on size = {size}".format(size=size))
    samples = mixture_rvs(
        [0.25, 0.75],
        size=size,
        dist=[stats.norm, stats.norm],
        kwargs=(dict(loc=-1, scale=0.5), dict(loc=1, scale=0.5)),
    )
    bandwidth = approx_bandwidth(samples)
    # Run statsmodel for reference
    kde = sm.nonparametric.KDEUnivariate(samples)
    kde.fit(kernel="gau", fft=False)
    # Reuse statsmodel support for our test
    support = kde.support
    expect = kde.density
    timing = OrderedDict()
    # Run timing loop
    for name, imp in imp_dict.items():
        print("Running {name}".format(name=name))
        times = []
        for t in range(retry):
            print(" trial = {t}".format(t=t), end=" ... ")
            got, elapsed = imp(support, samples, bandwidth, timer)
            print("elapsed =", elapsed, end=" | ")
            times.append(elapsed)
            # Accuracy check on every trial; warn rather than fail so the
            # benchmark keeps running.
            rms = calc_rms(expect, got, norm=True)
            print("RMS =", rms)
            if rms > 0.01:
                print("*** warning, RMS is too high")
        timing[name] = times
    return timing
Performance of normal reference plug-in estimator vs silverman. Sample is drawn from a mixture of gaussians. Distribution has been chosen to be reasoanbly close to normal. """ from __future__ import print_function import numpy as np from scipy import stats import matplotlib.pyplot as plt import statsmodels.nonparametric.api as npar from statsmodels.sandbox.nonparametric import kernels from statsmodels.distributions.mixture_rvs import mixture_rvs # example from test_kde.py mixture of two normal distributions np.random.seed(12345) x = mixture_rvs([.1, .9], size=200, dist=[stats.norm, stats.norm], kwargs=(dict(loc=0, scale=.5), dict(loc=1, scale=.5))) kde = npar.KDEUnivariate(x) kernel_names = ['Gaussian', 'Epanechnikov', 'Biweight', 'Triangular', 'Triweight', 'Cosine' ] kernel_switch = ['gau', 'epa', 'tri', 'biw', 'triw', 'cos' ] def true_pdf(x): pdf = 0.1 * stats.norm.pdf(x, loc=0, scale=0.5)
# ## A univariate example np.random.seed( 12345) # Seed the random number generator for reproducible results # We create a bimodal distribution: a mixture of two normal distributions # with locations at `-1` and `1`. # Location, scale and weight for the two distributions dist1_loc, dist1_scale, weight1 = -1, .5, .25 dist2_loc, dist2_scale, weight2 = 1, .5, .75 # Sample from a mixture of distributions obs_dist = mixture_rvs(prob=[weight1, weight2], size=250, dist=[stats.norm, stats.norm], kwargs=(dict(loc=dist1_loc, scale=dist1_scale), dict(loc=dist2_loc, scale=dist2_scale))) # The simplest non-parametric technique for density estimation is the # histogram. fig = plt.figure(figsize=(12, 5)) ax = fig.add_subplot(111) # Scatter plot of data samples and histogram ax.scatter(obs_dist, np.abs(np.random.randn(obs_dist.size)), zorder=15, color='red', marker='x',
Created on Mon Dec 16 11:02:59 2013 Author: Josef Perktold """ import numpy as np from scipy import stats import matplotlib.pyplot as plt import statsmodels.nonparametric.api as npar from statsmodels.sandbox.nonparametric import kernels from statsmodels.distributions.mixture_rvs import mixture_rvs # example from test_kde.py mixture of two normal distributions np.random.seed(12345) x = mixture_rvs([.25,.75], size=200, dist=[stats.norm, stats.norm], kwargs = (dict(loc=-1, scale=.5),dict(loc=1, scale=.5))) x.sort() # not needed kde = npar.KDEUnivariate(x) kde.fit('gau') ci = kde.kernel.density_confint(kde.density, len(x)) fig = plt.figure() ax = fig.add_subplot(1, 1, 1) ax.hist(x, bins=15, normed=True, alpha=0.25) ax.plot(kde.support, kde.density, lw=2, color='red') ax.fill_between(kde.support, ci[:,0], ci[:,1], color='grey', alpha='0.7') ax.set_title('Kernel Density Gaussian (bw = %4.2f)' % kde.bw)
if __name__ == '__main__': examples = ['chebyt', 'fourier', 'hermite']#[2] nobs = 10000 import matplotlib.pyplot as plt from statsmodels.distributions.mixture_rvs import ( mixture_rvs, MixtureDistribution) #np.random.seed(12345) ## obs_dist = mixture_rvs([1/3.,2/3.], size=nobs, dist=[stats.norm, stats.norm], ## kwargs = (dict(loc=-1,scale=.5),dict(loc=1,scale=.75))) mix_kwds = (dict(loc=-0.5,scale=.5),dict(loc=1,scale=.2)) obs_dist = mixture_rvs([1/3.,2/3.], size=nobs, dist=[stats.norm, stats.norm], kwargs=mix_kwds) mix = MixtureDistribution() #obs_dist = np.random.randn(nobs)/4. #np.sqrt(2) if "chebyt_" in examples: # needed for Cheby example below #obs_dist = np.clip(obs_dist, -2, 2)/2.01 #chebyt [0,1] obs_dist = obs_dist[(obs_dist>-2) & (obs_dist<2)]/2.0 #/4. + 2/4.0 #fourier [0,1] #obs_dist = obs_dist[(obs_dist>-2) & (obs_dist<2)]/4. + 2/4.0 f_hat, grid, coeffs, polys = density_orthopoly(obs_dist, ChebyTPoly, order=20, xeval=None) #f_hat /= f_hat.sum() * (grid.max() - grid.min())/len(grid) f_hat0 = f_hat from scipy import integrate
Performance of normal reference plug-in estimator vs silverman. Sample is drawn from a mixture of gaussians. Distribution has been chosen to be reasoanbly close to normal. """ import numpy as np from scipy import stats import matplotlib.pyplot as plt import statsmodels.nonparametric.api as npar from statsmodels.sandbox.nonparametric import kernels from statsmodels.distributions.mixture_rvs import mixture_rvs # example from test_kde.py mixture of two normal distributions np.random.seed(12345) x = mixture_rvs([.1, .9], size=200, dist=[stats.norm, stats.norm], kwargs=(dict(loc=0, scale=.5), dict(loc=1, scale=.5))) kde = npar.KDEUnivariate(x) kernel_names = [ 'Gaussian', 'Epanechnikov', 'Biweight', 'Triangular', 'Triweight', 'Cosine' ] kernel_switch = ['gau', 'epa', 'tri', 'biw', 'triw', 'cos'] def true_pdf(x): pdf = 0.1 * stats.norm.pdf(x, loc=0, scale=0.5) pdf += 0.9 * stats.norm.pdf(x, loc=1, scale=0.5) return pdf
## Kernel Density Estimation import numpy as np from scipy import stats import statsmodels.api as sm import matplotlib.pyplot as plt from statsmodels.distributions.mixture_rvs import mixture_rvs ##### A univariate example. np.random.seed(12345) obs_dist1 = mixture_rvs([.25,.75], size=10000, dist=[stats.norm, stats.norm], kwargs = (dict(loc=-1,scale=.5),dict(loc=1,scale=.5))) kde = sm.nonparametric.KDEUnivariate(obs_dist1) kde.fit() fig = plt.figure(figsize=(12,8)) ax = fig.add_subplot(111) ax.hist(obs_dist1, bins=50, normed=True, color='red') ax.plot(kde.support, kde.density, lw=2, color='black'); obs_dist2 = mixture_rvs([.25,.75], size=10000, dist=[stats.norm, stats.beta], kwargs = (dict(loc=-1,scale=.5),dict(loc=1,scale=1,args=(1,.5))))
# ## A univariate example np.random.seed( 12345) # Seed the random number generator for reproducible results # We create a bimodal distribution: a mixture of two normal distributions # with locations at `-1` and `1`. # Location, scale and weight for the two distributions dist1_loc, dist1_scale, weight1 = -1, .5, .25 dist2_loc, dist2_scale, weight2 = 1, .5, .75 # Sample from a mixture of distributions obs_dist = mixture_rvs( prob=[weight1, weight2], size=250, dist=[stats.norm, stats.norm], kwargs=(dict(loc=dist1_loc, scale=dist1_scale), dict(loc=dist2_loc, scale=dist2_scale))) # The simplest non-parametric technique for density estimation is the # histogram. fig = plt.figure(figsize=(12, 5)) ax = fig.add_subplot(111) # Scatter plot of data samples and histogram ax.scatter( obs_dist, np.abs(np.random.randn(obs_dist.size)), zorder=15, color='red',
import numpy as np from scipy import stats from statsmodels.sandbox.nonparametric import kernels from statsmodels.distributions.mixture_rvs import mixture_rvs from statsmodels.nonparametric.bandwidths import select_bandwidth from statsmodels.nonparametric.bandwidths import bw_normal_reference from numpy.testing import assert_allclose import pytest # setup test data np.random.seed(12345) Xi = mixture_rvs([.25, .75], size=200, dist=[stats.norm, stats.norm], kwargs=(dict(loc=-1, scale=.5), dict(loc=1, scale=.5))) class TestBandwidthCalculation: def test_calculate_bandwidth_gaussian(self): bw_expected = [ 0.29774853596742024, 0.25304408155871411, 0.29781147113698891 ] kern = kernels.Gaussian() bw_calc = [0, 0, 0] for ii, bw in enumerate(['scott', 'silverman', 'normal_reference']): bw_calc[ii] = select_bandwidth(Xi, bw, kern)
if __name__ == '__main__': examples = ['chebyt', 'fourier', 'hermite']#[2] nobs = 10000 import matplotlib.pyplot as plt from statsmodels.distributions.mixture_rvs import ( mixture_rvs, MixtureDistribution) #np.random.seed(12345) ## obs_dist = mixture_rvs([1/3.,2/3.], size=nobs, dist=[stats.norm, stats.norm], ## kwargs = (dict(loc=-1,scale=.5),dict(loc=1,scale=.75))) mix_kwds = (dict(loc=-0.5,scale=.5),dict(loc=1,scale=.2)) obs_dist = mixture_rvs([1/3.,2/3.], size=nobs, dist=[stats.norm, stats.norm], kwargs=mix_kwds) mix = MixtureDistribution() #obs_dist = np.random.randn(nobs)/4. #np.sqrt(2) if "chebyt_" in examples: # needed for Cheby example below #obs_dist = np.clip(obs_dist, -2, 2)/2.01 #chebyt [0,1] obs_dist = obs_dist[(obs_dist>-2) & (obs_dist<2)]/2.0 #/4. + 2/4.0 #fourier [0,1] #obs_dist = obs_dist[(obs_dist>-2) & (obs_dist<2)]/4. + 2/4.0 f_hat, grid, coeffs, polys = density_orthopoly(obs_dist, ChebyTPoly, order=20, xeval=None) #f_hat /= f_hat.sum() * (grid.max() - grid.min())/len(grid) f_hat0 = f_hat fint = integrate.trapz(f_hat, grid)# dx=(grid.max() - grid.min())/len(grid))
# In[101]: import numpy as np from scipy import stats import statsmodels.api as sm import matplotlib.pyplot as plt from statsmodels.distributions.mixture_rvs import mixture_rvs # In[102]: np.random.seed(12345) # In[103]: mixture_rvs([.25, .75], size=11, dist=[stats.norm, stats.norm], kwargs=(dict(loc=-1, scale=.5), dict(loc=1, scale=.5))) # In[104]: obs_dist1 = mixture_rvs([.25, .75], size=10000, dist=[stats.norm, stats.norm], kwargs=(dict(loc=-1, scale=.5), dict(loc=1, scale=.5))) df.astype(float) #data = np.asarray( df2["Totals"] ) # In[105]: data #= data.astype(float)