Example No. 1
def test_uni_kde():
    samples = mixture_rvs([.25, .75], size=10000,
                          dist=[stats.norm, stats.norm],
                          kwargs=(
                              dict(loc=-1, scale=.5), dict(loc=1, scale=.5)))

    bandwidth = approx_bandwidth(samples)

    # Run statsmodels for reference
    kde = sm.nonparametric.KDEUnivariate(samples)
    kde.fit(kernel="gau")

    # Reuse statsmodels support for our test
    support = kde.support

    # Run custom KDE
    pdf = np.zeros_like(support)
    uni_kde_seq(support, samples, bandwidth, pdf)

    # Check value
    expect = kde.density
    got = pdf

    rms = calc_rms(expect, got, norm=True)
    assert rms < 1e-2, "RMS error too high: {0}".format(rms)
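The helper functions approx_bandwidth, uni_kde_seq, and calc_rms are project-specific and never shown in these excerpts. A minimal sketch of what they might look like, assuming a rule-of-thumb bandwidth, a plain Gaussian kernel, and a normalized root-mean-square error; all three definitions are illustrative stand-ins, not the project's actual code:

import numpy as np


def approx_bandwidth(samples):
    # Assumption: Scott's rule-of-thumb bandwidth, sigma * n ** (-1/5);
    # the project may use a different estimator.
    return samples.std(ddof=1) * samples.size ** (-1.0 / 5.0)


def uni_kde_seq(support, samples, bandwidth, pdf):
    # Sequential Gaussian KDE: at each evaluation point, average the
    # kernel contributions of all samples, writing into pdf in place.
    k = 1.0 / (np.sqrt(2 * np.pi) * bandwidth * samples.size)
    for i in range(support.size):
        u = (support[i] - samples) / bandwidth
        pdf[i] = k * np.exp(-0.5 * u * u).sum()


def calc_rms(expect, got, norm=False):
    # Root-mean-square error, optionally normalized by the scale of the
    # reference values.
    rms = np.sqrt(np.mean((expect - got) ** 2))
    if norm:
        rms /= np.sqrt(np.mean(expect ** 2))
    return rms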
Example No. 2
def test_hsa_uni_kde_ver2():
    np.random.seed(12345)

    samples = mixture_rvs(
        [0.25, 0.75],
        size=10000,
        dist=[stats.norm, stats.norm],
        kwargs=(dict(loc=-1, scale=0.5), dict(loc=1, scale=0.5)),
    )

    bandwidth = approx_bandwidth(samples)

    # Run statsmodels for reference
    kde = sm.nonparametric.KDEUnivariate(samples)
    kde.fit(kernel="gau", fft=False)

    # Reuse statsmodels support for our test
    support = kde.support

    # Run custom KDE
    pdf = np.zeros_like(support)
    hsa_uni_kde_ver2(support, samples, bandwidth, pdf)

    # Check value
    expect = kde.density
    got = pdf

    rms = calc_rms(expect, got, norm=True)
    print("RMS", rms)
    assert rms < 1e-2, "RMS error too high: {0}".format(rms)
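Two details distinguish this variant from Example No. 1: fitting with fft=False makes statsmodels evaluate the Gaussian KDE pointwise rather than via FFT, which gives a like-for-like reference, and hsa_uni_kde_ver2 is presumably an HSA (GPU) implementation exposing the same (support, samples, bandwidth, pdf) signature as uni_kde_seq.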
Example No. 3
def bimodal_samples(n):
    """
    Samples from a bimodal distribution.

    The script in this function is taken from
    https://www.statsmodels.org/stable/examples/notebooks/generated/kernel_density.html
    """
    # Location, scale and weight for the two distributions
    dist1_loc, dist1_scale, weight1 = -1, .4, .3
    dist2_loc, dist2_scale, weight2 = 1, .5, .7
    # Sample from a mixture of distributions
    f = mixture_rvs(prob=[weight1, weight2],
                    size=n,
                    dist=[stats.norm, stats.norm],
                    kwargs=(dict(loc=dist1_loc, scale=dist1_scale),
                            dict(loc=dist2_loc, scale=dist2_scale)))
    return f
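A quick usage sketch for this helper, assuming numpy, scipy.stats and mixture_rvs are imported as in the other examples:

samples = bimodal_samples(1000)
# The mixture has mean 0.3 * (-1) + 0.7 * 1 = 0.4 and std close to 1
print(samples.mean(), samples.std())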
Example No. 4
def plot_uni_test():
    samples = mixture_rvs([.25, .75],
                          size=10000,
                          dist=[stats.norm, stats.norm],
                          kwargs=(dict(loc=-1, scale=.5),
                                  dict(loc=1, scale=.5)))

    bandwidth = approx_bandwidth(samples)

    print('bandwidth', bandwidth)
    print('size', samples.size)

    support = build_support(samples, bandwidth)
    pdf = np.zeros_like(support)

    uni_kde_seq(support, samples, bandwidth, pdf)

    # Plotting
    output_file("kde.html")

    p1 = figure(title="Hist")
    hist, edges = np.histogram(samples, bins=50, density=True)
    p1.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:])

    p2 = figure(title="KDE")
    p2.line(support, pdf)
    p2.circle(x=support, y=pdf, size=5)

    p3 = figure(title="KDE-SM")
    kde = sm.nonparametric.KDEUnivariate(samples)
    kde.fit(kernel="gau")

    p3.line(kde.support, kde.density)

    print(samples.size)
    print(len(kde.support), len(kde.density))

    print(kde.density.sum())
    show(column(p1, p2, p3))
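The plotting calls in this snippet come from Bokeh; a plausible import block for it (figure, show, output_file and the column layout are standard Bokeh entry points, while approx_bandwidth, build_support and uni_kde_seq remain project helpers):

import numpy as np
import statsmodels.api as sm
from scipy import stats
from bokeh.plotting import figure, show, output_file
from bokeh.layouts import column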
Example No. 5
    def driver(imp_dict, retry=3, size=10000):
        print("Running univariate kde benchmark on size = {size}".format(
            size=size))
        samples = mixture_rvs([.25, .75], size=size,
                              dist=[stats.norm, stats.norm],
                              kwargs=(
                                  dict(loc=-1, scale=.5),
                                  dict(loc=1, scale=.5)))

        bandwidth = approx_bandwidth(samples)

        # Run statsmodels for reference
        kde = sm.nonparametric.KDEUnivariate(samples)
        kde.fit(kernel="gau", fft=False)

        # Reuse statsmodels support for our test
        support = kde.support
        expect = kde.density

        timing = OrderedDict()

        # Run timing loop
        for name, imp in imp_dict.items():
            print("Running {name}".format(name=name))
            times = []
            for t in range(retry):
                print(" trial = {t}".format(t=t), end=' ... ')
                got, elapsed = imp(support, samples, bandwidth, timer)
                print("elapsed =", elapsed, end=' | ')
                times.append(elapsed)
                rms = calc_rms(expect, got, norm=True)
                print("RMS =", rms)
                if rms > 0.01:
                    print("*** warning, RMS is too high")
            timing[name] = times

        return timing
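From the call site, each imp callable must accept (support, samples, bandwidth, timer) and return a (pdf, elapsed) pair. A minimal adapter sketch under that assumption, wrapping a kernel with the uni_kde_seq signature:

from timeit import default_timer as timer  # assumed to be the timer driver() passes in


def make_timed(kernel_func):
    # Adapt a KDE implementation to the interface driver() expects.
    def imp(support, samples, bandwidth, timer):
        pdf = np.zeros_like(support)
        start = timer()
        kernel_func(support, samples, bandwidth, pdf)
        return pdf, timer() - start
    return imp


timing = driver({'seq': make_timed(uni_kde_seq)})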
Example No. 6
# ## A univariate example

np.random.seed(12345)  # Seed the random number generator for reproducible results

# We create a bimodal distribution: a mixture of two normal distributions
# with locations at `-1` and `1`.

# Location, scale and weight for the two distributions
dist1_loc, dist1_scale, weight1 = -1, .5, .25
dist2_loc, dist2_scale, weight2 = 1, .5, .75

# Sample from a mixture of distributions
obs_dist = mixture_rvs(prob=[weight1, weight2],
                       size=250,
                       dist=[stats.norm, stats.norm],
                       kwargs=(dict(loc=dist1_loc, scale=dist1_scale),
                               dict(loc=dist2_loc, scale=dist2_scale)))

# The simplest non-parametric technique for density estimation is the
# histogram.

fig = plt.figure(figsize=(12, 5))
ax = fig.add_subplot(111)

# Scatter plot of data samples and histogram
ax.scatter(obs_dist,
           np.abs(np.random.randn(obs_dist.size)),
           zorder=15,
           color='red',
           marker='x',
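The excerpt is cut off inside the scatter call. In the source notebook the histogram figure is followed by fitting the KDE itself; a sketch of that next step, assuming statsmodels.api is imported as sm:

kde = sm.nonparametric.KDEUnivariate(obs_dist)
kde.fit()  # default Gaussian kernel, automatic bandwidth
ax.plot(kde.support, kde.density, lw=3, zorder=10)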
Example No. 7
Created on Mon Dec 16 11:02:59 2013

Author: Josef Perktold
"""

import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import statsmodels.nonparametric.api as npar
from statsmodels.sandbox.nonparametric import kernels
from statsmodels.distributions.mixture_rvs import mixture_rvs

# example from test_kde.py mixture of two normal distributions
np.random.seed(12345)
x = mixture_rvs([.25, .75], size=200, dist=[stats.norm, stats.norm],
                kwargs=(dict(loc=-1, scale=.5), dict(loc=1, scale=.5)))

x.sort() # not needed

kde = npar.KDEUnivariate(x)
kde.fit('gau')
ci = kde.kernel.density_confint(kde.density, len(x))

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.hist(x, bins=15, density=True, alpha=0.25)
ax.plot(kde.support, kde.density, lw=2, color='red')
ax.fill_between(kde.support, ci[:, 0], ci[:, 1],
                color='grey', alpha=0.7)
ax.set_title('Kernel Density Gaussian (bw = %4.2f)' % kde.bw)
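density_confint comes from the sandbox kernels module: it returns pointwise lower and upper confidence bounds for the estimated density based on the asymptotic variance of the kernel estimator, which is why it needs only the density values and the sample size.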
Example No. 8
Performance of the normal reference plug-in estimator vs. Silverman. The sample is drawn
from a mixture of Gaussians. The distribution has been chosen to be reasonably close
to normal.
"""

import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import statsmodels.nonparametric.api as npar
from statsmodels.sandbox.nonparametric import kernels
from statsmodels.distributions.mixture_rvs import mixture_rvs

# example from test_kde.py mixture of two normal distributions
np.random.seed(12345)
x = mixture_rvs([.1, .9],
                size=200,
                dist=[stats.norm, stats.norm],
                kwargs=(dict(loc=0, scale=.5), dict(loc=1, scale=.5)))

kde = npar.KDEUnivariate(x)

kernel_names = [
    'Gaussian', 'Epanechnikov', 'Biweight', 'Triangular', 'Triweight', 'Cosine'
]

# Kernel codes, ordered to pair with kernel_names ('biw' is Biweight, 'tri' is Triangular)
kernel_switch = ['gau', 'epa', 'biw', 'tri', 'triw', 'cos']


def true_pdf(x):
    pdf = 0.1 * stats.norm.pdf(x, loc=0, scale=0.5)
    pdf += 0.9 * stats.norm.pdf(x, loc=1, scale=0.5)
    return pdf
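The excerpt stops at the true density. A plausible continuation under this setup, comparing each fitted kernel against true_pdf; the grid and the integrated-squared-error metric are illustrative choices, not necessarily the original script's:

grid = np.linspace(-2, 3, 201)
for name, kern in zip(kernel_names, kernel_switch):
    # fft=True is only supported for the Gaussian kernel
    kde.fit(kernel=kern, bw='normal_reference', fft=False)
    dens = np.interp(grid, kde.support, kde.density)
    ise = np.trapz((dens - true_pdf(grid)) ** 2, grid)
    print(name, ise)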
Example No. 9
## Kernel Density Estimation

import numpy as np
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.distributions.mixture_rvs import mixture_rvs


##### A univariate example.

np.random.seed(12345)


obs_dist1 = mixture_rvs([.25, .75], size=10000, dist=[stats.norm, stats.norm],
                        kwargs=(dict(loc=-1, scale=.5), dict(loc=1, scale=.5)))


kde = sm.nonparametric.KDEUnivariate(obs_dist1)
kde.fit()


fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)
ax.hist(obs_dist1, bins=50, density=True, color='red')
ax.plot(kde.support, kde.density, lw=2, color='black');


obs_dist2 = mixture_rvs([.25, .75], size=10000, dist=[stats.norm, stats.beta],
                        kwargs=(dict(loc=-1, scale=.5),
                                dict(loc=1, scale=1, args=(1, .5))))
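In the source notebook this second, norm-plus-beta mixture is fit and plotted the same way as the first; a sketch of that step:

kde2 = sm.nonparametric.KDEUnivariate(obs_dist2)
kde2.fit()

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111)
ax.hist(obs_dist2, bins=50, density=True, color='red')
ax.plot(kde2.support, kde2.density, lw=2, color='black')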
Example No. 10
import numpy as np
from scipy import stats

from statsmodels.sandbox.nonparametric import kernels
from statsmodels.distributions.mixture_rvs import mixture_rvs
from statsmodels.nonparametric.bandwidths import select_bandwidth
from statsmodels.nonparametric.bandwidths import bw_normal_reference

from numpy.testing import assert_allclose
import pytest

# setup test data

np.random.seed(12345)
Xi = mixture_rvs([.25, .75],
                 size=200,
                 dist=[stats.norm, stats.norm],
                 kwargs=(dict(loc=-1, scale=.5), dict(loc=1, scale=.5)))


class TestBandwidthCalculation:
    def test_calculate_bandwidth_gaussian(self):

        bw_expected = [
            0.29774853596742024, 0.25304408155871411, 0.29781147113698891
        ]

        kern = kernels.Gaussian()

        bw_calc = [0, 0, 0]
        for ii, bw in enumerate(['scott', 'silverman', 'normal_reference']):
            bw_calc[ii] = select_bandwidth(Xi, bw, kern)
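The class is truncated here. Its natural conclusion, suggested by the assert_allclose import at the top of the snippet (an assumption, since the line itself is cut off), is a comparison of the computed and expected bandwidths:

        # compare computed bandwidths against the reference values
        assert_allclose(bw_expected, bw_calc)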
Example No. 11
if __name__ == '__main__':

    examples = ['chebyt', 'fourier', 'hermite']#[2]

    nobs = 10000

    import matplotlib.pyplot as plt
    from statsmodels.distributions.mixture_rvs import (
                                                mixture_rvs, MixtureDistribution)

    #np.random.seed(12345)
##    obs_dist = mixture_rvs([1/3.,2/3.], size=nobs, dist=[stats.norm, stats.norm],
##                   kwargs = (dict(loc=-1,scale=.5),dict(loc=1,scale=.75)))
    mix_kwds = (dict(loc=-0.5, scale=.5), dict(loc=1, scale=.2))
    obs_dist = mixture_rvs([1/3., 2/3.], size=nobs, dist=[stats.norm, stats.norm],
                           kwargs=mix_kwds)
    mix = MixtureDistribution()

    #obs_dist = np.random.randn(nobs)/4. #np.sqrt(2)


    if "chebyt_" in examples: # needed for Cheby example below
        #obs_dist = np.clip(obs_dist, -2, 2)/2.01
        #chebyt [0,1]
        obs_dist = obs_dist[(obs_dist>-2) & (obs_dist<2)]/2.0 #/4. + 2/4.0
        #fourier [0,1]
        #obs_dist = obs_dist[(obs_dist>-2) & (obs_dist<2)]/4. + 2/4.0
        f_hat, grid, coeffs, polys = density_orthopoly(obs_dist, ChebyTPoly, order=20, xeval=None)
        #f_hat /= f_hat.sum() * (grid.max() - grid.min())/len(grid)
        f_hat0 = f_hat
        from scipy import integrate
        fint = integrate.trapz(f_hat, grid)  # dx=(grid.max() - grid.min())/len(grid)
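Here integrate.trapz acts as a sanity check: integrating the orthogonal-polynomial density estimate f_hat over its grid should return a value close to 1 when the estimate is properly normalized, as the commented-out normalization line above suggests.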
Example No. 12
# In[101]:

import numpy as np
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.distributions.mixture_rvs import mixture_rvs

# In[102]:

np.random.seed(12345)

# In[103]:

mixture_rvs([.25, .75],
            size=11,
            dist=[stats.norm, stats.norm],
            kwargs=(dict(loc=-1, scale=.5), dict(loc=1, scale=.5)))

# In[104]:

obs_dist1 = mixture_rvs([.25, .75],
                        size=10000,
                        dist=[stats.norm, stats.norm],
                        kwargs=(dict(loc=-1, scale=.5), dict(loc=1, scale=.5)))
df.astype(float)  # df comes from an earlier notebook cell that is not shown here
#data = np.asarray( df2["Totals"] )

# In[105]:

data  #= data.astype(float)   # data likewise comes from a cell not shown here