Example #1
class KDE4BO(BaseDensityEstimator):
    def __init__(self,
                 top_n_percent=15,
                 bandwidth_factor=3,
                 min_bandwidth=1e-3,
                 bw_estimation="normal_reference",
                 min_points_in_kde=2):
        super(KDE4BO,
              self).__init__(top_n_percent, bandwidth_factor, min_bandwidth,
                             bw_estimation, min_points_in_kde)
        self.good_kde = None
        self.bad_kde = None

    def fit(self, X: np.ndarray, y: np.ndarray):
        super(KDE4BO, self).fit(X, y)
        self.kde_vartypes = "".join([
            "u" if n_choices > 0 else "c"
            for n_choices in self.config_transformer.n_choices_list
        ])
        n_good = max(2, (self.top_n_percent * X.shape[0]) // 100)
        N = X.shape[0]
        L = len(self.config_transformer.n_choices_list)
        if n_good <= L or N - n_good <= L:
            return None
        idx = np.argsort(y)
        if self.good_kde is None:
            good_kde_bw = np.zeros(
                [len(self.config_transformer.n_choices_list)]) + 0.1
            bad_kde_bw = deepcopy(good_kde_bw)
        else:
            good_kde_bw = self.good_kde.bw
            bad_kde_bw = self.bad_kde.bw
        X_good = X[idx[:n_good]]
        X_bad = X[idx[n_good:]]
        for X_, bw_vector in zip([X_good, X_bad], [good_kde_bw, bad_kde_bw]):
            M = X_.shape[1]
            for i in range(M):
                bw = bw_vector[i]
                n_choices = self.config_transformer.n_choices_list[i]
                X_[:, i] = self.process_constants_vector(X_[:, i],
                                                         n_choices,
                                                         bw,
                                                         mode="replace")
        self.good_kde = KDEMultivariate(data=X_good,
                                        var_type=self.kde_vartypes,
                                        bw=self.bw_estimation)
        self.bad_kde = KDEMultivariate(data=X_bad,
                                       var_type=self.kde_vartypes,
                                       bw=self.bw_estimation)
        return self

    def predict(self, X: np.ndarray):
        super(KDE4BO, self).predict(X)
        good_pdf = self.good_kde.pdf(X)
        bad_pdf = self.bad_kde.pdf(X)
        return good_pdf / bad_pdf
Example #2
File: entropy.py  Project: bumps/bumps
def kde_entropy_statsmodels(points, n_est=None):
    """
    Use statsmodels KDEMultivariate pdf to estimate entropy.

    Density evaluated at sample points.

    Slow and fails for bimodal, dirichlet; poor for high dimensional MVN.
    """
    from statsmodels.nonparametric.kernel_density import KDEMultivariate
    n, d = points.shape

    # Default to the full set
    if n_est is None:
        n_est = n

    # reduce size of draw to n_est
    if n_est >= n:
        x = points
    else:
        x = points[permutation(n)[:n_est]]
        n = n_est

    predictor = KDEMultivariate(data=x, var_type='c'*d)
    p = predictor.pdf()
    H = -np.mean(log(p))
    return H / LN2
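
A quick sanity check (not part of the original project): for a standard normal sample the analytic entropy is 0.5*log(2*pi*e) nats, so the estimate in bits should land near that value divided by log(2). The driver below supplies the module-level names (np, log, permutation, LN2) that the excerpt assumes.

import numpy as np
from numpy import log
from numpy.random import permutation

LN2 = np.log(2)

# kde_entropy_statsmodels defined as in the excerpt above
points = np.random.RandomState(0).normal(size=(500, 1))
H_est = kde_entropy_statsmodels(points)        # entropy estimate in bits
H_true = 0.5 * np.log(2 * np.pi * np.e) / LN2  # analytic entropy of N(0, 1)
print(H_est, H_true)                           # the two should be close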
Example #3
def wind_dir_pressure(year=2013):
    from statsmodels.nonparametric.kernel_density import KDEMultivariate as KDE
    import robust as rb

    min2 = 0
    sigfac = 3
    sigsamp = 5

    d = get_data(year=year)
    wdir = d["winddir_deg"]
    
    wdir_rand = wdir + np.random.normal(0,12,len(wdir))
    bad = np.isnan(wdir_rand)
    wdir_rand[bad] = np.random.uniform(0,360,np.sum(bad))
    
    press = d["pressure"]
    
    dist1 = wdir_rand
    dist2 = press
    
    med1 = np.median(dist1)
    sig1 = rb.std(dist1)
    datamin1 = np.min(dist1)
    datamax1 = np.max(dist1)
    min1 = 0.0
    max1 = 360.0


    med2 = np.median(dist2)
    sig2 = rb.std(dist2)
    datamin2 = np.min(dist2)
    datamax2 = np.max(dist2)
    min2 = np.min(dist2)
    max2 = np.max(dist2)
    
    X, Y = np.mgrid[min1:max1:100j, min2:max2:100j]
    positions = np.vstack([X.ravel(), Y.ravel()])
    values = np.vstack([dist1, dist2])
    
    kernel = KDE(values,var_type='cc',bw=[sig1/sigsamp,sig2/sigsamp])
    Z = np.reshape(kernel.pdf(positions).T, X.shape)
    
    aspect = (max1-min1)/(max2-min2) * 8.5/11.0

    plot_params()
    plt.ion()
    plt.figure(5,figsize=(11,8.5))
    plt.clf()
    ax = plt.subplot(111)
    ax.imshow(np.rot90(Z), cmap=plt.cm.CMRmap_r,aspect=aspect, \
              extent=[min1, max1, min2, max2],origin='upper')
    ax.yaxis.labelpad = 12
    ax.set_ylabel('Atmospheric Pressure (in-Hg)',fontsize=fs)
    ax.set_xlabel('Wind Direction (degrees)',fontsize=fs)
    plt.title('Wind Direction and Pressure at Thacher Observatory in '+str(year),fontsize=fs)
    
    plt.savefig('Wind_Direction_Pressure_'+str(year)+'.png',dpi=300)
    mpl.rcdefaults()

    return
Example #4
File: entropy.py  Project: llimeht/bumps
def kde_entropy_statsmodels(points, n_est=None):
    """
    Use statsmodels KDEMultivariate pdf to estimate entropy.

    Density evaluated at sample points.

    Slow and fails for bimodal, dirichlet; poor for high dimensional MVN.
    """
    from statsmodels.nonparametric.kernel_density import KDEMultivariate
    n, d = points.shape

    # Default to the full set
    if n_est is None:
        n_est = n

    # reduce size of draw to n_est
    if n_est >= n:
        x = points
    else:
        x = points[permutation(n)[:n_est]]
        n = n_est

    predictor = KDEMultivariate(data=x, var_type='c' * d)
    p = predictor.pdf()
    H = -np.mean(log(p))
    return H / LN2
Example #5
def plot_density_panel(chains, names=None, hist_on=False, figsizeinches=None):
    '''
    Plot marginal posterior densities

    Args:
        * **chains** (:class:`~numpy.ndarray`): Sampling chain for each parameter
        * **names** (:py:class:`list`): List of strings - name of each parameter
        * **hist_on** (:py:class:`bool`): Flag to include histogram on density plot
        * **figsizeinches** (:py:class:`list`): Specify figure size in inches [Width, Height]
    '''
    nsimu, nparam = chains.shape  # number of rows, number of columns
    ns1, ns2, names, figsizeinches = setup_plot_features(nparam=nparam, names=names, figsizeinches=figsizeinches)
    f = plt.figure(dpi=100, figsize=(figsizeinches))  # initialize figure
    for ii in range(nparam):
        # define chain
        chain = chains[:, ii].reshape(nsimu, 1)  # check indexing
        # define x grid
        chain_grid = make_x_grid(chain)
        # Compute kernel density estimate
        kde = KDEMultivariate(chain, bw='normal_reference', var_type='c')
        # plot density on subplot
        plt.subplot(ns1, ns2, ii+1)
        if hist_on is True:  # include histograms
            hist(chain, density=True)
        plt.plot(chain_grid, kde.pdf(chain_grid), 'k')
        # format figure
        plt.xlabel(names[ii])
        plt.ylabel(str('$\\pi$({}$|M^{}$)'.format(names[ii], '{data}')))
        plt.tight_layout(rect=[0, 0.03, 1, 0.95], h_pad=1.0)  # adjust spacing
    return f
Example #6
def kde_statsmodels_m(x: np.ndarray, x_grid: np.ndarray) -> np.ndarray:
    """Multivariate Kernel Density Estimation with Statsmodels"""
    kde = KDEMultivariate(
        x,
        bw='cv_ml',  # bandwidth * np.ones_like(x),
        var_type='u')
    return kde.pdf(x_grid)
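
As a hedged illustration (not from the source project): with var_type='u' the estimator treats the data as unordered categories, so the function above returns an estimated probability mass over category values.

import numpy as np
from statsmodels.nonparametric.kernel_density import KDEMultivariate

x = np.random.RandomState(0).randint(0, 5, size=400)
grid = np.arange(5)
pmf = kde_statsmodels_m(x, grid)  # estimated probability of each category
print(pmf, pmf.sum())             # the masses should sum to roughly 1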
Example #7

def speed_graphs(N0=0, N=4500, vmax=3, resolution=300):
    data = dict(
        (n, np.genfromtxt("pdf/v-{0:04d}.csv".format(n), delimiter=' '))
        for n in range(N))
    Tdata = np.genfromtxt("bulk.csv", delimiter=' ')
    T = Tdata[:, 2]
    t = Tdata[:, 1]

    x = np.linspace(0, vmax, resolution)

    for n in np.arange(N0, N):
        kde = KDEMultivariate(data[n], bw='normal_reference', var_type='c')
        fig = plt.figure()
        ax = fig.gca()
        fig.subplots_adjust(wspace=0)
        fig.suptitle("Time = {0:.2f} s".format(t[n]), fontsize=7)

        ax.set_ylim(-0.01, 2.5)
        plt.xlabel("Velocity norm")
        plt.ylabel("PDF")
        # Fix the seed for reproducibility
        ax.plot(x, kde.pdf(x), label="Simulation")
        ax.plot(x,
                maxwell_boltzman_speed(v=x, m=1, kT=T[n]),
                label="Maxwell-Boltzmann")
        ax.legend(loc='upper right', shadow=True)
        fig.savefig("v-pdf{0:04d}.png".format(n), bbox_inches='tight', dpi=300)
        plt.close()
Example #8

def velocity_graphs(N0=0, N=4500, vmax=1, resolution=0.05):
    data = dict(
        (n, np.genfromtxt("pdf/VX-{0:04d}.csv".format(n), delimiter=' '))
        for n in range(N))
    Tdata = np.genfromtxt("bulk.csv", delimiter=' ')
    # T = Tdata[:, 2]
    t = Tdata[:, 1]

    x, y = np.mgrid[-vmax:vmax:resolution, -vmax:vmax:resolution]

    for n in np.arange(N0, N):
        kde = KDEMultivariate(data=data[n][:, 3:5],
                              bw='normal_reference',
                              var_type='cc')
        fig = plt.figure()
        ax = fig.gca()
        fig.subplots_adjust(wspace=0)
        fig.suptitle("Time = {0:.2f} s".format(t[n]), fontsize=7)

        plt.xlabel("$x$-velocity")
        plt.ylabel("$y$-velocity")
        nx = x.shape[0]
        ny = x.shape[1]
        pdf = np.zeros((nx, ny))
        print("Evaluating the function")
        for i in range(nx):
            for j in range(ny):
                pdf[i, j] = kde.pdf([x[i, j], y[i, j]])

        #cs = ax.contour(x, y, pdf, vmin=0.0, vmax=1.6, label="Simulation")
        cs = ax.contour(x, y, pdf, label="Simulation", cmap=plt.cm.Paired)
        cs.set_clim(0, 1.6)
        plt.clabel(cs, inline=1, fontsize=5, fmt="%1.1f")
        fig.savefig("v-pdf{0:04d}.png".format(n), bbox_inches='tight', dpi=300)
        plt.close()
Example #9

def estimate_kernel_density(
    coordinates,
    variable_types=None,
    bandwidths="cv_ml",
    mins=None,
    maxs=None,
    grid_sizes=None,
):

    n_dimension = len(coordinates)

    if variable_types is None:

        variable_types = "c" * n_dimension

    kde_multivariate = KDEMultivariate(
        coordinates, var_type=variable_types, bw=bandwidths
    )

    if mins is None:

        mins = tuple(coordinate.min() for coordinate in coordinates)

    if maxs is None:

        maxs = tuple(coordinate.max() for coordinate in coordinates)

    if grid_sizes is None:

        grid_sizes = (64,) * n_dimension

    return kde_multivariate.pdf(
        make_mesh_grid_coordinates_per_axis(mins, maxs, grid_sizes)
    ).reshape(grid_sizes)
Example #10
    def calculatePDF(self, tracks):
        """
        Calculate a 2-d probability density surface using kernel density
        estimation.

        :param tracks: Collection of :class:`Track` objects.
        """

        if len(tracks) == 0:
            # No tracks:
            return np.zeros(self.X.shape)

        lon = np.array([])
        lat = np.array([])
        for t in tracks:
            lon = np.append(lon, t.Longitude)
            lat = np.append(lat, t.Latitude)

        xy = np.vstack([self.X.ravel(), self.Y.ravel()])
        data = np.array([[lon], [lat]])

        kde = KDEMultivariate(data, bw='cv_ml', var_type='cc')
        pdf = kde.pdf(data_predict=xy)

        return pdf.reshape(self.X.shape)
Example #11
def data_to_pdf(data, coords):
    num_of_variables = 1
    if len(data.shape) > 1:
        num_of_variables = data.shape[1]
    kde = KDEMultivariate(
        data=data, bw='normal_reference', var_type='c' * num_of_variables)
    return kde.pdf(coords)
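
A minimal, hypothetical call for the 1-D case (names and data are illustrative):

import numpy as np
from statsmodels.nonparametric.kernel_density import KDEMultivariate

samples = np.random.RandomState(0).normal(size=500)
grid = np.linspace(-4, 4, 101)
pdf = data_to_pdf(samples, grid)  # density evaluated on the grid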
Example #12
def kde_statsmodels_m(x, x_grid, bandwidth=0.2, **kwargs):
    """Multivariate Kernel Density Estimation with Statsmodels"""
    kde = KDEMultivariate(x,
                          bw=bandwidth * np.ones_like(x),
                          var_type='c',
                          **kwargs)
    return kde.pdf(x_grid)
Example #13
def kde_statsmodels_m(x, x_grid, bandwidth=0.2, **kwargs):
    """Multivariate Kernel Density Estimation with Statsmodels"""
    from statsmodels.nonparametric.kernel_density import KDEMultivariate  # for multivariate KDE
    kde = KDEMultivariate(x, bw=np.array(bandwidth * np.ones_like(x)),
                          var_type='c', **kwargs)

    return kde.pdf(x_grid)  # return the pdf evaluated at the entries of x_grid
Example #14
def data_to_pdf(data, coords):
    num_of_variables = 1
    if len(data.shape) > 1:
        num_of_variables = data.shape[1]
    kde = KDEMultivariate(data=data,
                          bw='normal_reference',
                          var_type='c' * num_of_variables)
    return kde.pdf(coords)
Example #15
    def kde_statsmodels_m(self, x_grid, bandwidth=0.2, **kwargs):
        """Multivariate Kernel Density Estimation with
        Statsmodels"""
        from statsmodels.nonparametric.kernel_density import KDEMultivariate
        kde = KDEMultivariate(self.data,
                              bw=bandwidth * np.ones_like(self.data),
                              var_type='c',
                              **kwargs)
        return kde.pdf(x_grid)
Example #16

    def histogram(self):
        x = linspace(0, self.maxV, self.resolution)

        v = [
            pp.getLinearVelocity().norm()
            for pp in self.sim.getParticleList()
        ]
        kde = KDEMultivariate(v, bw='normal_reference', var_type='c')
        with open("v-pdf{0:04d}.csv".format(self.count), 'w') as csvfile:
            writer = csv.writer(csvfile, delimiter=' ')
            for n in range(self.resolution):
                writer.writerow([x[n], kde.pdf(x)[n]])
Example #17
def kde_xval(bw, args):
    sample = args['x']
    n_folds = args['n_folds']
    var_type = args['var_type']
    losses = []
    for train, test in KFold(n_splits=n_folds).split(sample):
        kde = KDEMultivariate(sample[train], var_type=var_type, bw=[bw])
        pdf = kde.pdf(sample[test])
        logpdf = np.log(pdf)
        logpdfsum = logpdf.sum()
        losses.append(-1 * logpdfsum)
    return np.mean(losses)
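
A hypothetical driver for kde_xval, minimizing the K-fold negative log-likelihood over the bandwidth; minimize_scalar and the search bounds are assumptions, not part of the snippet.

import numpy as np
from scipy.optimize import minimize_scalar
from sklearn.model_selection import KFold                             # used inside kde_xval
from statsmodels.nonparametric.kernel_density import KDEMultivariate  # used inside kde_xval

sample = np.random.RandomState(0).normal(size=(200, 1))
args = {'x': sample, 'n_folds': 5, 'var_type': 'c'}
res = minimize_scalar(lambda bw: kde_xval(bw, args),
                      bounds=(1e-3, 2.0), method='bounded')
print("cross-validated bandwidth:", res.x)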
Example #18
def plot_density_panel(chains, names = None, settings = None):
    '''
    Plot marginal posterior densities

    Args:
        * **chains** (:class:`~numpy.ndarray`): Sampling chain for each parameter
        * **names** (:py:class:`list`): List of strings - name of each parameter
        * **settings** (:py:class:`dict`): Settings for features of this method.
        
    Returns:
        * (:py:class:`tuple`): (figure handle, settings actually used in program)
    '''
    default_settings = {
    'maxpoints': 500,
    'fig': dict(figsize = (5,4), dpi = 100),
    'kde': dict(bw = 'normal_reference', var_type = 'c'),
    'plot': dict(color = 'k', marker = None, linestyle = '-', linewidth = 3),
    'xlabel': {},
    'ylabel': {},
    'hist_on': False,
    'hist': dict(density = True),
    }
    settings = check_settings(default_settings = default_settings, user_settings = settings)
    
    nsimu, nparam = chains.shape # number of rows, number of columns
    ns1, ns2 = generate_subplot_grid(nparam)
    names = generate_names(nparam, names)
    
    f = plt.figure(**settings['fig']) # initialize figure
    for ii in range(nparam):
        # define chain
        chain = chains[:,ii].reshape(nsimu,1) # check indexing
        
        # define x grid
        chain_grid = make_x_grid(chain)
        
        # Compute kernel density estimate
        kde = KDEMultivariate(chain, **settings['kde'])

        # plot density on subplot
        plt.subplot(ns1,ns2,ii+1)
             
        if settings['hist_on'] is True: # include histograms
            hist(chain, **settings['hist'])
            
        plt.plot(chain_grid, kde.pdf(chain_grid), **settings['plot'])
        # format figure
        plt.xlabel(names[ii], **settings['xlabel'])
        plt.ylabel(str('$\\pi$({}$|M^{}$)'.format(names[ii], '{data}')), **settings['ylabel'])
        plt.tight_layout(rect=[0, 0.03, 1, 0.95],h_pad=1.0) # adjust spacing

    return f, settings
Example #19

def normal_pdf_box_vs_point():
    N = 3000
    D = 3
    L = 10
    resolution = 0.01
    vmax = 0.5
    num_of_intervals = int(np.floor(2 * vmax / resolution))
    np.random.seed(1)
    sigma = 0.1
    # Positions will be uniformy distributed
    pos = 2 * L * (np.random.rand(N, D) - 0.5)
    # Velocities will normally distributed
    vel = sigma * np.random.randn(N, D)
    data = np.concatenate((pos, vel), axis=1)
    data_box = box_to_particles(data, x=np.array([0, 0, 0]), a=2)
    data_box = data_box[:, 3]
    print "Number of particles in a box {0}".format(data_box.shape[0])

    # vx, vy = np.mgrid[-vmax:vmax:resolution, -vmax:vmax:resolution]
    vx = np.linspace(-vmax, vmax, num_of_intervals)
    pdf_box = data_to_pdf(data_box, vx)

    kde = KDEMultivariate(data=data[:, np.array([0, 1, 2, 3])],
                          bw='normal_reference',
                          var_type='cccc')

    print(kde.bw)
    dl = resolution
    pdf_point = np.zeros((num_of_intervals, 1))
    # Need to calculate integral \int p(vx, vy, x, y, z) dvx dvy
    area = 0
    for n, v in enumerate(vx):
        vv = np.array([v])
        pdf_point[n] = \
            kde.pdf(np.concatenate((np.array([0, 0, 0]), vv), axis=0))
        area += pdf_point[n]
    area *= dl
    pdf_point /= area

    pdf_true = (norm(0, sigma).pdf(vx))

    fig = plt.figure()
    ax = fig.gca()
    l1, = ax.plot(vx, pdf_point)
    l2, = ax.plot(vx, pdf_box)
    l3, = ax.fill(vx, pdf_true, ec='gray', fc='gray', alpha=0.4)
    # cs.set_clim(0, 1.6)

    plt.legend([l1, l2, l3], ["Point approach", "Box approach", "Gaussian"])
    fig.savefig("compare.png", bbox_inches='tight', dpi=300)
    plt.close()
Example #20

def kde_statsmodels_m_pdf_output(x, x_grid, bandwidth=0.2, **kwargs):
    """Multivariate Kernel Density Estimation with Statsmodels"""
    #kde = KDEMultivariate(x, bw=bandwidth * np.ones_like(x),
    #                      var_type='c', **kwargs)
    #! bw = "cv_ml", "cv_ls", "normal_reference", np.array([0.23])
    kde = None
    while kde is None:
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore')
            try:
                kde = KDEMultivariate(data=x, var_type='c', bw="cv_ml")
                x_grid_sorted = sorted(x_grid)
                pdf = kde.pdf(x_grid_sorted)
            except Warning as e:
                print('error found:', e)
            warnings.filterwarnings('default')

    return pdf, kde.bw
Example #21

def post_point_stationary_pdf(
        N0=0,
        N=3,
        vmax=1,
        resolution=0.05,
        x=np.array([0, 0, 0])  # position
):
    """
    Return pdf p(v, x)
    """

    data = np.genfromtxt("pdf/VX-{0:04d}.csv".format(N0), delimiter=' ')
    for n in np.arange(N0 + 1, N0 + N):
        data = np.concatenate(
            (data, np.genfromtxt("pdf/VX-{0:04d}.csv".format(n),
                                 delimiter=' ')),
            axis=0)

    print "Number of particles {0}".format(data.shape[0])

    kde = KDEMultivariate(data=data[:, np.array([0, 1, 2, 3, 4])],
                          bw='normal_reference',
                          var_type='ccccc')

    vx, vy = np.mgrid[-vmax:vmax:resolution, -vmax:vmax:resolution]
    dA = resolution**2
    nx = vx.shape[0]
    ny = vx.shape[1]
    pdf = np.zeros((nx, ny))
    # Need to calculate integral \int p(vx, vy, x, y, z) dvx dvy
    area = 0
    for i in range(nx):
        for j in range(ny):
            v = np.array([vx[i, j], vy[i, j]])
            pdf[i, j] = kde.pdf(np.concatenate((x, v), axis=0))
            area += pdf[i, j] * dA

    save_contour_plot(vx,
                      vy,
                      pdf / area,
                      filename="pdfpoint-vxvy.png",
                      title="Point $f^{(1)}(v^{(1)})$",
                      xlabel="Streamwise velocity",
                      ylabel="Spanwise velocity")
Example #22
def kde_statsmodels_m(data, grid, **kwargs):
    """
    Multivariate Kernel Density Estimation with Statsmodels

    Parameters
    ----------
    data : numpy.array
        Data points used to compute a density estimator. It
        has `n x p` dimensions, representing n points and p
        variables.
    grid : numpy.array
        Data points at which the density will be estimated. It
        has `m x p` dimensions, representing m points and p
        variables.

    Returns
    -------
    out : numpy.array
        Density estimate. Has `m x 1` dimensions
    """
    kde = KDEMultivariate(data, **kwargs)
    return kde.pdf(grid)
Example #24
    def _calculate(self, tracks):
        """
        Calculate a histogram of TC genesis counts given a set of tracks.

        :param tracks: Collection of :class:`Track` objects.
        """
        log.debug("Calculating PDF for set of {0:d} tracks".format(
            len(tracks)))

        hist = ma.zeros((len(self.lon_range) - 1, len(self.lat_range) - 1))

        xy = np.vstack([self.X.ravel(), self.Y.ravel()])

        x = []
        y = []

        for track in tracks:
            if len(track.Longitude) == 0:
                pass
            elif len(track.Longitude) == 1:
                x.append(track.Longitude)
                y.append(track.Latitude)
            else:
                x.append(track.Longitude[0])
                y.append(track.Latitude[0])

        xx = np.array(x)
        yy = np.array(y)
        ii = np.where((xx >= self.gridLimit['xMin'])
                      & (xx <= self.gridLimit['xMax'])
                      & (yy >= self.gridLimit['yMin'])
                      & (yy <= self.gridLimit['yMax']))

        values = np.vstack([xx[ii], yy[ii]])
        kernel = KDEMultivariate(values, bw='cv_ml', var_type='cc')
        pdf = kernel.pdf(data_predict=xy)
        Z = np.reshape(pdf, self.X.shape)
        return Z.T
Example #25
class CausalEffect(object):
    def __init__(self, X, causes, effects, admissable_set=[], variable_types=None, expectation=False, density=True):
        """
        We want to calculate the causal effect of X and Y through
        back-door adjustment, P(Y|do(X)) = Sum( P(Y|X,Z)P(Z), Z) 
        for some admissable set of control variables, Z.  First we 
        calculate the conditional density P(Y|X,Z), then the density
        P(Z).  We find the support of Z so we can properly sum over
        it later.  variable_types are a dictionary with the column name
        pointing to an element of set(['o', 'u', 'c']), for 'ordered',
        'unordered discrete', or 'continuous'.
        """
        conditional_density_vars = causes + admissable_set
        self.causes = causes
        self.effects = effects
        self.admissable_set = admissable_set
        self.conditional_density_vars = conditional_density_vars
        
        if variable_types:
            self.variable_types = variable_types
            dep_type      = [variable_types[var] for var in effects]
            indep_type    = [variable_types[var] for var in conditional_density_vars]
            density_types = [variable_types[var] for var in admissable_set]
        else:
            self.variable_types = self.__infer_variable_types(X)

        if 'c' not in variable_types.values():
            bw = 'cv_ml'
        else:
            bw = 'normal_reference'


        if admissable_set:            
            self.density = KDEMultivariate(X[admissable_set], 
                                  var_type=''.join(density_types),
                                  bw=bw)
        
        self.conditional_density = KDEMultivariateConditional(endog=X[effects],
                                                         exog=X[conditional_density_vars],
                                                         dep_type=''.join(dep_type),
                                                         indep_type=''.join(indep_type),
                                                         bw=bw)
        if expectation:
            self.conditional_expectation = KernelReg(X[effects].values,
                                                 X[conditional_density_vars].values,
                                                 ''.join(indep_type),
                                                 bw='cv_ls')

        self.support = self.__get_support(X)
        
        self.discrete_variables = [ variable for variable, var_type in self.variable_types.items() if var_type in ['o', 'u']]
        self.discrete_Z = list(set(self.discrete_variables).intersection(set(admissable_set)))
        self.continuous_variables = [ variable for variable, var_type in self.variable_types.items() if var_type == 'c' ]
        self.continuous_Z = list(set(self.continuous_variables).intersection(set(admissable_set)))
       
 
    def __infer_variable_types(self,X):
        """
        fill this in later.
        """
        pass
       
 
    def __get_support(self, X):
        """
        find the smallest cube around which the densities are supported,
        allowing a little flexibility for variables with larger bandwidths.
        """
        data_support = { variable : (X[variable].min(), X[variable].max()) for variable in X.columns}
        variable_bandwidths = { variable : bw for variable, bw in zip(self.effects + self.conditional_density_vars, self.conditional_density.bw)}
        support = {}
        for variable in self.effects + self.conditional_density_vars:
            if self.variable_types[variable] == 'c':
                lower_support = data_support[variable][0] - 10. * variable_bandwidths[variable]
                upper_support = data_support[variable][1] + 10. * variable_bandwidths[variable]
                support[variable] = (lower_support, upper_support)
            else:
                support[variable] = data_support[variable]
        return support

        
    def integration_function(self,*args):
        # takes continuous z, discrete z, then x
        data = pd.DataFrame({ k : [v] for k, v in zip(self.continuous_Z + self.discrete_Z + self.causes + self.effects, args)})
        conditional = self.conditional_density.pdf(exog_predict=data[self.conditional_density_vars].values[0], 
                                                   endog_predict=data[self.effects].values[0]) 
        density = self.density.pdf(data_predict=data[self.admissable_set])
        return conditional * density

    
    def expectation_integration_function(self, *args):
        data = pd.DataFrame({ k : [v] for k, v in zip(self.continuous_Z + self.discrete_Z + self.causes, args)})
        conditional = self.conditional_expectation.fit(data_predict=data[self.conditional_density_vars].values)[0]
        density = self.density.pdf(data_predict=data[self.admissable_set])
        return conditional * density

    
    def pdf(self, x):
        """
        Currently, this does the whole sum/integral over the cube support of Z.
        We may be able to improve this by taking into account how the joint
        and conditionals factorize, and/or finding a more efficient support.
        
        This should be reasonably fast for |Z| <= 2 or 3, and small enough discrete
        variable cardinalities.  It runs in O(n_1 n_2 ... n_k) in the cardinality of
        the discrete variables, |Z_1| = n_1, etc.  It likewise runs in O(V^n) for n
        continuous Z variables.  Factorizing the joint/conditional distributions in
        the sum could linearize the runtime.
        """
        causal_effect = 0.
        x = x[self.causes + self.effects]
        if self.discrete_Z:
            discrete_variable_ranges = [ range(*(int(self.support[variable][0]), int(self.support[variable][1])+1)) for variable in self.discrete_Z]
            for z_vals in itertools.product(*discrete_variable_ranges):
                z_discrete = pd.DataFrame({k : [v] for k, v in zip(self.discrete_Z, z_vals)})
                if self.continuous_Z:
                    continuous_Z_ranges = [self.support[variable] for variable in self.continuous_Z]
                    args = z_discrete.join(x).values[0]
                    causal_effect += nquad(self.integration_function,continuous_Z_ranges,args=args)[0]
                else:
                    z_discrete = z_discrete[self.admissable_set]
                    exog_predictors = x.join(z_discrete)[self.conditional_density_vars]
                    conditional = self.conditional_density.pdf(exog_predict=exog_predictors, 
                                                               endog_predict=x[self.effects]) 
                    density = self.density.pdf(data_predict=z_discrete)
                    dc = conditional * density
                    causal_effect += dc
            return causal_effect
        elif self.continuous_Z:
            continuous_Z_ranges = [self.support[var] for var in self.continuous_Z]
            causal_effect, error = nquad(self.integration_function,continuous_Z_ranges,args=tuple(x.values[0]))
            return causal_effect
        else:
            return self.conditional_density.pdf(exog_predict=x[self.causes],endog_predict=x[self.effects])

       
 
    def expected_value( self, x):
        """
        Currently, this does the whole sum/integral over the cube support of Z.
        We may be able to improve this by taking into account how the joint
        and conditionals factorize, and/or finding a more efficient support.
        
        This should be reasonably fast for |Z| <= 2 or 3, and small enough discrete
        variable cardinalities.  It runs in O(n_1 n_2 ... n_k) in the cardinality of
        the discrete variables, |Z_1| = n_1, etc.  It likewise runs in O(V^n) for n
        continuous Z variables.  Factorizing the joint/conditional distributions in
        the sum could linearize the runtime.
        """
        causal_effect = 0.
        x = x[self.causes]
        if self.discrete_Z:
            discrete_variable_ranges = [ range(*(int(self.support[variable][0]), int(self.support[variable][1])+1)) for variable in self.discrete_Z]
            for z_vals in itertools.product(*discrete_variable_ranges):
                z_discrete = pd.DataFrame({k : [v] for k, v in zip(self.discrete_Z, z_vals)})
                if self.continuous_Z:
                    continuous_Z_ranges = [self.support[variable] for variable in self.continuous_Z]
                    args = z_discrete.join(x).values[0]
                    causal_effect += nquad(self.expectation_integration_function,continuous_Z_ranges,args=args)[0]
                else:
                    z_discrete = z_discrete[self.admissable_set]
                    exog_predictors = x.join(z_discrete)[self.conditional_density_vars]
                    causal_effect += self.conditional_expectation.fit(data_predict=exog_predictors.values)[0] * self.density.pdf(data_predict=z_discrete.values)
            return causal_effect
        elif self.continuous_Z:
            continuous_Z_ranges = [self.support[var] for var in self.continuous_Z]
            causal_effect, error = nquad(self.expectation_integration_function,continuous_Z_ranges,args=tuple(x.values[0]))
            return causal_effect
        else:
            return self.conditional_expectation.fit(data_predict=x[self.causes])[0]
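
A sketch of how the class above might be used, assuming its module-level imports (pandas, numpy, scipy.integrate.nquad and the statsmodels estimators) are in place; the data and variable names are illustrative, not from the source project.

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
z = rng.normal(size=500)
x = z + rng.normal(size=500)             # z confounds both x and y
y = x + z + rng.normal(size=500)
X = pd.DataFrame({'x': x, 'y': y, 'z': z})

effect = CausalEffect(X, causes=['x'], effects=['y'],
                      admissable_set=['z'],
                      variable_types={'x': 'c', 'y': 'c', 'z': 'c'})
# back-door adjusted density P(y | do(x)) evaluated at a single point
print(effect.pdf(pd.DataFrame({'x': [0.0], 'y': [0.0]})))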
Example #26
def compute_mutual_information(x,
                               y,
                               z=None,
                               n_grid=25,
                               var_types=None,
                               bandwidth_scaling=None,
                               bandwidths=None,
                               rless=False):
    """
    :param x: array-like, (n_samples,)
    :param y: array-like, (n_samples,)
    :param z: array-like, (n_samples,), optional, variable on which to condition
    :param n_grid: int, number of grid points at which to evaluate kernel density
    :param var_types: string of 'c' (continuous), 'u' (unordered discrete) or 'o' (ordered discrete), one character per variable
    :param bandwidth_scaling: float
    :return: float, mutual information (conditional mutual information if z is given)
    """
    n = len(x)
    variables = [np.array(x, dtype=float), np.array(y, dtype=float)]
    if z is not None:
        variables.append(np.array(z, dtype=float))
    for v in variables[1:]:
        if len(v) != n:
            raise ValueError("Input arrays have different lengths")
    n_vars = len(variables)
    if var_types is None:
        var_types = ''.join(['c' for _ in range(n_vars)])
        # Todo: guess variable types
    if len(var_types) != n_vars:
        raise ValueError(
            "Number of specified variable types does not match number of variables"
        )
    #print([len(v) for v in variables])
    variables = keep_nonnan_overlap(variables)
    #print([len(v) for v in variables])
    n_overlap = len(variables[0])
    if n_overlap < 2:
        return 0
    variables = add_jitter(variables)
    grids = [np.linspace(v.min(), v.max(), n_grid) for v in variables]
    mesh_grids = np.meshgrid(*grids)
    grid_shape = tuple([n_grid] * n_vars)
    grid = np.vstack([mesh_grid.flatten() for mesh_grid in mesh_grids])
    delta = compute_unspecified_bandwidths(variables, bandwidths, var_types)
    if bandwidth_scaling is not None:
        delta *= bandwidth_scaling
    kde = KDEMultivariate(variables, bw=delta, var_type=var_types)
    p_joint = kde.pdf(grid).reshape(grid_shape) + np.finfo(
        float).eps  # THIS IS THE HOT SPOT. Get faster method
    ds = [grid[1] - grid[0] for grid in grids]
    ds_prod = np.prod(ds)
    p_joint /= (p_joint.sum() * ds_prod)
    h_joint = -np.sum(p_joint * np.log(p_joint)) * ds_prod
    dx = ds[0]
    dy = ds[1]
    if z is None:
        dx = ds[0]
        dy = ds[1]
        px = p_joint.sum(axis=1) * dy
        py = p_joint.sum(axis=0) * dx
        hx = -np.sum(px * np.log(px)) * dx
        hy = -np.sum(py * np.log(py)) * dy
        mi = hx + hy - h_joint
        return mi
    else:
        dz = ds[2]
        pxz = p_joint.sum(axis=1) * dy
        pyz = p_joint.sum(axis=0) * dx
        pz = p_joint.sum(axis=(0, 1)) * dx * dy
        hxz = -np.sum(pxz * np.log(pxz)) * dx * dz
        hyz = -np.sum(pyz * np.log(pyz)) * dy * dz
        hz = -np.sum(pz * np.log(pz)) * dz
        cmi = hxz + hyz - h_joint - hz
        return cmi
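
For a rough check (assuming the helper functions the snippet references, keep_nonnan_overlap, add_jitter and compute_unspecified_bandwidths, are available): for bivariate Gaussians the mutual information is -0.5*log(1 - rho**2) nats.

import numpy as np

rng = np.random.RandomState(0)
x = rng.normal(size=300)
y = 0.8 * x + 0.6 * rng.normal(size=300)  # correlation close to 0.8
mi = compute_mutual_information(x, y)
print(mi, -0.5 * np.log(1 - 0.8 ** 2))    # estimate vs. analytic value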
Example #27
def hdrboxplot(data,
               ncomp=2,
               alpha=None,
               threshold=0.95,
               bw=None,
               xdata=None,
               labels=None,
               ax=None,
               use_brute=False,
               seed=None):
    """
    High Density Region boxplot

    Parameters
    ----------
    data : sequence of ndarrays or 2-D ndarray
        The vectors of functions to create a functional boxplot from.  If a
        sequence of 1-D arrays, these should all be the same size.
        The first axis is the function index, the second axis the one along
        which the function is defined.  So ``data[0, :]`` is the first
        functional curve.
    ncomp : int, optional
        Number of components to use.  If None, uses as many as the smaller
        of the number of rows or columns in data.
    alpha : list of floats between 0 and 1, optional
        Extra quantile values to compute. Default is None
    threshold : float between 0 and 1, optional
        Percentile threshold value for outlier detection. A high value means
        a lower sensitivity to outliers. Default is `0.95`.
    bw : array_like or str, optional
        If an array, it is a fixed user-specified bandwidth. If `None`, set to
        `normal_reference`. If a string, should be one of:

            - normal_reference: normal reference rule of thumb (default)
            - cv_ml: cross validation maximum likelihood
            - cv_ls: cross validation least squares

    xdata : ndarray, optional
        The independent variable for the data. If not given, it is assumed to
        be an array of integers 0..N-1 with N the length of the vectors in
        `data`.
    labels : sequence of scalar or str, optional
        The labels or identifiers of the curves in `data`. If not given,
        outliers are labeled in the plot with array indices.
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    use_brute : bool
        Use the brute force optimizer instead of the default differential
        evolution to find the curves. Default is False.
    seed : {None, int, np.random.RandomState}
        Seed value to pass to scipy.optimize.differential_evolution. Can be an
        integer or RandomState instance. If None, then the default RandomState
        provided by np.random is used.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.
    hdr_res : HdrResults instance
        An `HdrResults` instance with the following attributes:

         - 'median', array. Median curve.
         - 'hdr_50', array. 50% quantile band. [sup, inf] curves
         - 'hdr_90', list of array. 90% quantile band. [sup, inf]
            curves.
         - 'extra_quantiles', list of array. Extra quantile band.
            [sup, inf] curves.
         - 'outliers', ndarray. Outlier curves.

    Notes
    -----
    The median curve is the curve with the highest probability on the reduced
    space of a Principal Component Analysis (PCA).

    Outliers are defined as curves that fall outside the band corresponding
    to the quantile given by `threshold`.

    The non-outlying region is defined as the band made up of all the
    non-outlying curves.

    Behind the scenes, the dataset is represented as a matrix, each line
    corresponding to a 1D curve. This matrix is then decomposed using
    Principal Components Analysis (PCA). This makes it possible to represent
    the data using a finite number of modes, or components, turning the
    functional representation into a scalar representation of the matrix. In
    other words, you can visualize each curve from its components. Each curve
    is thus a point in this reduced space. With 2 components, this is called a
    bivariate plot (2D plot).

    In this plot, if some points are adjacent (similar components), it means
    that back in the original space, the curves are similar. Finding the
    median curve then means finding the highest density region (HDR) in the
    reduced space. Moreover, the farther a curve lies from this HDR, the less
    likely it is to be similar to the other curves.

    Using a kernel smoothing technique, the probability density function (PDF)
    of the multivariate space can be recovered. From this PDF, it is possible
    to compute the density probability linked to the cluster of points and plot
    its contours.

    Finally, using these contours, the different quantiles can be extracted
    along with the median curve and the outliers.

    Steps to produce the HDR boxplot include:

    1. Compute a multivariate kernel density estimation
    2. Compute contour lines for quantiles 90%, 50% and `alpha` %
    3. Plot the bivariate plot
    4. Compute median curve along with quantiles and outliers curves.

    References
    ----------
    [1] R.J. Hyndman and H.L. Shang, "Rainbow Plots, Bagplots, and Boxplots
        for Functional Data", Journal of Computational and Graphical
        Statistics, vol. 19, no. 1, pp. 29-45, 2010.

    Examples
    --------
    Load the El Nino dataset.  Consists of 60 years worth of Pacific Ocean sea
    surface temperature data.

    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.api as sm
    >>> data = sm.datasets.elnino.load(as_pandas=False)

    Create a functional boxplot.  We see that the years 1982-83 and 1997-98 are
    outliers; these are the years where El Nino (a climate pattern
    characterized by warming up of the sea surface and higher air pressures)
    occurred with unusual intensity.

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> res = sm.graphics.hdrboxplot(data.raw_data[:, 1:],
    ...                              labels=data.raw_data[:, 0].astype(int),
    ...                              ax=ax)

    >>> ax.set_xlabel("Month of the year")
    >>> ax.set_ylabel("Sea surface temperature (C)")
    >>> ax.set_xticks(np.arange(13, step=3) - 1)
    >>> ax.set_xticklabels(["", "Mar", "Jun", "Sep", "Dec"])
    >>> ax.set_xlim([-0.2, 11.2])

    >>> plt.show()

    .. plot:: plots/graphics_functional_hdrboxplot.py

    See Also
    --------
    banddepth, rainbowplot, fboxplot
    """
    fig, ax = utils.create_mpl_ax(ax)

    if labels is None:
        # For use with pandas, get the labels
        if hasattr(data, 'index'):
            labels = data.index
        else:
            labels = np.arange(len(data))

    data = np.asarray(data)
    if xdata is None:
        xdata = np.arange(data.shape[1])

    n_samples, dim = data.shape
    # PCA and bivariate plot
    pca = PCA(data, ncomp=ncomp)
    data_r = pca.factors

    # Create gaussian kernel
    ks_gaussian = KDEMultivariate(data_r,
                                  bw=bw,
                                  var_type='c' * data_r.shape[1])

    # Boundaries of the n-variate space
    bounds = np.array([data_r.min(axis=0), data_r.max(axis=0)]).T

    # Compute contour line of pvalue linked to a given probability level
    if alpha is None:
        alpha = [threshold, 0.9, 0.5]
    else:
        alpha.extend([threshold, 0.9, 0.5])
        alpha = list(set(alpha))
    alpha.sort(reverse=True)

    n_quantiles = len(alpha)
    pdf_r = ks_gaussian.pdf(data_r).flatten()
    pvalues = [
        np.percentile(pdf_r, (1 - alpha[i]) * 100, interpolation='linear')
        for i in range(n_quantiles)
    ]

    # Find mean, outliers curves
    if have_de_optim and not use_brute:
        median = differential_evolution(lambda x: -ks_gaussian.pdf(x),
                                        bounds=bounds,
                                        maxiter=5,
                                        seed=seed).x
    else:
        median = brute(lambda x: -ks_gaussian.pdf(x),
                       ranges=bounds,
                       finish=fmin)

    outliers_idx = np.where(pdf_r < pvalues[alpha.index(threshold)])[0]
    labels_outlier = [labels[i] for i in outliers_idx]
    outliers = data[outliers_idx]

    # Find HDR given some quantiles

    def _band_quantiles(band, use_brute=use_brute, seed=seed):
        """
        Find extreme curves for a quantile band.

        From the `band` of quantiles, the associated PDF extrema values
        are computed. If `min_alpha` is not provided (single quantile value),
        `max_pdf` is set to `1E6` in order not to constrain the problem on high
        values.

        An optimization is performed per component in order to find the min and
        max curves. This is done by comparing the PDF value of a given curve
        with the band PDF.

        Parameters
        ----------
        band : array_like
            alpha values ``(max_alpha, min_alpha)`` ex: ``[0.9, 0.5]``
        use_brute : bool
            Use the brute force optimizer instead of the default differential
            evolution to find the curves. Default is False.
        seed : {None, int, np.random.RandomState}
            Seed value to pass to scipy.optimize.differential_evolution. Can
            be an integer or RandomState instance. If None, then the default
            RandomState provided by np.random is used.


        Returns
        -------
        band_quantiles : list of 1-D array
            ``(max_quantile, min_quantile)`` (2, n_features)

        """
        min_pdf = pvalues[alpha.index(band[0])]
        try:
            max_pdf = pvalues[alpha.index(band[1])]
        except IndexError:
            max_pdf = 1E6
        band = [min_pdf, max_pdf]

        pool = Pool()
        data = zip(
            range(dim),
            itertools.repeat(
                (band, pca, bounds, ks_gaussian, seed, use_brute)))
        band_quantiles = pool.map(_min_max_band, data)
        pool.terminate()
        pool.close()

        band_quantiles = list(zip(*band_quantiles))

        return band_quantiles

    extra_alpha = [
        i for i in alpha if 0.5 != i and 0.9 != i and threshold != i
    ]
    if len(extra_alpha) > 0:
        extra_quantiles = []
        for x in extra_alpha:
            for y in _band_quantiles([x], use_brute=use_brute, seed=seed):
                extra_quantiles.append(y)
    else:
        extra_quantiles = []

    # Inverse transform from the n-variate plot to the dataset's shape
    median = _inverse_transform(pca, median)[0]
    hdr_90 = _band_quantiles([0.9, 0.5], use_brute=use_brute, seed=seed)
    hdr_50 = _band_quantiles([0.5], use_brute=use_brute, seed=seed)

    hdr_res = HdrResults({
        "median": median,
        "hdr_50": hdr_50,
        "hdr_90": hdr_90,
        "extra_quantiles": extra_quantiles,
        "outliers": outliers,
        "outliers_idx": outliers_idx
    })

    # Plots
    ax.plot(np.array([xdata] * n_samples).T,
            data.T,
            c='c',
            alpha=.1,
            label=None)
    ax.plot(xdata, median, c='k', label='Median')
    fill_betweens = []
    fill_betweens.append(
        ax.fill_between(xdata,
                        *hdr_50,
                        color='gray',
                        alpha=.4,
                        label='50% HDR'))
    fill_betweens.append(
        ax.fill_between(xdata,
                        *hdr_90,
                        color='gray',
                        alpha=.3,
                        label='90% HDR'))

    if len(extra_quantiles) != 0:
        ax.plot(np.array([xdata] * len(extra_quantiles)).T,
                np.array(extra_quantiles).T,
                c='y',
                ls='-.',
                alpha=.4,
                label='Extra quantiles')

    if len(outliers) != 0:
        for ii, outlier in enumerate(outliers):
            if labels_outlier is None:
                label = 'Outliers'
            else:
                label = str(labels_outlier[ii])
            ax.plot(xdata, outlier, ls='--', alpha=0.7, label=label)

    handles, labels = ax.get_legend_handles_labels()

    # Proxy artist for fill_between legend entry
    # See https://matplotlib.org/1.3.1/users/legend_guide.html
    plt = _import_mpl()
    for label, fill_between in zip(['50% HDR', '90% HDR'], fill_betweens):
        p = plt.Rectangle((0, 0), 1, 1, fc=fill_between.get_facecolor()[0])
        handles.append(p)
        labels.append(label)

    by_label = OrderedDict(zip(labels, handles))
    if len(outliers) != 0:
        by_label.pop('Median')
        by_label.pop('50% HDR')
        by_label.pop('90% HDR')

    ax.legend(by_label.values(), by_label.keys(), loc='best')

    return fig, hdr_res
Example #28
File: __init__.py  Project: MaikH88/dclab
    def GetKDE_Scatter(self, yax="Defo", xax="Area", positions=None):
        """ The evaluated Gaussian Kernel Density Estimate
        
        -> for scatter plots
        
        
        Parameters
        ----------
        xax : str
            Identifier for X axis (e.g. "Area", "Area Ratio","Circ",...)
        yax : str
            Identifier for Y axis
        positions : list of points
            The positions where the KDE will be computed. Note that
            the KDE estimate is computed from the the points that
            are set in `self._filter`.
        
        Returns
        -------
        density : 1d ndarray
            The kernel density evaluated for the filtered data points.
        
        
        See Also
        --------
        `RTDC_DataSet.ApplyFilter`
        `scipy.stats.gaussian_kde`
        `statsmodels.nonparametric.kernel_density.KDEMultivariate`
        
        TODO
        ----
        Do not use positions for the hasher. If the plot is filtered
        with marker size, we might end up computing the same KDE for
        the same points over and over again.
        """
        # Dictionary for KDE
        # kernel density estimator
        # Ask Christoph H. about the kernel density estimator; he has another library
        # which allows for manual setting of the bandwidth parameter
        
        kde_type = self.Configuration["Plotting"]["KDE"].lower()
        
        # make sure the density is used for only this set of variables
        key = yax+"+"+xax+"_"+kde_type
        if kde_type == "multivariate":
            bwx = self.Configuration["Plotting"]["KDE Multivariate "+xax]
            bwy = self.Configuration["Plotting"]["KDE Multivariate "+yax]
            key += "_bw{}+{}_".format(bwx,bwy)
        # make sure the density is only used for the same set of
        # filters.
        if self.Configuration["Filtering"]["Enable Filters"]:
            key += str(self.Configuration["Filtering"]).strip("{}")

        if positions is not None:
            # compute hash of positions
            hasher = hashlib.sha256()
            hasher.update(positions)
            key += hasher.hexdigest()
        
        if key not in self._KDE_Scatter:
            if self.Configuration["Filtering"]["Enable Filters"]:
                x = getattr(self, dfn.cfgmaprev[xax])[self._filter]
                y = getattr(self, dfn.cfgmaprev[yax])[self._filter]
            else:
                x = getattr(self, dfn.cfgmaprev[xax])
                y = getattr(self, dfn.cfgmaprev[yax])
            input_positions = np.vstack([x.ravel(), y.ravel()])
            # Kernel Density estimation
            if kde_type == "gauss":
                a = time.time()
                estimator = gaussian_kde(input_positions)
                if positions is None:
                    positions = input_positions
                density = estimator(positions)
                print("gaussian estimation scatter time: ", time.time()-a)
            elif kde_type == "multivariate":
                a = time.time()
                estimator_ly = KDEMultivariate(data=[x,y],var_type='cc',
                                               bw=[bwx, bwy])
                if positions is None:
                    positions = input_positions
                density = estimator_ly.pdf(positions)
                print("multivariate estimation scatter time: ", time.time()-a)
                
            elif kde_type=="gaussmix":
                if yax=="Defo":
                    xy = np.array([x,np.log(y)]).T
                else:
                    xy = np.array([x,y]).T
                a = time.time()
                clf = mixture.GMM(n_components=np.ceil(bwx), covariance_type='full',
                                  random_state=None, thresh=None, min_covar=0.001,
                                  n_iter=100, n_init=2,
                                  params='wmc', init_params='wmc')
                clf.fit(xy)
                density = np.exp(clf.score_samples(xy)[0])
                print("gaussian mixture scatter time: ", time.time()-a)

            else:
                raise ValueError("Unknown KDE estimator {}".format(
                                                              kde_type))
            self._KDE_Scatter[key] = density
Example #29
File: __init__.py  Project: MaikH88/dclab
    def GetKDE_Contour(self, yax="Defo", xax="Area"):
        """ The evaluated Gaussian Kernel Density Estimate
        
        -> for contours
        
        
        Parameters
        ----------
        xax : str
            Identifier for X axis (e.g. "Area", "Area Ratio","Circ",...)
        yax : str
            Identifier for Y axis
        
        
        Returns
        -------
        X, Y, Z : coordinates
            The kernel density Z evaluated on a rectangular grid (X,Y).
        
        See Also
        --------
        `scipy.stats.gaussian_kde`
        `statsmodels.nonparametric.kernel_density.KDEMultivariate`
        """
        if xax is None or yax is None:
            xax, yax = self.GetPlotAxes()
            
        kde_type = self.Configuration["Plotting"]["KDE"].lower()
        # dummy area-circ
        deltaarea = self.Configuration["Plotting"]["Contour Accuracy "+xax]
        deltacirc = self.Configuration["Plotting"]["Contour Accuracy "+yax]

        # kernel density estimator
        # Ask Christoph H. about the kernel density estimator; he has another library
        # which allows for manual setting of the bandwidth parameter
        key = yax+"+"+xax+"_"+kde_type+str(deltaarea)+str(deltacirc)
        
        if kde_type == "multivariate":
            bwx = self.Configuration["Plotting"]["KDE Multivariate "+xax]
            bwy = self.Configuration["Plotting"]["KDE Multivariate "+yax]
            key += "_bw{}+{}_".format(bwx,bwy)

        # make sure the density is only used for the same set of
        # filters.
        if self.Configuration["Filtering"]["Enable Filters"]:
            key += str(self.Configuration["Filtering"]).strip("{}")

        if key not in self._KDE_Contour:
            # setup
            if self.Configuration["Filtering"]["Enable Filters"]:
                x = getattr(self, dfn.cfgmaprev[xax])[self._filter]
                y = getattr(self, dfn.cfgmaprev[yax])[self._filter]
            else:
                x = getattr(self, dfn.cfgmaprev[xax])
                y = getattr(self, dfn.cfgmaprev[yax])
            # evaluation
            xlin = np.arange(x.min(), x.max(), deltaarea)
            ylin = np.arange(y.min(), y.max(), deltacirc)
            Xmesh,Ymesh = np.meshgrid(xlin,ylin)
            X = Xmesh.ravel()
            Y = Ymesh.ravel()
            if kde_type == "gauss":
                estimator = gaussian_kde([x,y])
                Z = estimator.evaluate([X,Y]).reshape(len(ylin),len(xlin))
            elif kde_type == "multivariate":
                estimator_ly = KDEMultivariate(data=[x,y],var_type='cc',
                                               bw=[bwx, bwy])
                Z = estimator_ly.pdf([X,Y]).reshape(len(ylin),len(xlin))
            elif kde_type=="gaussmix":
                if yax=="Defo":
                    xy = np.array([x,np.log(y)]).T
                    XY = np.array([X,np.log(Y)]).T
                else:
                    XY = np.array([X,Y]).T
                    xy = np.array([x,y]).T
                clf = mixture.GMM(n_components=np.ceil(bwx), covariance_type='full', \
                random_state=None, thresh=None, min_covar=0.001, n_iter=100, n_init=2, \
                params='wmc', init_params='wmc')
                clf.fit(xy)
                Z = np.exp(clf.score_samples(XY)[0]).reshape(len(ylin),len(xlin))

            else:
                raise ValueError("Unknown KDE estimator {}".format(
                                                              kde_type))                
            self._KDE_Contour[key] = (Xmesh,Ymesh,Z)
        return self._KDE_Contour[key]
Example #30
def hdrboxplot(data, ncomp=2, alpha=None, threshold=0.95, bw=None,
               xdata=None, labels=None, ax=None):
    """
    High Density Region boxplot

    Parameters
    ----------
    data : sequence of ndarrays or 2-D ndarray
        The vectors of functions to create a functional boxplot from.  If a
        sequence of 1-D arrays, these should all be the same size.
        The first axis is the function index, the second axis the one along
        which the function is defined.  So ``data[0, :]`` is the first
        functional curve.
    ncomp : int, optional
        Number of components to use.  If None, uses as many as the smaller
        of the number of rows or columns in data.
    alpha : list of floats between 0 and 1, optional
        Extra quantile values to compute. Default is None
    threshold : float between 0 and 1, optional
        Percentile threshold value for outlier detection. A high value means
        a lower sensitivity to outliers. Default is `0.95`.
    bw : array_like or str, optional
        If an array, it is a fixed user-specified bandwidth. If `None`, set to
        `normal_reference`. If a string, should be one of:

            - normal_reference: normal reference rule of thumb (default)
            - cv_ml: cross validation maximum likelihood
            - cv_ls: cross validation least squares

    xdata : ndarray, optional
        The independent variable for the data. If not given, it is assumed to
        be an array of integers 0..N-1 with N the length of the vectors in
        `data`.
    labels : sequence of scalar or str, optional
        The labels or identifiers of the curves in `data`. If not given,
        outliers are labeled in the plot with array indices.
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.
    hdr_res : HdrResults instance
        An `HdrResults` instance with the following attributes:

         - 'median', array. Median curve.
         - 'hdr_50', array. 50% quantile band. [sup, inf] curves
         - 'hdr_90', list of array. 90% quantile band. [sup, inf]
            curves.
         - 'extra_quantiles', list of array. Extra quantile band.
            [sup, inf] curves.
         - 'outliers', ndarray. Outlier curves.

    Notes
    -----
    The median curve is the curve with the highest probability on the reduced
    space of a Principal Component Analysis (PCA).

    Outliers are defined as curves that fall outside the band corresponding
    to the quantile given by `threshold`.

    The non-outlying region is defined as the band made up of all the
    non-outlying curves.

    Behind the scenes, the dataset is represented as a matrix, each row
    corresponding to a 1D curve. This matrix is then decomposed using
    Principal Components Analysis (PCA), which represents the data using a
    finite number of modes, or components. This compression turns the
    functional representation into a scalar representation of the matrix; in
    other words, each curve can be visualized from its components. Each curve
    is thus a point in this reduced space. With 2 components, this is called a
    bivariate plot (2D plot).

    In this plot, if some points are adjacent (similar components), it means
    that back in the original space the corresponding curves are similar.
    Finding the median curve then amounts to finding the highest density
    region (HDR) in the reduced space; the further a point lies from this
    HDR, the less likely its curve is to be similar to the other curves.

    Using a kernel smoothing technique, the probability density function (PDF)
    of the multivariate space can be recovered. From this PDF, it is possible
    to compute the probability density associated with the cluster of points
    and plot its contours.

    Finally, using these contours, the different quantiles can be extracted
    along with the median curve and the outliers.

    Steps to produce the HDR boxplot include:

    1. Compute a multivariate kernel density estimation
    2. Compute contour lines for quantiles 90%, 50% and `alpha` %
    3. Plot the bivariate plot
    4. Compute the median curve along with the quantile and outlier curves.

    References
    ----------
    [1] R.J. Hyndman and H.L. Shang, "Rainbow Plots, Bagplots, and Boxplots
        for Functional Data", Journal of Computational and Graphical
        Statistics, vol. 19, no. 1, pp. 29-45, 2010.

    Examples
    --------
    Load the El Nino dataset.  Consists of 60 years worth of Pacific Ocean sea
    surface temperature data.

    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.api as sm
    >>> data = sm.datasets.elnino.load(as_pandas=False)

    Create a functional boxplot.  We see that the years 1982-83 and 1997-98 are
    outliers; these are the years where El Nino (a climate pattern
    characterized by warming of the sea surface and higher air pressures)
    occurred with unusual intensity.

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> res = sm.graphics.hdrboxplot(data.raw_data[:, 1:],
    ...                              labels=data.raw_data[:, 0].astype(int),
    ...                              ax=ax)

    >>> ax.set_xlabel("Month of the year")
    >>> ax.set_ylabel("Sea surface temperature (C)")
    >>> ax.set_xticks(np.arange(13, step=3) - 1)
    >>> ax.set_xticklabels(["", "Mar", "Jun", "Sep", "Dec"])
    >>> ax.set_xlim([-0.2, 11.2])

    >>> plt.show()

    .. plot:: plots/graphics_functional_hdrboxplot.py

    See Also
    --------
    banddepth, rainbowplot, fboxplot
    """
    fig, ax = utils.create_mpl_ax(ax)

    if labels is None:
        # For use with pandas, get the labels
        if hasattr(data, 'index'):
            labels = data.index
        else:
            labels = np.arange(len(data))

    data = np.asarray(data)
    if xdata is None:
        xdata = np.arange(data.shape[1])

    n_samples, dim = data.shape
    # PCA and bivariate plot
    pca = PCA(data, ncomp=ncomp)
    data_r = pca.factors

    # Create gaussian kernel
    ks_gaussian = KDEMultivariate(data_r, bw=bw,
                                  var_type='c' * data_r.shape[1])

    # Boundaries of the n-variate space
    bounds = np.array([data_r.min(axis=0), data_r.max(axis=0)]).T

    # Compute contour line of pvalue linked to a given probability level
    if alpha is None:
        alpha = [threshold, 0.9, 0.5]
    else:
        alpha.extend([threshold, 0.9, 0.5])
        alpha = list(set(alpha))
    alpha.sort(reverse=True)

    n_quantiles = len(alpha)
    pdf_r = ks_gaussian.pdf(data_r).flatten()
    pvalues = [np.percentile(pdf_r, (1 - alpha[i]) * 100,
                             interpolation='linear')
               for i in range(n_quantiles)]

    # Find median and outlier curves
    if have_de_optim:
        median = differential_evolution(lambda x: - ks_gaussian.pdf(x),
                                        bounds=bounds, maxiter=5).x
    else:
        median = brute(lambda x: - ks_gaussian.pdf(x),
                       ranges=bounds, finish=fmin)

    outliers_idx = np.where(pdf_r < pvalues[alpha.index(threshold)])[0]
    labels_outlier = [labels[i] for i in outliers_idx]
    outliers = data[outliers_idx]

    # Find HDR given some quantiles

    def _band_quantiles(band):
        """Find extreme curves for a quantile band.

        From the `band` of quantiles, the associated PDF extrema values
        are computed. If `min_alpha` is not provided (single quantile value),
        `max_pdf` is set to `1E6` in order not to constrain the problem on high
        values.

        An optimization is performed per component in order to find the min and
        max curves. This is done by comparing the PDF value of a given curve
        with the band PDF.

        Parameters
        ----------
        band : array_like
            alpha values ``(max_alpha, min_alpha)`` ex: ``[0.9, 0.5]``

        Returns
        -------
        band_quantiles : list of 1-D array
            ``(max_quantile, min_quantile)`` (2, n_features)

        """
        min_pdf = pvalues[alpha.index(band[0])]
        try:
            max_pdf = pvalues[alpha.index(band[1])]
        except IndexError:
            max_pdf = 1E6
        band = [min_pdf, max_pdf]

        pool = Pool()
        data = zip(range(dim), itertools.repeat((band, pca,
                                                 bounds, ks_gaussian)))
        band_quantiles = pool.map(_min_max_band, data)
        pool.terminate()
        pool.close()

        band_quantiles = list(zip(*band_quantiles))

        return band_quantiles

    extra_alpha = [i for i in alpha
                   if 0.5 != i and 0.9 != i and threshold != i]
    if extra_alpha != []:
        extra_quantiles = [y for x in extra_alpha
                           for y in _band_quantiles([x])]
    else:
        extra_quantiles = []

    # Inverse transform from the n-variate plot back to the dataset's shape
    median = _inverse_transform(pca, median)[0]
    hdr_90 = _band_quantiles([0.9, 0.5])
    hdr_50 = _band_quantiles([0.5])

    hdr_res = HdrResults({
                            "median": median,
                            "hdr_50": hdr_50,
                            "hdr_90": hdr_90,
                            "extra_quantiles": extra_quantiles,
                            "outliers": outliers,
                            "outliers_idx": outliers_idx
                         })

    # Plots
    ax.plot(np.array([xdata] * n_samples).T, data.T,
            c='c', alpha=.1, label=None)
    ax.plot(xdata, median, c='k', label='Median')
    fill_betweens = []
    fill_betweens.append(ax.fill_between(xdata, *hdr_50, color='gray',
                                         alpha=.4,  label='50% HDR'))
    fill_betweens.append(ax.fill_between(xdata, *hdr_90, color='gray',
                                         alpha=.3, label='90% HDR'))

    if len(extra_quantiles) != 0:
        ax.plot(np.array([xdata] * len(extra_quantiles)).T,
                np.array(extra_quantiles).T,
                c='y', ls='-.', alpha=.4, label='Extra quantiles')

    if len(outliers) != 0:
        for ii, outlier in enumerate(outliers):
            label = str(labels_outlier[ii]) if labels_outlier is not None else 'Outliers'
            ax.plot(xdata, outlier,
                    ls='--', alpha=0.7, label=label)

    handles, labels = ax.get_legend_handles_labels()

    # Proxy artist for fill_between legend entry
    # See http://matplotlib.org/1.3.1/users/legend_guide.html
    plt = _import_mpl()
    for label, fill_between in zip(['50% HDR', '90% HDR'], fill_betweens):
        p = plt.Rectangle((0, 0), 1, 1,
                          fc=fill_between.get_facecolor()[0])
        handles.append(p)
        labels.append(label)

    by_label = OrderedDict(zip(labels, handles))
    if len(outliers) != 0:
        by_label.pop('Median')
        by_label.pop('50% HDR')
        by_label.pop('90% HDR')

    ax.legend(by_label.values(), by_label.keys(), loc='best')

    return fig, hdr_res
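The density-quantile trick at the heart of hdrboxplot can be shown standalone: the PDF threshold for an alpha-level HDR is the (1 - alpha) percentile of the KDE evaluated at the sample points themselves. A minimal sketch with a synthetic 2-D sample (not part of statsmodels):

import numpy as np
from statsmodels.nonparametric.kernel_density import KDEMultivariate

rng = np.random.default_rng(0)
sample = rng.normal(size=(200, 2))

kde = KDEMultivariate(sample, var_type='cc', bw='normal_reference')
pdf_vals = kde.pdf(sample)

# points whose density falls below this threshold lie outside the 90% HDR
threshold_90 = np.percentile(pdf_vals, (1 - 0.9) * 100)
outliers = sample[pdf_vals < threshold_90]
print(threshold_90, outliers.shape)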
Example #31
class CausalEffect(object):
    def __init__(self,
                 X,
                 causes,
                 effects,
                 admissable_set=[],
                 variable_types=None,
                 expectation=False,
                 density=True):
        """
        We want to calculate the causal effect of X and Y through
        back-door adjustment, P(Y|do(X)) = Sum( P(Y|X,Z)P(Z), Z) 
        for some admissable set of control variables, Z.  First we 
        calculate the conditional density P(Y|X,Z), then the density
        P(Z).  We find the support of Z so we can properly sum over
        it later.  variable_types are a dictionary with the column name
        pointing to an element of set(['o', 'u', 'c']), for 'ordered',
        'unordered discrete', or 'continuous'.
        """
        conditional_density_vars = causes + admissable_set
        self.causes = causes
        self.effects = effects
        self.admissable_set = admissable_set
        self.conditional_density_vars = conditional_density_vars

        if variable_types:
            self.variable_types = variable_types
            dep_type = [variable_types[var] for var in effects]
            indep_type = [
                variable_types[var] for var in conditional_density_vars
            ]
            density_types = [variable_types[var] for var in admissable_set]
        else:
            # NOTE: __infer_variable_types is an unimplemented stub, so
            # variable_types is effectively a required argument.
            self.variable_types = self.__infer_variable_types(X)

        if 'c' not in self.variable_types.values():
            bw = 'cv_ml'
        else:
            bw = 'normal_reference'

        if admissable_set:
            self.density = KDEMultivariate(X[admissable_set],
                                           var_type=''.join(density_types),
                                           bw=bw)

        self.conditional_density = KDEMultivariateConditional(
            endog=X[effects],
            exog=X[conditional_density_vars],
            dep_type=''.join(dep_type),
            indep_type=''.join(indep_type),
            bw=bw)
        if expectation:
            self.conditional_expectation = KernelReg(
                X[effects].values,
                X[conditional_density_vars].values,
                ''.join(indep_type),
                bw='cv_ls')

        self.support = self.__get_support(X)

        self.discrete_variables = [
            variable for variable, var_type in self.variable_types.items()
            if var_type in ['o', 'u']
        ]
        self.discrete_Z = list(
            set(self.discrete_variables).intersection(set(admissable_set)))
        self.continuous_variables = [
            variable for variable, var_type in self.variable_types.items()
            if var_type == 'c'
        ]
        self.continuous_Z = list(
            set(self.continuous_variables).intersection(set(admissable_set)))

    def __infer_variable_types(self, X):
        """
        fill this in later.
        """
        pass

    def __get_support(self, X):
        """
        find the smallest cube around which the densities are supported,
        allowing a little flexibility for variables with larger bandwidths.
        """
        data_support = {
            variable: (X[variable].min(), X[variable].max())
            for variable in X.columns
        }
        variable_bandwidths = {
            variable: bw
            for variable, bw in zip(
                self.effects +
                self.conditional_density_vars, self.conditional_density.bw)
        }
        support = {}
        for variable in self.effects + self.conditional_density_vars:
            if self.variable_types[variable] == 'c':
                lower_support = data_support[variable][
                    0] - 10. * variable_bandwidths[variable]
                upper_support = data_support[variable][
                    1] + 10. * variable_bandwidths[variable]
                support[variable] = (lower_support, upper_support)
            else:
                support[variable] = data_support[variable]
        return support

    def integration_function(self, *args):
        # takes continuous z, discrete z, then x
        data = pd.DataFrame({
            k: [v]
            for k, v in zip(
                self.continuous_Z + self.discrete_Z + self.causes +
                self.effects, args)
        })
        conditional = self.conditional_density.pdf(
            exog_predict=data[self.conditional_density_vars].values[0],
            endog_predict=data[self.effects].values[0])
        density = self.density.pdf(data_predict=data[self.admissable_set])
        return conditional * density

    def expectation_integration_function(self, *args):
        data = pd.DataFrame({
            k: [v]
            for k, v in zip(self.continuous_Z + self.discrete_Z +
                            self.causes, args)
        })
        conditional = self.conditional_expectation.fit(
            data_predict=data[self.conditional_density_vars].values)[0]
        density = self.density.pdf(data_predict=data[self.admissable_set])
        return conditional * density

    def pdf(self, x):
        """
        Currently, this does the whole sum/integral over the cube support of Z.
        We may be able to improve this by taking into account how the joint
        and conditionals factorize, and/or finding a more efficient support.
        
        This should be reasonably fast for |Z| <= 2 or 3, and small enough discrete
        variable cardinalities.  It runs in O(n_1 n_2 ... n_k) in the cardinality of
        the discrete variables, |Z_1| = n_1, etc.  It likewise runs in O(V^n) for n
        continuous Z variables.  Factorizing the joint/conditional distributions in
        the sum could linearize the runtime.
        """
        causal_effect = 0.
        x = x[self.causes + self.effects]
        if self.discrete_Z:
            discrete_variable_ranges = [
                range(*(int(self.support[variable][0]),
                        int(self.support[variable][1]) + 1))
                for variable in self.discrete_Z
            ]
            for z_vals in itertools.product(*discrete_variable_ranges):
                z_discrete = pd.DataFrame(
                    {k: [v]
                     for k, v in zip(self.discrete_Z, z_vals)})
                if self.continuous_Z:
                    continuous_Z_ranges = [
                        self.support[variable]
                        for variable in self.continuous_Z
                    ]
                    args = z_discrete.join(x).values[0]
                    causal_effect += nquad(self.integration_function,
                                           continuous_Z_ranges,
                                           args=args)[0]
                else:
                    z_discrete = z_discrete[self.admissable_set]
                    exog_predictors = x.join(z_discrete)[
                        self.conditional_density_vars]
                    conditional = self.conditional_density.pdf(
                        exog_predict=exog_predictors,
                        endog_predict=x[self.effects])
                    density = self.density.pdf(data_predict=z_discrete)
                    dc = conditional * density
                    causal_effect += dc
            return causal_effect
        elif self.continuous_Z:
            continuous_Z_ranges = [
                self.support[var] for var in self.continuous_Z
            ]
            causal_effect, error = nquad(self.integration_function,
                                         continuous_Z_ranges,
                                         args=tuple(x.values[0]))
            return causal_effect
        else:
            return self.conditional_density.pdf(exog_predict=x[self.causes],
                                                endog_predict=x[self.effects])

    def expected_value(self, x):
        """
        Currently, this does the whole sum/integral over the cube support of Z.
        We may be able to improve this by taking into account how the joint
        and conditionals factorize, and/or finding a more efficient support.
        
        This should be reasonably fast for |Z| <= 2 or 3, and small enough discrete
        variable cardinalities.  It runs in O(n_1 n_2 ... n_k) in the cardinality of
        the discrete variables, |Z_1| = n_1, etc.  It likewise runs in O(V^n) for n
        continuous Z variables.  Factorizing the joint/conditional distributions in
        the sum could linearize the runtime.
        """
        causal_effect = 0.
        x = x[self.causes]
        if self.discrete_Z:
            discrete_variable_ranges = [
                range(*(int(self.support[variable][0]),
                        int(self.support[variable][1]) + 1))
                for variable in self.discrete_Z
            ]
            for z_vals in itertools.product(*discrete_variable_ranges):
                z_discrete = pd.DataFrame(
                    {k: [v]
                     for k, v in zip(self.discrete_Z, z_vals)})
                if self.continuous_Z:
                    continuous_Z_ranges = [
                        self.support[variable]
                        for variable in self.continuous_Z
                    ]
                    args = z_discrete.join(x).values[0]
                    causal_effect += nquad(
                        self.expectation_integration_function,
                        continuous_Z_ranges,
                        args=args)[0]
                else:
                    z_discrete = z_discrete[self.admissable_set]
                    exog_predictors = x.join(z_discrete)[
                        self.conditional_density_vars]
                    causal_effect += self.conditional_expectation.fit(
                        data_predict=exog_predictors.values
                    )[0] * self.density.pdf(data_predict=z_discrete.values)
            return causal_effect
        elif self.continuous_Z:
            continuous_Z_ranges = [
                self.support[var] for var in self.continuous_Z
            ]
            causal_effect, error = nquad(self.expectation_integration_function,
                                         continuous_Z_ranges,
                                         args=tuple(x.values[0]))
            return causal_effect
        else:
            return self.conditional_expectation.fit(
                data_predict=x[self.causes])[0]
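A hypothetical usage sketch of the CausalEffect class above, with invented data and a single continuous confounder. It assumes the statsmodels imports the class relies on (KDEMultivariate, KDEMultivariateConditional, KernelReg) are in scope; note the numerical integration in pdf() is slow.

import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
n = 300
z = rng.normal(size=n)                          # confounder
x = z + rng.normal(scale=0.5, size=n)           # z influences x ...
y = 2 * x + z + rng.normal(scale=0.5, size=n)   # ... and y

df = pd.DataFrame({'x': x, 'y': y, 'z': z})
effect = CausalEffect(df, causes=['x'], effects=['y'],
                      admissable_set=['z'],
                      variable_types={'x': 'c', 'y': 'c', 'z': 'c'})
point = pd.DataFrame({'x': [0.0], 'y': [0.0]})
print(effect.pdf(point))   # density of y = 0 under do(x = 0)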
Example #32
def modeKDE(arr):
    kde = KDEMultivariate(arr, var_type="c", bw="cv_ml")
    # np.linspace requires an integer number of points
    grid = np.linspace(np.min(arr), np.max(arr),
                       int((np.max(arr) - np.min(arr)) * 25))
    pdf = kde.pdf(grid)
    return max(pdf), kde.bw
Example #33
from statsmodels.nonparametric.kernel_density import KDEMultivariate
import numpy as np

# Check what happens when the data contain only one category and we request
# the probability of an unseen category.
data = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]).reshape([11, 1])


def bw(data):
    # normal reference (Silverman-type) rule of thumb:
    # 1.06 * sigma * nobs ** (-1 / (4 + d))
    X = np.std(data, axis=0)
    nobs = data.shape[0]

    return 1.06 * X * nobs**(-1. / (4 + data.shape[1]))


kde_e = KDEMultivariate(data, bw=np.array([0.5]), var_type='u')

print(bw(data))
print(kde_e.bw)

print(kde_e.pdf([[2.]]))
# test_point = np.array([[0.]])
#
# print(kde_e.data)
# print(kde_e.pdf(test_point))
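For var_type='u', statsmodels uses the Aitchison-Aitken kernel. Below is a hand-rolled sketch of the textbook kernel form (an assumption about the library's internals, not its actual code): weight 1 - lam on a category match and lam / (c - 1) otherwise, with c the number of observed levels. With a single observed level, as in the data above, c - 1 is zero, which is exactly the degenerate case this snippet probes.

def aitchison_aitken_pdf(point, sample, lam):
    # c: number of distinct categories actually observed in the sample
    c = np.unique(sample).size
    if c < 2:
        raise ValueError("need at least two observed levels")
    weights = np.where(sample == point, 1.0 - lam, lam / (c - 1))
    return float(weights.mean())

print(aitchison_aitken_pdf(2, np.array([1, 1, 2, 2, 2]), lam=0.5))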
Example #34
def plot_density_panel(chains,
                       names=None,
                       settings=None,
                       return_kde=False,
                       hist_on=False,
                       return_settings=False):
    '''
    Plot marginal posterior densities

    Args:
        * **chains** (:class:`~numpy.ndarray`): Sampling chain \
        for each parameter

    Kwargs:
        * **names** (:py:class:`list`): List of strings - name \
        of each parameter. Default: `None`
        * **settings** (:py:class:`dict`): Settings for features \
        of this method. Default: `None`
        * **return_kde** (:py:class:`bool`): Flag to return \
          handles of functions from KDE. Default: `False`
        * **return_settings** (:py:class:`bool`): Flag to return \
        figure settings. Default: `False`
        * **hist_on** (:py:class:`bool`): Flag to include histogram \
          on plot with marginal distribution.

    Returns:
        * If `return_settings=True` and `return_kde=True`, \
          (:py:class:`tuple`): (figure handle, settings used, \
          kde handles)
        * If `return_settings=True` and `return_kde=False`, \
          (:py:class:`tuple`): (figure handle, settings used)
        * If `return_settings=False` and `return_kde=True`, \
          (:py:class:`tuple`): (figure handle, kde handles)
        * Otherwise, figure handle
    '''
    default_settings = {
        'maxpoints': 500,
        'fig': dict(figsize=(5, 4), dpi=100),
        'kde': dict(bw='normal_reference', var_type='c'),
        'plot': dict(color='k', marker=None, linestyle='-', linewidth=3),
        'xlabel': {},
        'ylabel': {},
        'hist': dict(density=True),
    }
    settings = check_settings(default_settings=default_settings,
                              user_settings=settings)
    nsimu, nparam = chains.shape  # number of rows, number of columns
    ns1, ns2 = generate_subplot_grid(nparam)
    names = generate_names(nparam, names)
    kdehandle = []
    f = plt.figure(**settings['fig'])  # initialize figure
    for ii in range(nparam):
        # define chain
        chain = chains[:, ii].reshape(nsimu, 1)  # check indexing
        # define x grid
        chain_grid = make_x_grid(chain)
        # Compute kernel density estimate
        kde = KDEMultivariate(chain, **settings['kde'])
        # plot density on subplot
        plt.subplot(ns1, ns2, ii + 1)
        if hist_on is True:  # include histograms
            hist(chain, **settings['hist'])
        plt.plot(chain_grid, kde.pdf(chain_grid), **settings['plot'])
        # format figure
        plt.xlabel(names[ii], **settings['xlabel'])
        plt.ylabel('$\\pi$({}$|M^{}$)'.format(names[ii], '{data}'),
                   **settings['ylabel'])
        plt.tight_layout(rect=[0, 0.03, 1, 0.95], h_pad=1.0)  # adjust spacing
        kdehandle.append(kde)
    # setup output
    if return_kde is True and return_settings is True:
        return f, settings, kdehandle
    elif return_kde is True and return_settings is False:
        return f, kdehandle
    elif return_kde is False and return_settings is True:
        return f, settings
    else:
        return f
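A hypothetical call to plot_density_panel, assuming the helper functions it uses (check_settings, generate_subplot_grid, generate_names, make_x_grid, hist) are available from the source project:

import numpy as np

chains = np.random.randn(1000, 3)   # fake sampling chains for 3 parameters
f = plot_density_panel(chains, names=['a', 'b', 'c'], hist_on=True)
f.savefig('density_panel.png')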
Example #35
# NOTE: this snippet is truncated in the source; it opens mid-way through a
# helper (called below as lognormal(0.2, 1.0)) that evaluates a log-normal
# PDF on a grid and returns (x, y). The usual imports (numpy as np,
# matplotlib.pyplot as plt, KDEMultivariate, statsmodels' normal_ad and
# scipy's ks_2samp) are assumed.
    y = (np.exp(-(np.log(x) - x0)**2 / (2 * sig**2)) / (x * sig * np.sqrt(2 * np.pi)))
    return x, y

data1 = np.loadtxt('Data1.txt')
data2 = np.loadtxt('Data2.txt')

#Calculate bandwidth with Cross Validation Least Squares
dens1 = KDEMultivariate(data=[data1], var_type='c', bw='cv_ls')
dens2 = KDEMultivariate(data=[data2], var_type='c', bw='cv_ls')
#Calculate bandwidth with Silverman's rule of thumb
bw1 = np.std(data1)*(4./(3.*len(data1)))**(1./5.)
bw2 = np.std(data2)*(4./(3.*len(data2)))**(1./5.)

#Analyzing Data 1: KDE, Parent distribution, std and mean
x_grid1 = np.linspace(0,70,1000)
pdf1 = dens1.pdf(x_grid1)
mean1, std1 = np.mean(data1),np.std(data1)
x1, y1 = gauss(std1,mean1)
p1 = normal_ad(data1)[1]
mean_kde1, std_kde1 = np.mean(pdf1),np.std(pdf1)
#Analyzing Data 2: KDE, Parent distribution, std and mean
x_grid2 = np.linspace(0,70,1000)
pdf2 = dens2.pdf(x_grid2)
mean2, std2 = np.mean(data2),np.std(data2)
x2,y2 = lognormal(0.2,1.0)
p2 = ks_2samp(y2,data2)[1]
mean_kde2, std_kde2 = np.mean(pdf2),np.std(pdf2)


#Plot the histograms, the parent distributions and the KDEs
plt.ion()
Example #36
# imports needed to run this snippet (assumed from the source file)
import random

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from statsmodels.nonparametric.kernel_density import KDEMultivariate


def multivariate_statsmodels():
    f = lambda x, y: -(x / 50)**2 + y**2 + 1
    uniform_x = np.linspace(-50, 50, 1000)
    uniform_y = np.linspace(-1, 1, 1000)
    uniform_x_y_pairs = [(x, y) for y in uniform_y for x in uniform_x]
    uniform_z = [f(x, y) for x, y in uniform_x_y_pairs]
    distribution = np.array(
        random.choices(uniform_x_y_pairs, weights=uniform_z, k=1000))

    # uniform 3D plot
    fig = plt.figure()
    ax = Axes3D(fig)
    X, Y = np.meshgrid(uniform_x, uniform_y)
    Z = np.array(uniform_z).reshape(len(uniform_x), len(uniform_y))
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_zlabel('z')
    ax.plot_surface(X, Y, Z)

    # Histograms suck!
    plt.figure()
    zipped_dist = list(zip(*distribution))
    H, xedges, yedges = np.histogram2d(zipped_dist[0],
                                       zipped_dist[1],
                                       bins=(np.linspace(-50, 50, 100),
                                             np.linspace(-1, 1, 100)))
    plt.imshow(np.transpose(H),
               origin='lower',
               aspect='auto',
               interpolation='catrom',
               extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]],
               cmap='magma')
    plt.colorbar()

    # kernel density estimate
    resampled_x = np.linspace(-50, 50, 100)
    resampled_y = np.linspace(-1, 1, 100)
    resampled_points = np.array([(x, y) for y in resampled_y
                                 for x in resampled_x])

    # log_density = KernelDensity(kernel='epanechnikov', bandwidth=0.1).fit(distribution).score_samples(resampled_points)
    # density = np.exp(log_density)
    # Z = np.array(density).reshape(len(resampled_x), len(resampled_y))

    # kde = KDEMultivariate(distribution, bw='cv_ls', var_type='uu')
    # print('cv_ls')
    # print(kde.bw)
    # print(kde.loo_likelihood(kde.bw))
    # Z = kde.pdf(resampled_points).reshape(len(resampled_x), len(resampled_y))
    # plt.figure()
    # plt.imshow(Z, origin='low', aspect='auto',
    # 	interpolation='catrom',
    # 	extent=[resampled_x[0], resampled_x[-1], resampled_y[0], resampled_y[-1]],
    # 	cmap='magma')
    # plt.title('cv_ls')
    # plt.colorbar()
    # kde = KDEMultivariate(distribution, bw='cv_ml', var_type='uu')
    # print('cv_ml')
    # print(kde.bw)
    # print(kde.loo_likelihood(kde.bw))
    # Z = kde.pdf(resampled_points).reshape(len(resampled_x), len(resampled_y))
    # plt.figure()
    # plt.imshow(Z, origin='low', aspect='auto',
    # 	interpolation='catrom',
    # 	extent=[resampled_x[0], resampled_x[-1], resampled_y[0], resampled_y[-1]],
    # 	cmap='magma')
    # plt.title('cv_ml')
    # plt.colorbar()
    kde = KDEMultivariate(distribution, bw=[5, 0.1], var_type='uu')
    print([5, 0.1])
    print(kde.bw)
    print(kde.loo_likelihood(kde.bw))
    Z = kde.pdf(resampled_points).reshape(len(resampled_x), len(resampled_y))
    plt.figure()
    plt.imshow(Z,
               origin='lower',
               aspect='auto',
               interpolation='catrom',
               extent=[
                   resampled_x[0], resampled_x[-1], resampled_y[0],
                   resampled_y[-1]
               ],
               cmap='magma')
    plt.title(str([5, 0.1]))
    plt.colorbar()

    search_min_x, search_max_x, search_num_each_axis = 0.001, 50, 7
    search_min_y, search_max_y = 0.001, 1
    assert search_num_each_axis >= 4
    while abs(search_min_x - search_max_x) > 0.01 or abs(search_min_y -
                                                         search_max_y) > 0.01:
        search_space_x = np.linspace(search_min_x, search_max_x,
                                     search_num_each_axis)
        search_space_y = np.linspace(search_min_y, search_max_y,
                                     search_num_each_axis)
        search_space = np.array([(x, y) for y in search_space_y
                                 for x in search_space_x])
        # print_3d_array_as_columns is a helper defined elsewhere in the
        # source file (not shown here)
        print_3d_array_as_columns(
            search_space.reshape(len(search_space_x), len(search_space_y), 2))
        print('searching x', search_space_x)
        print('searching y', search_space_y)
        # evaluate search space
        likelihoods = []
        for i, bandwidth_estimate in enumerate(search_space):
            print('processing', i + 1, 'of', len(search_space), end='\r')
            kde = KDEMultivariate(distribution,
                                  bw=bandwidth_estimate,
                                  var_type='uu')
            likelihood = kde.loo_likelihood(kde.bw)
            likelihoods.append(likelihood)

        # pair each bandwidth candidate with its likelihood: shape (n, 3)
        print(np.column_stack([search_space, likelihoods]).reshape(
            len(search_space_x), len(search_space_y), 3))
        maximum_likelihood = max(likelihoods)
        i = likelihoods.index(maximum_likelihood)
        bandwidth = search_space[i]
        # define next search space
        i_x = int(np.where(search_space_x == bandwidth[0])[0])
        if i_x == 0:
            search_min_x, search_max_x = search_space_x[:2]
        elif i_x == len(search_space_x) - 1:
            search_min_x, search_max_x = search_space_x[-2:]
        else:
            search_min_x, _, search_max_x = search_space_x[i_x - 1:i_x + 2]
        i_y = int(np.where(search_space_y == bandwidth[1])[0])
        if i_y == 0:
            search_min_y, search_max_y = search_space_y[:2]
        elif i_y == len(search_space_y) - 1:
            search_min_y, search_max_y = search_space_y[-2:]
        else:
            search_min_y, _, search_max_y = search_space_y[i_y - 1:i_y + 2]
        print('recursing on', bandwidth)
    print('settled on', bandwidth)
    kde = KDEMultivariate(distribution, bw=bandwidth, var_type='uu')
    likelihood = kde.loo_likelihood(kde.bw)
    print('likelihood', likelihood)
    Z = kde.pdf(resampled_points).reshape(len(resampled_x), len(resampled_y))

    # class fake_scikit_learn_kde():
    # 	def __init__(self, var_type, bw):
    # 		self.var_type = var_type
    # 		self.bw = bw
    # 	def fit(distribution):
    # 		return KDEMultivariate(distribution, var_type=self.var_type)
    # bandwidth_estimator = GridSearchCV(KDEMultivariate(distribution, var_type='uu'), {'bw': [[x, 0.1] for x in np.linspace(0.01, 10, 5)]})
    # bandwidth_estimator.fit(distribution)
    # print(bandwidth_estimator.best_params_)
    # Z = bandwidth_estimator.best_estimator_.pdf(resampled_points)

    plt.figure()
    plt.imshow(Z,
               origin='lower',
               aspect='auto',
               interpolation='catrom',
               extent=[
                   resampled_x[0], resampled_x[-1], resampled_y[0],
                   resampled_y[-1]
               ],
               cmap='magma')
    plt.colorbar()
    plt.show()
Example #37
class KDEOrigin(object):
    """
    Initialise the class for generating the genesis probability distribution.
    Initialisation will load the required data (genesis locations) and
    calculate the optimum bandwidth for the kernel density method.

    :param str configFile: Path to the configuration file.
    :param dict gridLimit: The bounds of the model domain. The
                           :class:`dict` should contain the keys
                           :attr:`xMin`, :attr:`xMax`, :attr:`yMin`
                           and :attr:`yMax`. The *x* variable bounds
                           the longitude and the *y* variable
                           bounds the latitude.
    :param float kdeStep: Increment of the ordinate values at which
                          the distributions will be calculated.
                          Default=`0.1`
    :param lonLat: If given, a 2-d array of the longitude and latitude
                   of genesis locations. If not given, attempt to load
                   an ``init_lon_lat`` file from the processed files.
    :param progressbar: A :meth:`SimpleProgressBar` object to print
                        progress to STDOUT.
    :type  lonLat: :class:`numpy.ndarray`
    :type  progressbar: :class:`Utilities.progressbar` object.


    """
    def __init__(self,
                 configFile,
                 gridLimit,
                 kdeStep,
                 lonLat=None,
                 progressbar=None):
        """

        """
        self.progressbar = progressbar
        LOGGER.info("Initialising KDEOrigin")
        self.x = np.arange(gridLimit['xMin'], gridLimit['xMax'], kdeStep)
        self.y = np.arange(gridLimit['yMax'], gridLimit['yMin'], -kdeStep)

        self.kdeStep = kdeStep
        self.kde = None
        self.pdf = None
        self.cz = None

        self.configFile = configFile
        self.config = ConfigParser()
        self.config.read(configFile)

        if lonLat is None:
            # Load the data from file:
            self.outputPath = self.config.get('Output', 'Path')
            self.processPath = pjoin(self.outputPath, 'process')
            LOGGER.debug("Loading " + pjoin(self.processPath, 'init_lon_lat'))
            ll = flLoadFile(pjoin(self.processPath, 'init_lon_lat'), '%', ',')
            self.lonLat = ll[:, 0:2]
        else:
            self.lonLat = lonLat[:, 0:2]

        ii = np.where((self.lonLat[:, 0] >= gridLimit['xMin'])
                      & (self.lonLat[:, 0] <= gridLimit['xMax'])
                      & (self.lonLat[:, 1] >= gridLimit['yMin'])
                      & (self.lonLat[:, 1] <= gridLimit['yMax']))

        self.lonLat = self.lonLat[ii]

        self.bw = getOriginBandwidth(self.lonLat)
        LOGGER.info("Bandwidth: %s", repr(self.bw))

    def generateKDE(self, save=False, plot=False):
        """
        Generate the PDF for cyclone origins using a kernel density
        estimation technique, then optionally save it to a netCDF
        file. The bandwidth stored in the :attr:`bw` attribute is used.

        :param boolean save: If ``True``, save the resulting PDF to a
                             netCDF file called 'originPDF.nc'.
        :param boolean plot: If ``True``, plot the resulting PDF.

        :returns: ``x`` and ``y`` grid and the PDF values.

        """

        self.kde = KDEMultivariate(self.lonLat, bw=self.bw, var_type='cc')
        xx, yy = np.meshgrid(self.x, self.y)
        xy = np.vstack([xx.ravel(), yy.ravel()])
        pdf = self.kde.pdf(data_predict=xy)
        pdf = pdf.reshape(xx.shape)

        self.pdf = pdf.transpose()

        if save:
            dimensions = {
                0: {
                    'name': 'lat',
                    'values': self.y,
                    'dtype': 'f',
                    'atts': {
                        'long_name': 'Latitude',
                        'units': 'degrees_north'
                    }
                },
                1: {
                    'name': 'lon',
                    'values': self.x,
                    'dtype': 'f',
                    'atts': {
                        'long_name': 'Longitude',
                        'units': 'degrees_east'
                    }
                }
            }

            variables = {
                0: {
                    'name': 'gpdf',
                    'dims': ('lat', 'lon'),
                    'values': np.array(pdf),
                    'dtype': 'f',
                    'atts': {
                        'long_name': 'TC Genesis probability distribution',
                        'units': ''
                    }
                }
            }

            ncSaveGrid(pjoin(self.processPath, 'originPDF.nc'), dimensions,
                       variables)

        if plot:
            from PlotInterface.maps import FilledContourMapFigure, \
                saveFigure, levels

            lvls, exponent = levels(pdf.max())

            [gx, gy] = np.meshgrid(self.x, self.y)

            map_kwargs = dict(llcrnrlon=self.x.min(),
                              llcrnrlat=self.y.min(),
                              urcrnrlon=self.x.max(),
                              urcrnrlat=self.y.max(),
                              projection='merc',
                              resolution='i')

            cbarlabel = r'Genesis probability ($\times 10^{' + \
                        str(exponent) + '}$)'
            figure = FilledContourMapFigure()
            figure.add(pdf * (10**-exponent), gx, gy, 'TC Genesis probability',
                       lvls * (10**-exponent), cbarlabel, map_kwargs)
            figure.plot()

            outputFile = pjoin(self.outputPath, 'plots', 'stats',
                               'originPDF.png')
            saveFigure(figure, outputFile)

        return self.x, self.y, self.pdf

    def generateCdf(self, save=False):
        """
        Generate the CDF corresponding to the PDF of cyclone origins,
        and optionally save it to a netCDF file.

        :param boolean save: If ``True``, save the CDF to a netCDF file
                             called 'originCDF.nc'. If ``False``, return
                             the CDF.

        """
        xx, yy = np.meshgrid(self.x, self.y)
        xy = np.vstack([xx.ravel(), yy.ravel()])
        self.cz = self.kde.cdf(data_predict=xy)

        if save:
            outputFile = pjoin(self.processPath, 'originCDF.nc')
            dimensions = {
                0: {
                    'name': 'lat',
                    'values': self.y,
                    'dtype': 'f',
                    'atts': {
                        'long_name': 'Latitude',
                        'units': 'degrees_north'
                    }
                },
                1: {
                    'name': 'lon',
                    'values': self.x,
                    'dtype': 'f',
                    'atts': {
                        'long_name': 'Longitude',
                        'units': 'degrees_east'
                    }
                }
            }

            variables = {
                0: {
                    'name': 'gcdf',
                    'dims': ('lat', 'lon'),
                    'values': np.array(self.cz),
                    'dtype': 'f',
                    'atts': {
                        'long_name': ('TC Genesis cumulative '
                                      'distribution'),
                        'units': ''
                    }
                }
            }

            ncSaveGrid(outputFile, dimensions, variables)
        else:
            return self.cz

    def updateProgressBar(self, step, stepMax):
        """
        Callback function to update the progress bar from C code.

        :param int step: Current step.
        :param int stepMax: Maximum step.

        """
        if self.progressbar:
            self.progressbar.update(step / float(stepMax), 0.0, 0.7)
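A minimal sketch of the core of generateKDE above, using synthetic genesis points and letting statsmodels pick the bandwidth in place of getOriginBandwidth (an assumption made for illustration):

import numpy as np
from statsmodels.nonparametric.kernel_density import KDEMultivariate

lonLat = np.column_stack([np.random.uniform(100, 160, 300),   # longitude
                          np.random.uniform(-30, -5, 300)])   # latitude
kde = KDEMultivariate(lonLat, var_type='cc', bw='normal_reference')

x = np.arange(100, 160, 0.5)
y = np.arange(-5, -30, -0.5)
xx, yy = np.meshgrid(x, y)
pdf = kde.pdf(data_predict=np.vstack([xx.ravel(), yy.ravel()]))
pdf = pdf.reshape(xx.shape)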
Example #38
def kde_m(x, x_grid, bandwidth):
    # kde = KDEMultivariate(x, bw=bandwidth * np.ones_like(x), var_type='c')
    # user-specified bandwidths must be array-like, one entry per variable
    kde = KDEMultivariate(x, bw=[bandwidth], var_type='c')
    return kde.pdf(x_grid)
Example #39
def kde_statsmodels_m(x, x_grid, bandwidth=0.2, **kwargs):
    """Multivariate Kernel Density Estimation using Statsmodels"""
    # NOTE: bw expects one entry per variable; np.ones_like(x) yields one per
    # observation, of which only the first is used for 1-D data.
    kde = KDEMultivariate(x, bw=bandwidth * np.ones_like(x),
                          var_type='c', **kwargs)
    return kde.pdf(x_grid)
Example #40
File: kde.py  Project: WMGoBuffs/biokit
 def kde_statsmodels_m(self, x_grid, bandwidth=0.2, **kwargs):
     """Multivariate Kernel Density Estimation with
     Statsmodels"""
     from statsmodels.nonparametric.kernel_density import KDEMultivariate
     # The original passed `np.ones_like(x)` with `x` undefined; var_type='c'
     # declares a single variable, so one bandwidth entry suffices.
     kde = KDEMultivariate(self.data, bw=[bandwidth], var_type='c', **kwargs)
     return kde.pdf(x_grid)
Example #41
def wind_speed_pressure(year=2013,peak=False):
    from statsmodels.nonparametric.kernel_density import KDEMultivariate as KDE
    import robust as rb

    min2 = 0
    sigfac = 3
    sigsamp = 5

    d = get_data(year=year)
    if peak:
        wind = d['windhi']
        tag = 'peak'
        word = 'Peak '
    else:
        wind = d["wind"]
        tag = 'ave'
        word = 'Average '

    wind_rand = wind + np.random.normal(0,0.5,len(wind))
    press = d["pressure"]
    
    dist1 = press
    dist2 = wind_rand
    
    med1 = np.median(dist1)
    sig1 = rb.std(dist1)
    datamin1 = np.min(dist1)
    datamax1 = np.max(dist1)
    min1 = np.min(dist1)
    max1 = np.max(dist1)


    med2 = np.median(dist2)
    sig2 = rb.std(dist2)
    datamin2 = np.min(dist2)
    datamax2 = np.max(dist2)
    max2 = min(med2 + sigfac*sig2,datamax2)
    
    X, Y = np.mgrid[min1:max1:100j, min2:max2:100j]
    positions = np.vstack([X.ravel(), Y.ravel()])
    values = np.vstack([dist1, dist2])
    
    kernel = KDE(values,var_type='cc',bw=[sig1/sigsamp,sig2/sigsamp])
    Z = np.reshape(kernel.pdf(positions).T, X.shape)
    
    aspect = (max1-min1)/(max2-min2) * 8.5/11.0

    plot_params()
    plt.ion()
    plt.figure(5,figsize=(11,8.5))
    plt.clf()
    ax = plt.subplot(111)
    ax.imshow(np.rot90(Z), cmap=plt.cm.CMRmap_r,aspect=aspect, \
              extent=[min1, max1, min2, max2],origin='upper')
    ax.yaxis.labelpad = 12
    ax.set_xlabel('Atmospheric Pressure (in-Hg)',fontsize=fs)
    ax.set_ylabel(word+'Wind Speed (mph)',fontsize=fs)
    plt.title('Wind Speed and Pressure at Thacher Observatory in '+str(year),fontsize=fs)
    
    plt.savefig('Wind'+tag+'_Pressure_'+str(year)+'.png',dpi=300)
    mpl.rcdefaults()

    return
Example #42
def mode(self):
    # NOTE: `data` is assumed to be defined in the enclosing scope of the
    # source project; np.linspace requires an integer number of points.
    kde = KDEMultivariate(data, var_type="c", bw="cv_ls")
    grid = np.linspace(np.min(data), np.max(data),
                       int((np.max(data) - np.min(data)) * 20))
    pdf = kde.pdf(grid)
    return max(pdf), kde.bw
Example #43
def kde_statsmodels_func(x: np.ndarray):
    """Multivariate Kernel Density Estimation with Statsmodels; returns a function"""
    kde = KDEMultivariate(x,
                          bw='cv_ml',
                          var_type='u')
    return lambda u: kde.pdf(u)
Example #44
def kde_statsmodels_m(x, x_grid, bandwidth=0.2, **kwargs):
    """Multivariate Kernel Density Estimation with Statsmodels, use with heterogeneous data"""
    # See Example #39: bw expects one entry per variable, not per observation.
    kde = KDEMultivariate(x, bw=bandwidth * np.ones_like(x),
                          var_type='c', **kwargs)
    return kde.pdf(x_grid)
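A hypothetical usage of kde_statsmodels_m above on synthetic 1-D data (assumes numpy and KDEMultivariate are imported as in the earlier examples):

x = np.random.normal(size=500)
x_grid = np.linspace(-4, 4, 200)
pdf = kde_statsmodels_m(x, x_grid, bandwidth=0.25)
print(pdf.max())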