def EstimateDensity(self,name,df,histogram,f,s,ax):
     # if the desired output is in Histogram format
     if(histogram):
         finRes = []
         lab = []
         for i in range(5):
             res = np.array(df[ df[f] == i][s])
             if(res.shape[0]>0):
                 finRes.append(res)
                 lab.append(name[0]+ ' = ' + str(i))
         pl.hist(finRes, bins=2, density=True, histtype='bar', label=lab)
         
     # if the desired output is simple plot
     else:
         for i in range(5):
             res = np.array(df[ df[f] == i][s])
             if(res.shape[0]>0):
                 res = res.reshape(res.shape[0],1)
                 X_plot = np.array(np.linspace(-1, 5,20)).reshape(20,1)
                 kde= KernelDensity(kernel='exponential', bandwidth=0.05)
                 kde.fit(res)
                 log_dens = kde.score_samples(X_plot)
                 ax.plot(X_plot,np.exp(log_dens),label=name[0]+ ' = ' + str(i))        
     ax.legend()
     ax.set_title(name[1] + " distribution for changing " + name[0])
Example #2
def surface_density(c, bandwidth=0.2, grid_step=0.02):
    """
    Given particle positions as a coordinate object, compute the
    surface density using a kernel density estimate.
    """

    if not HAS_SKLEARN:
        raise ImportError("scikit-learn is required to use this function.")

    xgrid = np.arange(2., 9.+0.1, grid_step) # deg
    ygrid = np.arange(26.5, 33.5+0.1, grid_step) # deg
    shp = (xgrid.size, ygrid.size)
    meshies = np.meshgrid(xgrid, ygrid)
    grid = np.vstack(map(np.ravel, meshies)).T

    x = c.l.degree
    y = c.b.degree
    skypos = np.vstack((x,y)).T

    kde = KernelDensity(bandwidth=bandwidth, kernel='epanechnikov')
    kde.fit(skypos)

    dens = np.exp(kde.score_samples(grid)).reshape(meshies[0].shape)
    log_dens = np.log10(dens)

    return grid, log_dens
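
An illustrative call of surface_density, assuming astropy is available and that the particle positions fall inside the hard-coded l/b grid above (the positions below are synthetic):

import numpy as np
import astropy.units as u
from astropy.coordinates import SkyCoord

# Synthetic particle positions inside the grid defined in surface_density
l = np.random.uniform(2.0, 9.0, size=1000) * u.deg
b = np.random.uniform(26.5, 33.5, size=1000) * u.deg
c = SkyCoord(l=l, b=b, frame='galactic')

grid, log_dens = surface_density(c, bandwidth=0.2, grid_step=0.05)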
Example #3
def cistrans(args):
    cob = co.COB(args.cob) 
    if args.out is None:
        args.out = '{}_cistrans'.format(cob.name)
    # np.newaxis adds an empty axis in that position of the slice
    # the sklearn module requires the values to be in the rows:
    # http://scikit-learn.org/stable/auto_examples/neighbors/plot_kde_1d.html
    cis = cob.coex \
            .score[cob.coex.distance <= args.cis_distance]\
            .values[:,np.newaxis]
    trans = cob.coex\
            .score[np.isinf(cob.coex.distance)]\
            .values[:,np.newaxis]
    X_plot = np.linspace(-10,10,1000)[:,np.newaxis]
    print(
            'Found {:,} cis interactions and {:,} trans interactions'.format(
        cis.shape[0],
        trans.shape[0]
    ))
    # Fit the kernel
    kd=KernelDensity(bandwidth=0.2)
    kd.fit(cis)
    cis_kde = np.exp(kd.score_samples(X_plot))
    plt.fill(X_plot,cis_kde,alpha=0.5,label='Cis Interactions')
    # Fit the trans 
    kd.fit(trans[0:50000])
    trans_kde = np.exp(kd.score_samples(X_plot))
    plt.fill(X_plot,trans_kde,alpha=0.5,label='Trans Interactions')
    plt.legend()
    plt.title('Cis vs Trans Density: {}'.format(cob.name))
    # Calculate the mann whitney U test
    u,pval = sp.stats.mannwhitneyu(cis[:,0],trans[:,0]) 
    print('P-val: {}'.format(pval))
    plt.savefig(args.out+'.png')
Example #4
def plot_sklearn_kde(df, support, column='AirTime', bins=50):
    """
    Plots a KDE and a histogram using sklearn.KernelDensity.
    Uses Gaussian kernels.
    The optimal bandwidth is calculated according to Silverman's rule of thumb.

    Parameters
    ----------
    df: A pandas.DataFrame
    support: A 1-d numpy array.
             Input data points for the probability density function.

    Returns
    -------
    A matplotlib.axes.Axes instance.
    """

    bw = get_silverman_bandwidth(df, column)

    kde = KernelDensity(kernel='gaussian', bandwidth=bw)

    x = df[column]

    kde.fit(x[:, np.newaxis])
    y = kde.score_samples(support[:, np.newaxis])

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.hist(np.ravel(x), bins=bins, alpha=0.5, color=sns.xkcd_rgb["denim blue"], density=True)
    ax.plot(support, np.exp(y))
    ax.set_xlabel(column, fontsize=14)
    ax.set_ylabel('Density', fontsize=14)
    ax.set_title('Kernel Density Plot', fontsize=14)
    sns.despine(ax=ax, offset=5, trim=True)

    return ax
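
plot_sklearn_kde relies on a get_silverman_bandwidth helper that is not shown in this example. A minimal sketch of such a helper, assuming a 1-d numeric column and the usual Silverman formula 0.9 * min(sigma, IQR/1.34) * n**(-1/5), might look like this:

import numpy as np

def get_silverman_bandwidth(df, column='AirTime'):
    # Hypothetical helper (not part of the original snippet):
    # 1-d Silverman rule of thumb, bw = 0.9 * min(sigma, IQR / 1.34) * n ** (-1/5)
    x = df[column].dropna().values
    n = x.size
    sigma = np.std(x, ddof=1)
    iqr = np.subtract(*np.percentile(x, [75, 25]))
    return 0.9 * min(sigma, iqr / 1.34) * n ** (-0.2)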
Example #5
def sklearn_kde(data, points):
    from sklearn.neighbors import KernelDensity

    # Silverman bandwidth estimator
    n, d = data.shape
    bandwidth = (n * (d + 2) / 4.)**(-1. / (d + 4))

    # standardize data so that we can use uniform bandwidth
    mu, sigma = mean(data, axis=0), std(data, axis=0)
    data, points = (data - mu)/sigma, (points - mu)/sigma

    #print("starting grid search for bandwidth over %d points"%n)
    #from sklearn.grid_search import GridSearchCV
    #from numpy import logspace
    #params = {'bandwidth': logspace(-1, 1, 20)}
    #fitter = GridSearchCV(KernelDensity(), params)
    #fitter.fit(data)
    #kde = fitter.best_estimator_
    #print("best bandwidth: {0}".format(kde.bandwidth))
    #import time; T0 = time.time()
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth,
                        rtol=1e-6, atol=1e-6)
    #print("T:%6.3f   fitting"%(time.time()-T0))
    kde.fit(data)
    #print("T:%6.3f   estimating"%(time.time()-T0))
    log_pdf = kde.score_samples(points)
    #print("T:%6.3f   done"%(time.time()-T0))
    return exp(log_pdf)
Example #6
def kde_sklearn(x, x_grid, bandwidth=0.2, **kwargs):
    """Kernel Density Estimation with Scikit-learn"""
    kde_skl = KernelDensity(bandwidth=bandwidth, **kwargs)
    kde_skl.fit(x[:, np.newaxis])
    # score_samples() returns the log-likelihood of the samples
    log_pdf = kde_skl.score_samples(x_grid[:, np.newaxis])
    return np.exp(log_pdf)
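
A short usage sketch for kde_sklearn above; the sample and grid are made up, and the resulting density should integrate to roughly one:

import numpy as np

x = np.random.normal(size=500)              # illustrative 1-d sample
x_grid = np.linspace(-4, 4, 200)
pdf = kde_sklearn(x, x_grid, bandwidth=0.3, kernel='gaussian')
print(pdf.sum() * (x_grid[1] - x_grid[0]))  # crude area check, should be close to 1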
Example #7
def kdescatter(xs, ys, log_color=False, atol=1e-4, rtol=1e-4,
               n_jobs=1, n_samp_scaling=100, n_samp_tuning=1000, ax=None,
               **kwargs):
    if ax is None:
        import matplotlib.pyplot as plt
        ax = plt

    kwargs.setdefault('linewidths', 0)
    kwargs.setdefault('s', 20)
    kwargs.setdefault('cmap', 'winter')

    X = np.asarray([xs, ys]).T
    n = X.shape[0]
    samp_X = X[np.random.choice(n, min(n_samp_scaling, n), replace=False)]
    median_sqdist = np.median(euclidean_distances(samp_X, squared=True))
    bws = np.logspace(-2, 2, num=10) * np.sqrt(median_sqdist)
    est = GridSearchCV(KernelDensity(), {'bandwidth': bws}, n_jobs=n_jobs)
    est.fit(X[np.random.choice(n, min(n_samp_tuning, n), replace=False)])
    bw = est.best_params_['bandwidth']

    kde = KernelDensity(bandwidth=bw)
    kde.fit(X)
    densities = kde.score_samples(X)
    if not log_color:
        np.exp(densities, out=densities)
    ax.scatter(xs, ys, c=densities, **kwargs)
def kde_opt4(df_cell_train_feats, y_train, df_cell_test_feats):
    def prepare_feats(df):
        df_new = pd.DataFrame()
        df_new["hour"] = df["hour"]
        df_new["weekday"] = df["weekday"] + df["hour"] / 24.
        df_new["accuracy"] = df["accuracy"].apply(lambda x: np.log10(x))
        df_new["x"] = df["x"]
        df_new["y"] = df["y"]
        return df_new
    logging.info("train kde_opt4 model")
    df_cell_train_feats_kde = prepare_feats(df_cell_train_feats)
    df_cell_test_feats_kde = prepare_feats(df_cell_test_feats)
    n_class = len(np.unique(y_train))
    y_test_pred = np.zeros((len(df_cell_test_feats_kde), n_class), "d")
    for i in range(n_class):
        X = df_cell_train_feats_kde[y_train == i]
        y_test_pred_i = np.ones(len(df_cell_test_feats_kde), "d")
        for feat in df_cell_train_feats_kde.columns.values:
            X_feat = X[feat].values
            BGK10_output = kdeBGK10(X_feat)
            if BGK10_output is None:
                kde = gaussian_kde(X_feat, "scott")
                kde = gaussian_kde(X_feat, kde.factor * 0.741379)
                y_test_pred_i *= kde.evaluate(df_cell_test_feats_kde[feat].values)
            else:
                bandwidth, mesh, density = BGK10_output
                kde = KernelDensity(kernel='gaussian', metric='manhattan', bandwidth=bandwidth)
                kde.fit(X_feat[:, np.newaxis])
                y_test_pred_i *= np.exp(kde.score_samples(df_cell_test_feats_kde[feat].values[:, np.newaxis]))
        y_test_pred[:, i] += y_test_pred_i
    return y_test_pred
def find_kernel(data, numgrid = 1000, bw = 0.002):
	Xtrain = data[:,0:2]
	ytrain = data[2]
	# Set up the data grid for the contour plot
	xgrid = np.linspace(-74.1, -73.65, numgrid)
	ygrid = np.linspace(40.5, 40.8, numgrid)
	X, Y = np.meshgrid(xgrid, ygrid)

	xy = np.vstack([Y.ravel(), X.ravel()]).T

	# Plot map of with distributions of each species
	fig = plt.figure()
    # construct a kernel density estimate of the distribution
	kde = KernelDensity(bandwidth=bw,
                    kernel='gaussian')
	kde.fit(Xtrain, y = ytrain)

 # evaluate only on the land: -9999 indicates ocean
	Z = np.exp(kde.score_samples(xy))
	Z = Z.reshape(X.shape)

    # plot contours of the density
	levels = np.linspace(0, Z.max(), 25)
	plt.contourf(X, Y, Z, levels=levels, cmap=plt.cm.Reds)
	plt.title('BK CRIME')
	plt.show()
	return Z
Example #10
def sklearn_density(sample_points, evaluation_points):
    """
    Estimate the probability density function from which a set of sample
    points was drawn and return the estimated density at the evaluation points.
    """
    from sklearn.neighbors import KernelDensity

    # Silverman bandwidth estimator
    n, d = sample_points.shape
    bandwidth = (n * (d + 2) / 4.)**(-1. / (d + 4))

    # Standardize data so that we can use uniform bandwidth.
    # Note that we will need to scale the resulting density by sigma to
    # correct the area.
    mu, sigma = mean(sample_points, axis=0), std(sample_points, axis=0)
    data, points = (sample_points - mu)/sigma, (evaluation_points - mu)/sigma

    #print("starting grid search for bandwidth over %d points"%n)
    #from sklearn.grid_search import GridSearchCV
    #from numpy import logspace
    #params = {'bandwidth': logspace(-1, 1, 20)}
    #fitter = GridSearchCV(KernelDensity(), params)
    #fitter.fit(data)
    #kde = fitter.best_estimator_
    #print("best bandwidth: {0}".format(kde.bandwidth))
    #import time; T0 = time.time()
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth,
                        rtol=1e-6, atol=1e-6)
    #print("T:%6.3f   fitting"%(time.time()-T0))
    kde.fit(data)
    #print("T:%6.3f   estimating"%(time.time()-T0))
    log_pdf = kde.score_samples(points)
    #print("T:%6.3f   done"%(time.time()-T0))
    return exp(log_pdf)/np.prod(sigma)  # undo the x scaling on the data points
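
An illustrative call of sklearn_density; the sample is synthetic, and the check confirms that the sigma correction in the return statement keeps the area under the density close to one:

import numpy as np

sample = np.random.normal(loc=2.0, scale=0.5, size=(1000, 1))   # synthetic 1-d sample
grid = np.linspace(0.0, 4.0, 400)[:, np.newaxis]
density = sklearn_density(sample, grid)
print(density.sum() * (grid[1, 0] - grid[0, 0]))  # crude area check, should be close to 1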
Example #11
def kde_sklearn(data, grid, **kwargs):
    """
    Kernel Density Estimation with Scikit-learn

    Parameters
    ----------
    data : numpy.array
        Data points used to compute a density estimator. It
        has `n x p` dimensions, representing n points and p
        variables.
    grid : numpy.array
        Data points at which the density will be estimated. It
        has `m x p` dimensions, representing m points and p
        variables.

    Returns
    -------
    out : numpy.array
        Density estimate. Has `m x 1` dimensions
    """
    kde_skl = KernelDensity(**kwargs)
    kde_skl.fit(data)
    # score_samples() returns the log-likelihood of the samples
    log_pdf = kde_skl.score_samples(grid)
    return np.exp(log_pdf)
Example #12
def set_plx_kde(t, bandwidth=0.3, method='sklearn_kde'):
    """ Set the plx_kde

    Parameters
    ----------
    t : ndarray float
        Catalog of parallax measures (units: mas)
    bandwidth : float
        Bandwidth for gaussian_kde (optional, 0.01 recommended)
    method : string
        Method for density determination (options: scipy_kde, sklearn_kde, blocks)
    """

    global plx_kde

    if method == 'scipy_kde':

        if plx_kde is None:
            # We are only going to allow parallaxes above some minimum value
            if bandwidth is None:
                plx_kde = gaussian_kde(t['plx'][t['plx']>0.0])
            else:
                plx_kde = gaussian_kde(t['plx'][t['plx']>0.0], bw_method=bandwidth)

    elif method == 'sklearn_kde':
        if plx_kde is None:
            kwargs = {'kernel':'tophat'}
            if bandwidth is None:
                plx_kde = KernelDensity(**kwargs)
            else:
                plx_kde = KernelDensity(bandwidth=bandwidth, **kwargs)

            if c.kde_subset:
                plx_ran = np.copy(t['plx'][t['plx']>0.0])
                np.random.shuffle(plx_ran)
                plx_kde.fit( plx_ran[0:5000, np.newaxis] )
            else:
                plx_kde.fit( t['plx'][t['plx']>0.0][:, np.newaxis] )

    elif method == 'blocks':
        global plx_bins_blocks
        global plx_hist_blocks

        # Set up Bayesian Blocks
        print("Calculating Bayesian Blocks...")
        nbins = np.min([len(t), 40000])
        bins = bayesian_blocks(t['plx'][t['plx']>0.0][0:nbins])
        hist, bins = np.histogram(t['plx'][t['plx']>0.0][0:nbins], bins=bins, density=True)

        # Pad with zeros
        plx_bins_blocks = np.append(-1.0e100, bins)
        hist_pad = np.append(0.0, hist)
        plx_hist_blocks = np.append(hist_pad, 0.0)
        print("Bayesian Blocks set.")

    else:
        print("You must include a valid method")
        print("Options: kde or blocks")
        return
Example #13
def estimate_density(city):
    """Return a Gaussian KDE of venues in `city`."""
    kde = KernelDensity(bandwidth=175, rtol=1e-4)
    surround = xp.build_surrounding(DB.venue, city, likes=-1, checkins=1)
    kde.fit(surround.venues[:, :2])
    max_density = approximate_maximum_density(kde, surround.venues[:, :2])
    # pylint: disable=E1101
    return lambda xy: np.exp(kde.score_samples(xy))/max_density
Example #14
def time_sklearn():
    """
    Same as above, for scikit learn
    """
    global bandwidth, npoints, xmin, xmax, data
    sk_kde = KernelDensity(kernel = 'linear', bandwidth = bandwidth)
    sk_kde.fit(data[:,np.newaxis])
    grid = np.linspace(xmin, xmax, npoints)
    return np.exp(sk_kde.score_samples(grid[:,np.newaxis]))
Example #15
def train_patient_flow_estimator(df, bandwidth=1.0):
    """Train density estimator based on patient metric"""
    X = df.drop(['ADMIT_DATE'], axis=1).values
    estimator = KernelDensity(bandwidth=bandwidth,
                              kernel='gaussian',
                              metric='pyfunc',
                              metric_params={'func': patient_metric})
    estimator.fit(X)
    return estimator
Example #16
def test_sample_weight_invalid():
    # Check sample weighting raises errors.
    kde = KernelDensity()
    data = np.reshape([1.0, 2.0, 3.0], (-1, 1))

    sample_weight = [0.1, -0.2, 0.3]
    expected_err = "Negative values in data passed to `sample_weight`"
    with pytest.raises(ValueError, match=expected_err):
        kde.fit(data, sample_weight=sample_weight)
 def get_renyi_entropy(point, order, finding_status=None, pen_status=None):
     if finding_status is not None:
         point = Features.get_point_on(point, pen_status, finding_status)
     samples = np.array(point).reshape(-1, 1)
     kernel_density = KernelDensity(kernel='gaussian', bandwidth=0.2)
     kernel_density.fit(samples)
     log_probability = kernel_density.score_samples(samples)
     probability = np.exp(log_probability)
     return Features.__get_renyi_entropy(probability, order)
def CrossValidationScore(Xs, h, kernel='gaussian'):
    """Leave-one-out cross-validated log-likelihood of the KDE with bandwidth h."""
    kde = KernelDensity(bandwidth=h, kernel=kernel)
    ret = 0.
    for i in range(len(Xs)):
        # fit on every point except the i-th, then score the held-out point
        x = np.concatenate([Xs[0:i], Xs[i+1:]])
        kde.fit(x)
        ret += kde.score_samples(Xs[i].reshape(1, -1))
    ret /= (1. * len(Xs))
    return ret
Example #19
def test_sample_weight_invalid():
    # Check sample weighting raises errors.
    kde = KernelDensity()
    data = np.reshape([1., 2., 3.], (-1, 1))

    sample_weight = [0.1, -0.2, 0.3]
    expected_err = "sample_weight must have positive values"
    with pytest.raises(ValueError, match=expected_err):
        kde.fit(data, sample_weight=sample_weight)
Example #20
class TwoClassKDE(object):
    """Class for Kernel Density Estimator on two labels. Likelihood ratio at a point is ratio of class-1 likelihood estimate to class-0 likelihood estimate, times the class odds, where this is calculated as the posterior mean estimate under Beta(1, 1) prior, given the observations. If no points are observed for one of the classes, a default (improper) uniform prior is assumed for that class. """
    def __init__(self, **kwargs):
        """Takes same parameters as KernelDensity estimator."""
        self.kde0 = KernelDensity(**kwargs)
        self.kde1 = KernelDensity(**kwargs)
    def fit(self, X, y):
        """Fits KDE models on the data. X is array of data points, y is array of 0-1 labels."""
        y = np.asarray(y, dtype = int)
        self.n0, self.n1 = (y == 0).sum(), (y == 1).sum()
        assert (self.n0 + self.n1 == len(y)), "y must be vector of 1's and 0's."
        X0, X1 = X[y == 0], X[y == 1]
        if (self.n0 > 0):
            self.kde0.fit(X0)
        if (self.n1 > 0):
            self.kde1.fit(X1)
    def fit_with_optimal_bandwidth(self, X, y, gridsize = 101, dynamic_range = 100, cv = 10, verbose = 0, n_jobs = 1):
        """Determines optimal bandwidth using the following strategy: For each subset (0 or 1) of the dataset, 1) set b = 1.06 * sigma * n^(-1/5), the Silverman's rule of thumb estimate for the optimal bandwidth. sigma is the sample standard deviation of the samples after zero-centering the columns (note: ideally each column will have comparable variance), 2) set up a grid (of size gridsize) of bandwidth values to try, ranging from b / alpha to b * alpha in geometric progression, where alpha = sqrt(dynamic_range), 3) compute average likelihood of the estimator on the data using cv-fold cross-validation, 4) select the bandwidth with the highest likelihood."""
        y = np.asarray(y, dtype = int)
        self.n0, self.n1 = (y == 0).sum(), (y == 1).sum()
        assert (self.n0 + self.n1 == len(y)), "y must be vector of 1's and 0's."
        X0, X1 = X[y == 0], X[y == 1]
        if (self.n0 > 0):
            log_b0 = np.log(1.06) + np.log((X0 - X0.mean(axis = 0)).std()) - 0.2 * np.log(self.n0)
            grid0 = GridSearchCV(self.kde0, {'bandwidth' : np.exp(np.linspace(log_b0 - 0.5 * np.log(dynamic_range), log_b0 + 0.5 * np.log(dynamic_range), gridsize))}, cv = cv, verbose = verbose, n_jobs = n_jobs)
            grid0.fit(X0)
            self.kde0 = grid0.best_estimator_
        if (self.n1 > 0):
            log_b1 = np.log(1.06) + np.log((X1 - X1.mean(axis = 0)).std()) - 0.2 * np.log(self.n1)
            grid1 = GridSearchCV(self.kde1, {'bandwidth' : np.exp(np.linspace(log_b1 - 0.5 * np.log(dynamic_range), log_b1 + 0.5 * np.log(dynamic_range), gridsize))}, cv = cv, verbose = verbose, n_jobs = n_jobs)
            grid1.fit(X1)
            self.kde1 = grid1.best_estimator_    
    def get_params(self, **kwargs):
        return self.kde0.get_params(**kwargs)
    def set_params(self, **params):
        self.kde0.set_params(**params)
        self.kde1.set_params(**params)
        return self
    def score_samples(self, X):
        """Evaluate the density model on the data. Returns vector of log-likelihood ratios of class 1 over class 0."""
        p1_est = (self.n1 + 1) / (self.n0 + self.n1 + 2)
        class_log_odds = np.log(p1_est) - np.log(1 - p1_est)
        scores0 = self.kde0.score_samples(X) if (self.n0 > 0) else np.zeros(len(X), dtype = float)
        scores1 = self.kde1.score_samples(X) if (self.n1 > 0) else np.zeros(len(X), dtype = float)
        return scores1 - scores0 + class_log_odds
    def score(self, X, y = None):
        """Compute the overall log-likelihood ratio under the model."""
        return self.score_samples(X).sum()
    def predict_proba(self, X):
        """Probability estimates."""
        scores = self.score_samples(X)
        p0s = 1 / (1 + np.exp(scores))
        return np.array([p0s, 1 - p0s]).transpose()
    def predict_log_proba(self, X):
        """Log of probability estimates."""
        return np.log(self.predict_proba(X))
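
A short, self-contained usage sketch for TwoClassKDE (the two-class data below is synthetic and only for illustration):

import numpy as np

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(-1.0, 1.0, size=(200, 2)),     # class-0 samples
               rng.normal(+1.0, 1.0, size=(200, 2))])    # class-1 samples
y = np.array([0] * 200 + [1] * 200)

clf = TwoClassKDE(bandwidth=0.5)
clf.fit(X, y)
log_ratios = clf.score_samples(X[:5])   # log-likelihood ratio of class 1 over class 0
probs = clf.predict_proba(X[:5])        # columns: P(class 0), P(class 1)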
Example #21
 def kde_sklearn(self, x_grid, bandwidth=0.2, **kwargs):
     """Kernel Density Estimation with
     Scikit-learn"""
     from sklearn.neighbors import KernelDensity
     kde_skl = KernelDensity(bandwidth=bandwidth, **kwargs)
     kde_skl.fit(self.data[:, np.newaxis])
     # score_samples() returns the
     # log-likelihood of the samples
     log_pdf = kde_skl.score_samples(x_grid[:, np.newaxis])
     return np.exp(log_pdf)
Example #22
def gaussian_pdf(column):
    x = column.values
    x_d = np.linspace(min(x), max(x), 10000)

    # instantiate and fit the KDE model
    kde = KernelDensity(bandwidth=0.01, kernel='gaussian')
    kde.fit(x[:, None])

    # score_samples returns the log of the probability density
    return kde.score_samples(x_d[:, None]), x_d
def calculate_spike_rate_kernel_smoothing(spike_times, end):
    if len(spike_times) == 0:
        return np.zeros(1000)
    x_d = np.linspace(0, end, 1000)
    spike_times = np.array(spike_times)
    model = KernelDensity(bandwidth=0.1, kernel='gaussian')
    model.fit(spike_times[:, None])
    log_dens = model.score_samples(x_d[:, None])
    whole_bin_spike_rate = len(spike_times) / 2
    return np.exp(log_dens) * whole_bin_spike_rate
Example #24
def kde_estimator(x, y, x_grid, y_grid, bandwidth=0.2, **kwargs):
    """Kernel Density Estimation with Scikit-learn"""
    data = np.concatenate((x[:, None], y[:, None]), axis=1)
    data_grid = np.concatenate((x_grid[:, None], y_grid[:, None]), axis=1)

    kde_skl = KernelDensity(bandwidth=bandwidth, **kwargs)
    kde_skl.fit(data)
    # score_samples() returns the log-likelihood of the samples
    log_pdf = kde_skl.score_samples(data_grid)
    return np.exp(log_pdf)
def get_kde(file_path):
    data = pd.read_excel(file_path)
    features = np.array(data[[3]])
    features = scale(features)
    kde_ns = KernelDensity(kernel='gaussian', bandwidth=0.15)
    kde_ns.fit(features[:200])

    kde_ds = KernelDensity(kernel='gaussian', bandwidth=0.15)
    kde_ds.fit(features[-200:])
    return kde_ns, kde_ds
Example #27
def kde2D(x, y, xbins=200, ybins=10, **kwargs):
    """ Estimate a 2 dimensional pdf via kernel density estimation """
    xx, yy = np.mgrid[x.min():x.max():(xbins * 1j), y.min():y.max():(ybins * 1j)]
    xy_sample = np.vstack([yy.ravel(), xx.ravel()]).T
    xy_train  = np.vstack([y, x]).T
    kde = KernelDensity(**kwargs)
    kde.fit(xy_train)
    # score_samples() returns the log-density, so exponentiate to recover the density
    z = np.exp(kde.score_samples(xy_sample))
    return xx[:, 0], yy[0, :], np.reshape(z, yy.shape)
Example #28
def kde3d(x, y, z, data_point):
    values = np.vstack([x, y, z]).T
    # Use grid search cross-validation to optimize the bandwidth
    # params = {'bandwidth': np.logspace(-1, 1, 20)}
    kde = KernelDensity(bandwidth=0.3)
    kde.fit(values)
    kde_coords = kde.sample(10000)
    log_pdf = kde.score_samples(kde_coords)
    percentile = np.sum(log_pdf < kde.score(data_point))/10000.
    return (percentile)
Example #29
    def GetDensity(self, action='generate', samples=100, draws=None):
        """
        TODO: Check density calculations for multiple dimensions.
        Generate a density estimation of the positions at each time or sample positions
        from the generated density at a specified time.
        
        Parameters
        ----------
        action : (string) - Options: 'generate', 'sample'.
            'generate' : Generate a density estimation using kernel density estimation and save it.
            'sample' : Generate a density estimation at the final time and both draw and return samples
                from it equal to the number of points in position.
        samples : (int) - The number of sample points in each dimension at which to measure the density.
            Total number of points is samples ** dimensions.
        draws : (int) - The number of points to draw from the density distribution, if None,
            draw a number of points equal to the number of points in Positions.
            
        Returns
        -------
        'generate'        
            DensitySamples : (np.array) - An array of the positions of the points used to sample the density.
            Density : (np.array) - The value of the density evaluated at each points in DensitySamples.
        'sample'
            samples : (np.array) - An array of the samples drawn from the density generated from the positions
                at the final time.
        """
        if action == 'generate':
            #           A list of sample arrays ranging from the min value to the max value in each dimension.
            minmax = [
                np.linspace(np.amin(self.Positions[:, i, :]),
                            np.amax(self.Positions[:, i, :]), samples)
                for i in range(self.Positions.shape[1])
            ]
            self.DensitySamples = np.array(list(product(*minmax)))
            self.Density = np.zeros(
                (self.DensitySamples.shape[0], self.Times.shape[0]))

            for i in range(self.Positions.shape[2]):
                bandwidth = 0.2 * np.mean(pdist(self.Positions[:, :, i]))
                KDE = KernelDensity(bandwidth=bandwidth,
                                    kernel='gaussian',
                                    metric='euclidean')
                KDE.fit(self.Positions[:, :, i])
                self.Density[:, i] = np.exp(
                    KDE.score_samples(self.DensitySamples))

        elif action == 'sample':
            bandwidth = min(pdist(self.Positions[-1, :][:, np.newaxis]))
            KDE = KernelDensity(bandwidth=bandwidth,
                                kernel='gaussian',
                                metric='euclidean')
            KDE.fit(self.Positions[-1, :][:, np.newaxis])
            if draws is None:
                draws = self.Positions.shape[1]
            return KDE.sample(draws)
Example #30
class Density:
    def __init__(self, params: DensityParams) -> None:
        self.__params = self.__params_type_checked(params)
        self.__data = Manager().dict()
        controller_params = ControllerParams(self.__params, self.__data)
        self.__controller = Controller(controller_params)
        self.__density = frombuffer(self.__controller.gridmap.output.get_obj())
        self.__kde = KernelDensity(bandwidth=self.__params.kernel.bandwidth,
                                   kernel=self.__params.kernel.name,
                                   **KDE_PARAMETERS)
        self.__grid = self.__grid_from_params()  # Remove line!

    @property
    def control(self) -> Controller:
        return self.__controller

    @property
    def on_grid(self) -> ndarray:
        return self.__density.reshape(self.__params.grid.shape)

    def at(self, point: PointAt) -> float:
        if self.__data:
            point = self.__point_type_and_range_checked(point)
            n_points = len(self.__data)
            self.__kde.fit(self.__data.values())
            density = exp(self.__kde.score_samples(point.position[None]))
            return float(n_points * density)
        else:
            return 0.0

    def _compute_on_grid(self) -> ndarray:  # Remove line!
        if self.__data:  # Remove line!
            n_points = len(self.__data)  # Remove line!
            self.__kde.fit(self.__data.values())  # Remove line!
            density_on_grid = exp(self.__kde.score_samples(self.__grid))  #
            return n_points * density_on_grid.reshape(self.__params.grid.shape)

    @staticmethod
    def __params_type_checked(value: DensityParams) -> DensityParams:
        if type(value) is not DensityParams:
            raise TypeError('Parameters must be of type <DensityParams>!')
        return value

    def __point_type_and_range_checked(self, value: PointAt) -> PointAt:
        if type(value) is not PointAt:
            raise TypeError('Data point must be of type <PointAt>!')
        if not self.__params.bounds.contain(value):
            raise ValueError('Data point lies outside bounding box!')
        return value

    def __grid_from_params(self) -> ndarray:  # Remove line!
        x_line = linspace(*self.__params.bounds.x_range, self.__params.grid.x)
        y_line = linspace(*self.__params.bounds.y_range, self.__params.grid.y)
        x_grid, y_grid = meshgrid(x_line, y_line)  # Remove line!
        return column_stack((x_grid.ravel(), y_grid.ravel()))  # Remove line!
Example #31
def data_entropy(X, n_grid=1000, kernel=False, bandwidth=0.2, **kwargs):
    """
    Computes unidimensional entropy from data points.

    @param X Input matrix [n_samples, n_features].
    @param n_grid Number of grid points. Integer.
    @param kernel Boolean to set kernel method on/off.
    @param bandwidth Kernel bandwidth. Scalar.

    @return Vector of [n_features], with the corresponding feature entropy.
    """

    # Testing X dimension
    if (len(X.shape) > 1):

        # Computing per axis
        ent_h = np.apply_along_axis(data_entropy, 0, X, n_grid, kernel,\
         bandwidth, **kwargs)
        ent_h = np.array(ent_h)

    else:

        # Finding sample range
        x_max = X.max()
        x_min = X.min()

        # Testing for kernel method
        if kernel:

            # Computing random sampling
            rnd_idx = np.random.choice(X.shape[0], size=n_grid,\
             replace=False)

            # Kernel density estimation
            kde = KernelDensity(bandwidth=bandwidth, **kwargs)
            kde.fit(X[rnd_idx, np.newaxis])

            # Computing distro
            x_grid = np.linspace(x_min, x_max, n_grid)
            pdf = kde.score_samples(x_grid[:, np.newaxis])  # Log-likelihood
            pdf = np.exp(pdf)  # Distribution estimation

        else:

            # Computing grid
            x_grid = np.arange(x_min, x_max, bandwidth)

            # Computing histogram
            pdf, _ = np.histogram(X, bins=x_grid, density=True)

        # Computing entropy
        ent_h = entropy(pdf)

    # Return entropy
    return ent_h
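
A brief illustrative call of data_entropy on a synthetic two-column matrix, once with the histogram path and once with the KDE path:

import numpy as np

rng = np.random.RandomState(42)
X = np.column_stack([rng.normal(size=2000), rng.uniform(size=2000)])
h_hist = data_entropy(X, bandwidth=0.1)                            # histogram-based estimate
h_kde = data_entropy(X, n_grid=1000, kernel=True, bandwidth=0.1)   # KDE-based estimate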
Example #32
 def downsample(self, X, n):
     # we've already fit()ted, but we're worried that our X is so
     # large our classifier will be too slow in practice. we can
     # downsample by running a kde on X and sampling from it (this
     # will be slow, but happens only once), and then using those
     # points as the new X.
     if len(X) < n:
         return X
     kde = KernelDensity()
     kde.fit(X)
     return kde.sample(n)
Example #34
class WhitenedKDE(BaseEstimator, DensityMixin):
    def __init__(self, **kwargs):
        self.kde = KernelDensity(**kwargs)
        self.pre_whiten = PCA(whiten=True)

    def fit(self, X, y=None, sample_weight=None):
        self.kde.fit(self.pre_whiten.fit_transform(X))
        return self

    def score_samples(self, X):
        return self.kde.score_samples(self.pre_whiten.transform(X))
Example #35
 def _build():
     data = make_example_date()
     data = pd.concat(
         [data[data.blobID != 2], data[data.blobID == 2].sample(frac=0.25)])
     d = data['feature0'].values
     density = KernelDensity(bandwidth=0.5, kernel='gaussian')
     density.fit(d[:, None])
     x_d = np.linspace(min(d), max(d), 1000)
     prob = np.exp(density.score_samples(x_d[:, None]))
     peaks = find_peaks(prob)[0]
     return prob, peaks, x_d
Example #36
def calc_kerneldensity(df, aux_grid):
    hist_aux = []
    for i in range(0, df.shape[1]):
        kde_skl = KernelDensity(bandwidth=0.4)
        #aux = np.array(df_n['1000.0'])
        aux = np.copy(df[:, i])
        kde_skl.fit(aux[:, np.newaxis])
        log_pdf = kde_skl.score_samples(aux_grid[:, np.newaxis])
        hist_aux.append(np.exp(log_pdf) * 100)

    return hist_aux
Example #37
def getKDF(data, ax=None, **kwargs):
    """This function ingests a 1d array or list and 
    returns a plot object of a kernel
    density function"""
    data = data.values[:, None]
    ax = ax or plt.gca()
    xValues = np.linspace(min(data), max(data), 1000)
    kdensity = KernelDensity(**kwargs)
    kdensity.fit(data)
    logscore = kdensity.score_samples(xValues)
    return ax.plot(xValues, np.exp(logscore), 'r--', linewidth=3)
Example #38
    def perform_kde(self, data):
        """Perform kernel density estimation (KDE)

        Parameters
        ----------
        data : np.array
            sample data (1D)

        Returns
        -------
        np.array, np.array, float
            x values, y values, bandwidth

        .. _scikit-learn tutorial 1:
            https://scikit-learn.org/stable/auto_examples/neighbors/plot_kde_1d
            .html
        .. _scikit-learn tutorial 2:
            https://scikit-learn.org/stable/auto_examples/neighbors/plot_digits_
            kde_sampling.html
        .. _A useful article:
            https://jakevdp.github.io/blog/2013/12/01/kernel-density-estimation/
        """
        bw = self.bandwidth
        data_ = data[:, np.newaxis]
        scaler = StandardScaler()
        data_ = scaler.fit_transform(data[:, np.newaxis])
        estimator = KernelDensity(kernel='gaussian')

        # grid search optimization
        if bw == 'grid':
            kde = self.grid_kde(data_, estimator, self.bw_steps)
            bw = kde.bandwidth
            print('  Grid search-optimized bandwidth: {:g}.'.format(bw))

        # Silverman's rule-of-thumb
        elif bw == 'silverman':
            bw = self.silverman_bw(data)
            print(
                '  Bandwidth by Silverman\'s rule-of-thumb: {:g}.'.format(bw))
            setattr(estimator, 'bandwidth', bw)
            kde = estimator.fit(data_)

        # fixed bandwidth value
        elif isinstance(bw, float) and 0.1 <= bw <= 1.0:
            setattr(estimator, 'bandwidth', bw)
            kde = estimator.fit(data_)
        else:
            raise ValueError('Invalid bandwidth: {}.'.format(bw))

        # get density function
        x, y = self.density_func(data_, kde)
        x = scaler.inverse_transform(x)
        y = scaler.inverse_transform(y)
        return x, y, bw
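
perform_kde calls self.grid_kde, self.silverman_bw and self.density_func, which are defined elsewhere in the class. As an illustration only, a cross-validated bandwidth search in the spirit of the scikit-learn KDE examples might look like the following standalone sketch (the 0.1 to 1.0 search range is an assumption matching the fixed-bandwidth check above):

import numpy as np
from sklearn.model_selection import GridSearchCV

def grid_kde(data, estimator, bw_steps):
    # Hypothetical stand-in for self.grid_kde: cross-validated bandwidth search.
    # KernelDensity.score() (total log-likelihood) is used as the CV score.
    search = GridSearchCV(estimator, {'bandwidth': np.logspace(-1, 0, bw_steps)})
    search.fit(data)
    return search.best_estimator_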
Example #39
class DensityBasedOneClassClassifier:
    def __init__(self,
                 threshold=0.95,
                 kernel="gaussian",
                 bandwidth=1.0,
                 metric="euclidean",
                 should_downsample=False,
                 downsample_count=1000):

        self.should_downsample = should_downsample
        self.downsample_count = downsample_count
        self.threshold = threshold
        self.scaler = preprocessing.StandardScaler()
        if kernel == "really_linear":
            self.dens = NegativeMeanDistance(metric=metric)
        else:
            self.dens = KernelDensity(bandwidth=bandwidth,
                                      kernel=kernel,
                                      metric=metric)

    def fit(self, X):
        # scale
        self.scaler.fit(X)
        self.X = self.scaler.transform(X)
        # downsample?
        if self.should_downsample:
            self.X = self.downsample(self.X, self.downsample_count)
        # fit
        self.dens.fit(self.X)
        # transform relative threshold (eg 95%) to absolute
        dens = self.get_density(self.X, scale=False)  # no need to scale again
        self.abs_threshold = np.percentile(dens, 100 * (1 - self.threshold))

    def get_density(self, X, scale=True):
        if scale:
            X = self.scaler.transform(X)
        # in negative log-prob (for KDE), in negative distance (for NegativeMeanDistance)
        return self.dens.score_samples(X)

    def predict(self, X):
        dens = self.get_density(X)
        return dens < self.abs_threshold  # in both KDE and NMD, lower values are more anomalous

    def downsample(self, X, n):
        # we've already fit()ted, but we're worried that our X is so
        # large our classifier will be too slow in practice. we can
        # downsample by running a kde on X and sampling from it (this
        # will be slow, but happens only once), and then using those
        # points as the new X.
        if len(X) < n:
            return X
        kde = KernelDensity()
        kde.fit(X)
        return kde.sample(n)
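
An illustrative way to use DensityBasedOneClassClassifier; the training data and query points are synthetic, and the default Gaussian kernel is used so NegativeMeanDistance is not needed:

import numpy as np

rng = np.random.RandomState(0)
X_train = rng.normal(size=(500, 2))                 # "normal" observations
X_query = np.array([[0.0, 0.0], [8.0, 8.0]])        # an inlier and an obvious outlier

clf = DensityBasedOneClassClassifier(threshold=0.95, bandwidth=0.5)
clf.fit(X_train)
print(clf.predict(X_query))   # True flags points whose density falls below the threshold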
Example #40
def testAccuracy(X_train, y_train, X_test, y_test, output_result):

    for kernel in [
            'gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear',
            'cosine'
    ]:
        time_start = time.time()
        model = KernelDensity(kernel=kernel)
        model.fit(X_train, y_train)
        time_training = time.time() - time_start
        output_result(model, X_train, y_train, X_test, y_test, time_training)
def kde2D(x, y, bandwidth, xbins=256j, ybins=256j, **kwargs):
    xx, yy = np.mgrid[x.min():x.max():xbins, y.min():y.max():ybins]

    xy_sample = np.vstack([yy.ravel(), xx.ravel()]).T
    xy_train = np.vstack([y, x]).T

    kde_skl = KernelDensity(bandwidth=bandwidth, **kwargs)
    kde_skl.fit(xy_train)

    z = np.exp(kde_skl.score_samples(xy_sample))
    return xx, yy, np.reshape(z, xx.shape)
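
A short usage sketch for the kde2D just above; the complex bin counts tell np.mgrid how many grid points to generate, and the data here are synthetic:

import numpy as np

rng = np.random.RandomState(1)
x = rng.normal(size=2000)
y = 0.5 * x + rng.normal(scale=0.5, size=2000)
xx, yy, zz = kde2D(x, y, bandwidth=0.3)   # zz is the density on the 256x256 grid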
Example #42
class KDEDist(object):
    def __init__(self, bw, kernel='gaussian'):
        self._bw = bw
        self._kernel = kernel
        self._kd = KernelDensity(bandwidth=bw, kernel=kernel)
        self._samples = None

    @staticmethod
    def bw_range(x, n=3):
        max_pwr = 2
        h_opt = np.std(x) * (4. / (3. * len(x)))**0.2
        pwrs = np.concatenate([
            np.linspace(-max_pwr, 0, n + 1),
            np.linspace(0, max_pwr, n + 1)[1:]
        ])
        return h_opt * 2**pwrs

    @property
    def name(self):
        return 'KDE({}, {:.5f})'.format(self._kernel, self._bw)

    @property
    def samples(self):
        if self._samples is None:
            self._samples = self.rvs(100000)
        return self._samples

    def dist(self):
        return self

    def fit(self, x):
        self._kd.fit(np.reshape(x, (len(x), 1)))
        return self

    def logpdf(self, x):
        return self._kd.score_samples(np.reshape(x, (len(x), 1)))

    def rvs(self, n):
        return self._kd.sample(n).reshape(n)

    def stats(self, moments='mv'):
        out = []
        if 'm' in moments:
            out.append(np.array([np.mean(self.samples)]))
        if 'v' in moments:
            out.append(np.array([np.var(self.samples)]))
        if 's' in moments:
            out.append(np.array([skew(self.samples)]))
        if 'k' in moments:
            out.append(np.array([kurtosis(self.samples)]))
        return tuple(out)

    def ppf(self, q):
        return np.percentile(self.samples, q)
Example #43
def get_mode(vals):
    h = 1.06 * np.std(vals) * len(vals)**(-1.0 / 5.0)
    kdf = KernelDensity(bandwidth=h)
    kdf.fit(np.array(vals).reshape(len(vals), 1))

    def neg_kdf(x):
        return -kdf.score(np.array((x, )))

    res = minimize(neg_kdf, x0=np.median(vals), method='Nelder-Mead')
    assert res.success, res
    return float(res.x)
Example #44
def alternate_umi(x,
                  y,
                  k=5,
                  density_estimation_method="kde",
                  k_density=5,
                  bw=.2):
    assert len(x) == len(y), "Lists should have same length"
    assert k <= len(x) - 1, "Set k smaller than num. samples - 1"
    N = len(x)
    dx = len(x[0])
    dy = len(y[0])
    data = np.concatenate((x, y), axis=1)

    tree_xy = ss.cKDTree(data)
    tree_x = ss.cKDTree(x)
    tree_y = ss.cKDTree(y)

    if density_estimation_method.lower() == "kde":
        kernel = KernelDensity(bandwidth=bw)
        kernel.fit(x)
        kde = np.exp(kernel.score_samples(x))
        weight = (1 / kde) / np.mean(1 / kde)

    elif density_estimation_method.lower() == "knn":
        knn_dis = [
            tree_x.query(point, k_density + 1, p=np.inf)[0][k_density]
            for point in x
        ]
        density_estimate = np.array([
            float(k_density) / N / knn_dis[i]**dx for i in range(len(knn_dis))
        ])
        weight = (1 / density_estimate) / np.mean(1 / density_estimate)

    else:
        raise ValueError("The density estimation method is not recognized")

    knn_dis = [tree_xy.query(point, k + 1, p=2)[0][k] for point in data]
    ans = log(k) + 2 * log(N - 1) - digamma(N) + vd(dx) + vd(dy) - vd(dx + dy)

    # weight_y = np.zeros(N)
    # for i in range(N):
    #     weight_y[i] = np.sum(weight[j] for j in tree_y.query_ball_point(y[i], knn_dis[i], p=2)) - weight[i]
    # weight_y *= N/np.sum(weight_y)

    for i in range(N):
        nx = len(tree_x.query_ball_point(x[i], knn_dis[i], p=2)) - 1
        ny = len(tree_y.query_ball_point(y[i], knn_dis[i], p=2)) - 1
        ans += -weight[i] * log(nx) / N
        ans += -weight[i] * log(ny) / N
        # for j in tree_y.query_ball_point(y[i], knn_dis[i], p=2):
        # ans += -weight[j] * log(weight[j]) /N/ny
        # ans += -weight[i] * log(weight[i]) / N

    return ans
Example #45
File: ml.py Project: rramosp/rlx
def kdensity(x):
    import numbers
    if len(x.shape) != 1:
        raise ValueError("x must be a vector. found " + str(x.shape) +
                         " dimensions")
    stdx = np.std(x)
    bw = 1.06 * stdx * len(x)**-.2 if stdx != 0 else 1.
    kd = KernelDensity(bandwidth=bw)
    kd.fit(x.reshape(-1, 1))

    func = lambda z: np.exp(kd.score_samples(np.array(z).reshape(-1, 1)))
    return func
Example #46
class colorKDE(object):
	def __init__(self,data=np.array([])):
		self.data = data
	
		
	def runKDE(self,bandwidth=0.2,use_opt=False):
		'''
		Generate the KDE and run with the given bandwidth
		
		If use_opt is specified, runCVSearch must have been run already
		'''
		if use_opt:
			self.kde = KernelDensity(bandwidth=self.optimal_bandwidth)
		else:
			self.kde = KernelDensity(bandwidth=bandwidth)
		
		self.kde.fit(self.data)
		
	def runCVSearch(self,search_range=np.linspace(0.01,1.0,50),folds=20):
		self.grid = GridSearchCV(KernelDensity(),{'bandwidth':search_range},\
			cv=folds)
		self.grid.fit(self.data)
		self.optimal_bandwidth=self.grid.best_params_['bandwidth']
		print('Optimal bandwidth: ' + str(self.optimal_bandwidth))
		
	def score_samples(self,x):
		'''
		Replicate score_samples functionality so both saves
		can be treated the same
		'''
		return self.kde.score_samples(x)
		
	def sample(self,n_samples):
		'''
		Replicate samples functionality so both saves
		can be treated the same
		'''
		return self.kde.sample(n_samples=n_samples)
		
	
	def save(self,filename,full=True):
		'''
		Save current state of the object
		
		If full is false, only save self.kde
		'''
		if full:
			#save the entire object, including data
			pickle.dump(self,open(filename,'wb'),protocol=-1)
			
		else:
			#only save the .kde object
			pickle.dump(self.kde,open(filename,'wb'),protocol=-1)
Example #47
    def __call__(self, **kwargs):
        """
        Runs block of analysis
        """
        from sklearn.neighbors import KernelDensity

        kde     = KernelDensity(bandwidth = self.bandwidth, **kwargs)
        kde.fit(self.coord.flatten()[:, np.newaxis])
        log_pdf = kde.score_samples(self.grid[:, np.newaxis])
        pdf     = np.exp(log_pdf)

        self.datasets[self.outputs[0]]["kde"] = pdf
Example #48
def kde_dist(ax, x, bw=None, color='k'):
    x_grid = np.linspace(np.min(x), np.max(x), 1000)
    if bw is None:
        bw = np.std(x)*float(len(x))**(-1/5.)
    kde_skl = KernelDensity(bandwidth=bw)
    kde_skl.fit(x[:, np.newaxis])
    # score_samples() returns the log-likelihood of the samples
    log_pdf = kde_skl.score_samples(x_grid[:, np.newaxis])
    est = np.exp(log_pdf)
    ax.plot(x_grid, est, color=color, lw=0.5)
    
    return est
Example #49
def kde_sklearn(x, x_grid, bandwidth=0.2, **kwargs):
    """Kernel Density Estimation with Scikit-learn"""

    # kde from https://jakevdp.github.io/blog/2013/12/01/kernel-density-estimation/

    kde_skl = KernelDensity(bandwidth=bandwidth, **kwargs)
    kde_skl.fit(x[:, np.newaxis])

    # score_samples() returns the log-likelihood of the samples

    log_pdf = kde_skl.score_samples(x_grid[:, np.newaxis])
    return np.exp(log_pdf)
Example #50
def get_n_m_kde(magnitude, bin_centre, area, bandwidth=0.2):
    """Compute n(m)
    Density of sources per unit of area in a non-cumulative
    fashion using a KDE.
    For this function we need the centre of the bins instead 
    of the edges.
    **Note that the output is non-cumulative**
    """
    kde_skl = KernelDensity(bandwidth=bandwidth)
    kde_skl.fit(magnitude[:, np.newaxis])
    pdf = np.exp(kde_skl.score_samples(bin_centre[:, np.newaxis]))
    return pdf/area*len(magnitude)/np.sum(pdf)
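
A minimal illustrative call of get_n_m_kde; the magnitudes and the survey area are made up:

import numpy as np

mags = np.random.normal(20.0, 1.5, size=5000)    # illustrative magnitudes
centres = np.arange(16.0, 24.0, 0.2) + 0.1       # bin centres, not edges
n_m = get_n_m_kde(mags, centres, 10.0)           # sources per magnitude bin per unit area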
Example #51
    def fn(inst):
        if not 'x' in inst:
            raise Exception('no x')

        x = inst['x']

        kde = KernelDensity(*args, **margs)
        kde.fit(x)

        log_pdf = kde.score_samples(x)
        pdf = np.exp(log_pdf)

        return inst.set('model', kde).set('pdf', pdf)
Example #52
def train_kde(xy, label):
    params = {"bandwidth": np.logspace(-5, 5, 20), "kernel": ["gaussian", "exponential"]}
    # do a grid search
    try:
        grid = GridSearchCV(KernelDensity(metric="haversine", algorithm="ball_tree"), params)
        grid.fit(xy)
        return grid.best_estimator_
    except ValueError:
        k = KernelDensity(
            metric="haversine", algorithm="ball_tree", bandwidth=best_global_bandwidths[label], kernel="exponential"
        )
        k.fit(xy)
        return k
Example #53
def test_kde_algorithm_metric_choice(algorithm, metric):
    # Smoke test for various metrics and algorithms
    rng = np.random.RandomState(0)
    X = rng.randn(10, 2)    # 2 features required for haversine dist.
    Y = rng.randn(10, 2)

    if algorithm == 'kd_tree' and metric not in KDTree.valid_metrics:
        assert_raises(ValueError, KernelDensity,
                      algorithm=algorithm, metric=metric)
    else:
        kde = KernelDensity(algorithm=algorithm, metric=metric)
        kde.fit(X)
        y_dens = kde.score_samples(Y)
        assert_equal(y_dens.shape, Y.shape[:1])
Example #54
def mode(x):

    x = np.array(x)
    
    # fit kde
    kde_skl = KernelDensity()
    kde_skl.fit(x[:, np.newaxis])

    # find max on log grid
    log_min = np.log(min(x)) / np.log(10)
    log_max = np.log(max(x)) / np.log(10)
    x_grid = np.logspace(log_min, log_max, 100000)
    log_pdf = kde_skl.score_samples(x_grid[:, np.newaxis])
    return x_grid[log_pdf.argmax()]
Example #55
def get_P_binary_v_tot(proj_sep, delta_v_tot, num_sys=100000):
    """ This function calculates the probability of a
    random star having the observed proper motion

    Parameters
    ----------
    proj_sep : float
        Projected separation between two stars
    delta_v_tot : float
        Total velocity difference between two stars

    Returns
    -------
    P(proj_sep, delta_v_tot) : float
        Probability that angular separation, pm+RV difference
        is due to a genuine binary
    """

    # Catalog check
    global binary_set

    if binary_set is None:
        generate_binary_set(num_sys=num_sys)

    # Use a Gaussian KDE
    global binary_v_tot_kde
    # We work in log space for the set of binaries

    if binary_v_tot_kde is None:
        kwargs = {'kernel':'tophat'}
        binary_v_tot_kde = KernelDensity(bandwidth=0.1, **kwargs)
        binary_v_tot_kde.fit( np.array([np.log10(binary_set['proj_sep']), np.log10(binary_set['delta_v_tot'])]).T )

    if isinstance(delta_v_tot, np.ndarray) and isinstance(proj_sep, np.ndarray):
        values = np.array([np.log10(proj_sep), np.log10(delta_v_tot)]).T
        prob_binary = np.exp(binary_v_tot_kde.score_samples(values))

    elif isinstance(delta_v_tot, np.ndarray):
        values = np.array([np.log10(proj_sep)*np.ones(len(delta_v_tot)), np.log10(delta_v_tot)]).T
        prob_binary = np.exp(binary_v_tot_kde.score_samples(values))
    else:
        values = np.array([[np.log10(proj_sep), np.log10(delta_v_tot)]])  # score_samples expects a 2-D array
        prob_binary = np.exp(binary_v_tot_kde.score_samples(values))


    # Convert back from log10-space to linear-space
    # the log(10) terms convert from log10 to ln
    prob_binary = prob_binary / (proj_sep*np.log(10.)) / (delta_v_tot*np.log(10.))

    return prob_binary
Example #56
def kde_sklearn(x, x_grid, bandwidth=0.2, **kwargs):
    """Kernel Density Estimation with Scikit-learn"""
    kde_skl = KernelDensity(bandwidth=bandwidth, **kwargs)
    
    x = ul.fnp(x)
    x_grid = ul.fnp(x_grid)
    print (x.shape)
#    if (x.shape[1] == 1):
#        x = x[:, np.newaxis]
#        x_grid = x_grid[:, np.newaxis]
        
    kde_skl.fit(x)
    # score_samples() returns the log-likelihood of the samples
    log_pdf = kde_skl.score_samples(x_grid)
    return np.exp(log_pdf)
Example #57
def kde(data, rng, grid_size=10,  **kwargs):
    """Kernel Density Estimation with Scikit-learn"""
    n_samples = data.shape[0]
    n_dims = data.shape[1]

    bandwidth = (n_samples * (n_dims + 2) / 4.)**(-1. / (n_dims + 4.))
    kde_skl = KernelDensity(bandwidth=bandwidth, **kwargs)
    kde_skl.fit(data)

    space = [linspace(i[0], i[1], grid_size) for i in rng]
    grid = meshgrid(*tuple(space))

    # score_samples() returns the log-likelihood of the samples
    log_pdf = kde_skl.score_samples(vstack(map(ravel, grid)).T)
    return exp(log_pdf), space