def EstimateDensity(self, name, df, histogram, f, s, ax):
    # if the desired output is in histogram format
    if histogram:
        finRes = []
        lab = []
        for i in range(5):
            res = np.array(df[df[f] == i][s])
            if res.shape[0] > 0:
                finRes.append(res)
                lab.append(name[0] + ' = ' + str(i))
        pl.hist(finRes, bins=2, density=True, histtype='bar', label=lab)
    # if the desired output is a simple plot
    else:
        for i in range(5):
            res = np.array(df[df[f] == i][s])
            if res.shape[0] > 0:
                res = res.reshape(res.shape[0], 1)
                X_plot = np.array(np.linspace(-1, 5, 20)).reshape(20, 1)
                kde = KernelDensity(kernel='exponential', bandwidth=0.05)
                kde.fit(res)
                log_dens = kde.score_samples(X_plot)
                ax.plot(X_plot, np.exp(log_dens),
                        label=name[0] + ' = ' + str(i))
        ax.legend()
        ax.set_title(name[1] + " distribution for changing " + name[0])
def surface_density(c, bandwidth=0.2, grid_step=0.02):
    """ Given particle positions as a coordinate object, compute the surface
        density using a kernel density estimate.
    """
    if not HAS_SKLEARN:
        raise ImportError("scikit-learn is required to use this function.")

    xgrid = np.arange(2., 9. + 0.1, grid_step)      # deg
    ygrid = np.arange(26.5, 33.5 + 0.1, grid_step)  # deg
    shp = (xgrid.size, ygrid.size)
    meshies = np.meshgrid(xgrid, ygrid)
    # materialize the map() so np.vstack gets a sequence (required on Python 3)
    grid = np.vstack(list(map(np.ravel, meshies))).T

    x = c.l.degree
    y = c.b.degree
    skypos = np.vstack((x, y)).T

    kde = KernelDensity(bandwidth=bandwidth, kernel='epanechnikov')
    kde.fit(skypos)

    dens = np.exp(kde.score_samples(grid)).reshape(meshies[0].shape)
    log_dens = np.log10(dens)

    return grid, log_dens
def cistrans(args):
    cob = co.COB(args.cob)
    if args.out is None:
        args.out = '{}_cistrans'.format(cob.name)
    # np.newaxis adds an empty axis in that position of the slice;
    # sklearn requires the values to be in the rows:
    # http://scikit-learn.org/stable/auto_examples/neighbors/plot_kde_1d.html
    cis = cob.coex \
        .score[cob.coex.distance <= args.cis_distance] \
        .values[:, np.newaxis]
    trans = cob.coex \
        .score[np.isinf(cob.coex.distance)] \
        .values[:, np.newaxis]
    X_plot = np.linspace(-10, 10, 1000)[:, np.newaxis]
    print(
        'Found {:,} cis interactions and {:,} trans interactions'.format(
            cis.shape[0],
            trans.shape[0]
        ))
    # Fit the kernel for the cis interactions
    kd = KernelDensity(bandwidth=0.2)
    kd.fit(cis)
    cis_kde = np.exp(kd.score_samples(X_plot))
    plt.fill(X_plot, cis_kde, alpha=0.5, label='Cis Interactions')
    # Fit the kernel for the trans interactions
    kd.fit(trans[0:50000])
    trans_kde = np.exp(kd.score_samples(X_plot))
    plt.fill(X_plot, trans_kde, alpha=0.5, label='Trans Interactions')
    plt.legend()
    plt.title('Cis vs Trans Density: {}'.format(cob.name))
    # Calculate the Mann-Whitney U test
    u, pval = sp.stats.mannwhitneyu(cis[:, 0], trans[:, 0])
    print('P-val: {}'.format(pval))
    plt.savefig(args.out + '.png')
def plot_sklearn_kde(df, support, column='AirTime', bins=50):
    """
    Plots a KDE and a histogram using sklearn.KernelDensity.
    Uses Gaussian kernels.
    The optimal bandwidth is calculated according to Silverman's rule of thumb.

    Parameters
    ----------
    df: A pandas.DataFrame
    support: A 1-d numpy array.
             Input data points for the probability density function.

    Returns
    -------
    A matplotlib.axes.Axes instance.
    """
    bw = get_silverman_bandwidth(df, column)

    kde = KernelDensity(kernel='gaussian', bandwidth=bw)

    x = df[column].values

    kde.fit(x[:, np.newaxis])
    y = kde.score_samples(support[:, np.newaxis])

    fig, ax = plt.subplots(figsize=(8, 5))

    ax.hist(np.ravel(x), bins=bins, alpha=0.5,
            color=sns.xkcd_rgb["denim blue"], density=True)
    ax.plot(support, np.exp(y))

    ax.set_xlabel(column, fontsize=14)
    ax.set_ylabel('Density', fontsize=14)
    ax.set_title('Kernel Density Plot', fontsize=14)
    sns.despine(ax=ax, offset=5, trim=True)

    return ax
def sklearn_kde(data, points):
    from numpy import mean, std, exp
    from sklearn.neighbors import KernelDensity

    # Silverman bandwidth estimator
    n, d = data.shape
    bandwidth = (n * (d + 2) / 4.)**(-1. / (d + 4))

    # standardize data so that we can use uniform bandwidth
    mu, sigma = mean(data, axis=0), std(data, axis=0)
    data, points = (data - mu) / sigma, (points - mu) / sigma

    #print("starting grid search for bandwidth over %d points"%n)
    #from sklearn.model_selection import GridSearchCV
    #from numpy import logspace
    #params = {'bandwidth': logspace(-1, 1, 20)}
    #fitter = GridSearchCV(KernelDensity(), params)
    #fitter.fit(data)
    #kde = fitter.best_estimator_
    #print("best bandwidth: {0}".format(kde.bandwidth))
    #import time; T0 = time.time()
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth,
                        rtol=1e-6, atol=1e-6)
    #print("T:%6.3f fitting"%(time.time()-T0))
    kde.fit(data)
    #print("T:%6.3f estimating"%(time.time()-T0))
    log_pdf = kde.score_samples(points)
    #print("T:%6.3f done"%(time.time()-T0))
    return exp(log_pdf)
def kde_sklearn(x, x_grid, bandwidth=0.2, **kwargs):
    """Kernel Density Estimation with Scikit-learn"""
    kde_skl = KernelDensity(bandwidth=bandwidth, **kwargs)
    kde_skl.fit(x[:, np.newaxis])
    # score_samples() returns the log-likelihood of the samples
    log_pdf = kde_skl.score_samples(x_grid[:, np.newaxis])
    return np.exp(log_pdf)
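A minimal usage sketch for the `kde_sklearn` helper above; the `_demo_` wrapper, sample data, and grid are invented for illustration, and `numpy as np` plus the function itself are assumed to be in scope:

def _demo_kde_sklearn():
    import numpy as np
    rng = np.random.RandomState(0)
    # Bimodal toy data: two Gaussian bumps.
    x = np.concatenate([rng.normal(0, 1, 300), rng.normal(5, 1, 200)])
    x_grid = np.linspace(-4, 9, 500)
    # Evaluate the estimated pdf on the grid with a Gaussian kernel.
    pdf = kde_sklearn(x, x_grid, bandwidth=0.3, kernel='gaussian')
    assert pdf.shape == x_grid.shape
    return pdf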
def kdescatter(xs, ys, log_color=False, atol=1e-4, rtol=1e-4,
               n_jobs=1, n_samp_scaling=100, n_samp_tuning=1000,
               ax=None, **kwargs):
    if ax is None:
        import matplotlib.pyplot as plt
        ax = plt
    kwargs.setdefault('linewidths', 0)
    kwargs.setdefault('s', 20)
    kwargs.setdefault('cmap', 'winter')

    X = np.asarray([xs, ys]).T
    n = X.shape[0]
    samp_X = X[np.random.choice(n, min(n_samp_scaling, n), replace=False)]
    median_sqdist = np.median(euclidean_distances(samp_X, squared=True))
    bws = np.logspace(-2, 2, num=10) * np.sqrt(median_sqdist)
    est = GridSearchCV(KernelDensity(), {'bandwidth': bws}, n_jobs=n_jobs)
    est.fit(X[np.random.choice(n, min(n_samp_tuning, n), replace=False)])
    bw = est.best_params_['bandwidth']

    kde = KernelDensity(bandwidth=bw)
    kde.fit(X)
    densities = kde.score_samples(X)
    if not log_color:
        np.exp(densities, out=densities)
    ax.scatter(xs, ys, c=densities, **kwargs)
def kde_opt4(df_cell_train_feats, y_train, df_cell_test_feats):
    def prepare_feats(df):
        df_new = pd.DataFrame()
        df_new["hour"] = df["hour"]
        df_new["weekday"] = df["weekday"] + df["hour"] / 24.
        df_new["accuracy"] = df["accuracy"].apply(lambda x: np.log10(x))
        df_new["x"] = df["x"]
        df_new["y"] = df["y"]
        return df_new

    logging.info("train kde_opt4 model")
    df_cell_train_feats_kde = prepare_feats(df_cell_train_feats)
    df_cell_test_feats_kde = prepare_feats(df_cell_test_feats)
    n_class = len(np.unique(y_train))
    y_test_pred = np.zeros((len(df_cell_test_feats_kde), n_class), "d")
    for i in range(n_class):
        X = df_cell_train_feats_kde[y_train == i]
        y_test_pred_i = np.ones(len(df_cell_test_feats_kde), "d")
        for feat in df_cell_train_feats_kde.columns.values:
            X_feat = X[feat].values
            BGK10_output = kdeBGK10(X_feat)
            if BGK10_output is None:
                kde = gaussian_kde(X_feat, "scott")
                kde = gaussian_kde(X_feat, kde.factor * 0.741379)
                y_test_pred_i *= kde.evaluate(
                    df_cell_test_feats_kde[feat].values)
            else:
                bandwidth, mesh, density = BGK10_output
                kde = KernelDensity(kernel='gaussian', metric='manhattan',
                                    bandwidth=bandwidth)
                kde.fit(X_feat[:, np.newaxis])
                y_test_pred_i *= np.exp(kde.score_samples(
                    df_cell_test_feats_kde[feat].values[:, np.newaxis]))
        y_test_pred[:, i] += y_test_pred_i
    return y_test_pred
def find_kernel(data, numgrid=1000, bw=0.002):
    Xtrain = data[:, 0:2]
    ytrain = data[2]

    # Set up the data grid for the contour plot
    # (np.linspace takes the number of points positionally; it has no
    # `numgrid` keyword)
    xgrid = np.linspace(-74.1, -73.65, numgrid)
    ygrid = np.linspace(40.5, 40.8, numgrid)
    X, Y = np.meshgrid(xgrid, ygrid)

    xy = np.vstack([Y.ravel(), X.ravel()]).T

    # Plot map of the estimated density
    fig = plt.figure()

    # construct a kernel density estimate of the distribution
    kde = KernelDensity(bandwidth=bw, kernel='gaussian')
    kde.fit(Xtrain)

    # evaluate the density on the grid
    Z = np.exp(kde.score_samples(xy))
    Z = Z.reshape(X.shape)

    # plot contours of the density
    levels = np.linspace(0, Z.max(), 25)
    plt.contourf(X, Y, Z, levels=levels, cmap=plt.cm.Reds)
    plt.title('BK CRIME')
    plt.show()
    return Z
def sklearn_density(sample_points, evaluation_points):
    """
    Estimate the probability density function from which a set of sample
    points was drawn and return the estimated density at the evaluation
    points.
    """
    from numpy import mean, std, exp, prod
    from sklearn.neighbors import KernelDensity

    # Silverman bandwidth estimator
    n, d = sample_points.shape
    bandwidth = (n * (d + 2) / 4.)**(-1. / (d + 4))

    # Standardize data so that we can use uniform bandwidth.
    # Note that we will need to scale the resulting density by sigma to
    # correct the area.
    mu, sigma = mean(sample_points, axis=0), std(sample_points, axis=0)
    data, points = (sample_points - mu) / sigma, (evaluation_points - mu) / sigma

    #print("starting grid search for bandwidth over %d points"%n)
    #from sklearn.model_selection import GridSearchCV
    #from numpy import logspace
    #params = {'bandwidth': logspace(-1, 1, 20)}
    #fitter = GridSearchCV(KernelDensity(), params)
    #fitter.fit(data)
    #kde = fitter.best_estimator_
    #print("best bandwidth: {0}".format(kde.bandwidth))
    #import time; T0 = time.time()
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth,
                        rtol=1e-6, atol=1e-6)
    #print("T:%6.3f fitting"%(time.time()-T0))
    kde.fit(data)
    #print("T:%6.3f estimating"%(time.time()-T0))
    log_pdf = kde.score_samples(points)
    #print("T:%6.3f done"%(time.time()-T0))
    return exp(log_pdf) / prod(sigma)  # undo the x scaling on the data points
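A quick sketch of calling `sklearn_density` above; the `_demo_` wrapper and the 2-D sample data are invented for illustration, with `numpy as np` assumed:

def _demo_sklearn_density():
    import numpy as np
    rng = np.random.RandomState(1)
    # Correlated 2-D Gaussian cloud.
    samples = rng.multivariate_normal([0, 0], [[1, 0.5], [0.5, 2]], size=500)
    # Evaluate the estimated density back at the sample locations.
    pdf = sklearn_density(samples, samples)
    assert pdf.shape == (500,) and np.all(pdf > 0)
    return pdf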
def kde_sklearn(data, grid, **kwargs):
    """
    Kernel Density Estimation with Scikit-learn

    Parameters
    ----------
    data : numpy.array
        Data points used to compute a density estimator. It
        has `n x p` dimensions, representing n points and p
        variables.
    grid : numpy.array
        Data points at which the density will be estimated. It
        has `m x p` dimensions, representing m points and p
        variables.

    Returns
    -------
    out : numpy.array
        Density estimate. Has `m x 1` dimensions
    """
    kde_skl = KernelDensity(**kwargs)
    kde_skl.fit(data)
    # score_samples() returns the log-likelihood of the samples
    log_pdf = kde_skl.score_samples(grid)
    return np.exp(log_pdf)
def set_plx_kde(t, bandwidth=0.3, method='sklearn_kde'):
    """ Set the plx_kde

    Parameters
    ----------
    t : ndarray float
        Catalog of parallax measures (units: mas)
    bandwidth : float
        Bandwidth for gaussian_kde (optional, 0.01 recommended)
    method : string
        Method for density determination
        (options: scipy_kde, sklearn_kde, blocks)
    """
    global plx_kde

    if method == 'scipy_kde':
        if plx_kde is None:
            # We are only going to allow parallaxes above some minimum value
            if bandwidth is None:
                plx_kde = gaussian_kde(t['plx'][t['plx'] > 0.0])
            else:
                plx_kde = gaussian_kde(t['plx'][t['plx'] > 0.0],
                                       bw_method=bandwidth)
    elif method == 'sklearn_kde':
        if plx_kde is None:
            kwargs = {'kernel': 'tophat'}
            if bandwidth is None:
                plx_kde = KernelDensity(**kwargs)
            else:
                plx_kde = KernelDensity(bandwidth=bandwidth, **kwargs)
            if c.kde_subset:
                plx_ran = np.copy(t['plx'][t['plx'] > 0.0])
                np.random.shuffle(plx_ran)
                plx_kde.fit(plx_ran[0:5000, np.newaxis])
            else:
                plx_kde.fit(t['plx'][t['plx'] > 0.0][:, np.newaxis])
    elif method == 'blocks':
        global plx_bins_blocks
        global plx_hist_blocks

        # Set up Bayesian Blocks
        print("Calculating Bayesian Blocks...")
        nbins = np.min([len(t), 40000])
        bins = bayesian_blocks(t['plx'][t['plx'] > 0.0][0:nbins])
        hist, bins = np.histogram(t['plx'][t['plx'] > 0.0][0:nbins],
                                  bins=bins, density=True)

        # Pad with zeros
        plx_bins_blocks = np.append(-1.0e100, bins)
        hist_pad = np.append(0.0, hist)
        plx_hist_blocks = np.append(hist_pad, 0.0)
        print("Bayesian Blocks set.")
    else:
        print("You must include a valid method")
        print("Options: scipy_kde, sklearn_kde, or blocks")
        return
def estimate_density(city):
    """Return a Gaussian KDE of venues in `city`."""
    kde = KernelDensity(bandwidth=175, rtol=1e-4)
    surround = xp.build_surrounding(DB.venue, city, likes=-1, checkins=1)
    kde.fit(surround.venues[:, :2])
    max_density = approximate_maximum_density(kde, surround.venues[:, :2])
    # pylint: disable=E1101
    return lambda xy: np.exp(kde.score_samples(xy)) / max_density
def time_sklearn():
    """ Same as above, for scikit-learn """
    global bandwidth, npoints, xmin, xmax, data

    sk_kde = KernelDensity(kernel='linear', bandwidth=bandwidth)
    sk_kde.fit(data[:, np.newaxis])
    grid = np.linspace(xmin, xmax, npoints)
    return np.exp(sk_kde.score_samples(grid[:, np.newaxis]))
def train_patient_flow_estimator(df, bandwidth=1.0):
    """Train density estimator based on patient metric"""
    X = df.drop(['ADMIT_DATE'], axis=1).values
    estimator = KernelDensity(bandwidth=bandwidth, kernel='gaussian',
                              metric='pyfunc',
                              metric_params={'func': patient_metric})
    estimator.fit(X)
    return estimator
def test_sample_weight_invalid():
    # Check sample weighting raises errors.
    kde = KernelDensity()
    data = np.reshape([1.0, 2.0, 3.0], (-1, 1))

    sample_weight = [0.1, -0.2, 0.3]
    expected_err = "Negative values in data passed to `sample_weight`"
    with pytest.raises(ValueError, match=expected_err):
        kde.fit(data, sample_weight=sample_weight)
def get_renyi_entropy(point, order, finding_status=None, pen_status=None):
    if finding_status is not None:
        point = Features.get_point_on(point, pen_status, finding_status)
    samples = np.array(point).reshape(-1, 1)
    kernel_density = KernelDensity(kernel='gaussian', bandwidth=0.2)
    kernel_density.fit(samples)
    log_probability = kernel_density.score_samples(samples)
    probability = np.exp(log_probability)
    return Features.__get_renyi_entropy(probability, order)
def CrossValidationScore(Xs, h, kernel='gaussian'):
    """Leave-one-out cross-validated log-likelihood for bandwidth h."""
    kde = KernelDensity(bandwidth=h, kernel=kernel)
    ret = 0.
    for i in range(len(Xs)):
        # Hold out sample i and fit on the rest (the original slice
        # Xs[i+1:-1] silently dropped the last sample).
        x = np.concatenate([Xs[0:i], Xs[i+1:]])
        kde.fit(x)
        ret += kde.score_samples(Xs[i].reshape(1, -1))[0]
    ret /= (1. * len(Xs))
    return ret
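A sketch of using `CrossValidationScore` above to pick a bandwidth; the `_demo_` wrapper, data, and candidate grid are invented, and `numpy as np` is assumed:

def _demo_bandwidth_selection():
    import numpy as np
    rng = np.random.RandomState(2)
    Xs = rng.normal(0, 1, size=(200, 1))
    candidates = np.logspace(-1, 0.5, 15)
    # Keep the bandwidth with the highest held-out log-likelihood.
    scores = [CrossValidationScore(Xs, h) for h in candidates]
    best_h = candidates[int(np.argmax(scores))]
    return best_h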
def test_sample_weight_invalid():
    # Check sample weighting raises errors (older error message).
    kde = KernelDensity()
    data = np.reshape([1., 2., 3.], (-1, 1))

    sample_weight = [0.1, -0.2, 0.3]
    expected_err = "sample_weight must have positive values"
    with pytest.raises(ValueError, match=expected_err):
        kde.fit(data, sample_weight=sample_weight)
class TwoClassKDE(object):
    """Class for Kernel Density Estimator on two labels. The likelihood
    ratio at a point is the ratio of the class-1 likelihood estimate to the
    class-0 likelihood estimate, times the class odds, where the class odds
    are calculated as the posterior mean estimate under a Beta(1, 1) prior,
    given the observations. If no points are observed for one of the
    classes, a default (improper) uniform prior is assumed for that class."""

    def __init__(self, **kwargs):
        """Takes same parameters as KernelDensity estimator."""
        self.kde0 = KernelDensity(**kwargs)
        self.kde1 = KernelDensity(**kwargs)

    def fit(self, X, y):
        """Fits KDE models on the data. X is an array of data points, y is
        an array of 0-1 labels."""
        y = np.asarray(y, dtype=int)
        self.n0, self.n1 = (y == 0).sum(), (y == 1).sum()
        assert (self.n0 + self.n1 == len(y)), "y must be a vector of 1's and 0's."
        X0, X1 = X[y == 0], X[y == 1]
        if (self.n0 > 0):
            self.kde0.fit(X0)
        if (self.n1 > 0):
            self.kde1.fit(X1)

    def fit_with_optimal_bandwidth(self, X, y, gridsize=101,
                                   dynamic_range=100, cv=10, verbose=0,
                                   n_jobs=1):
        """Determines the optimal bandwidth using the following strategy:
        for each subset (0 or 1) of the dataset,
        1) set b = 1.06 * sigma * n^(-1/5), Silverman's rule-of-thumb
           estimate for the optimal bandwidth, where sigma is the sample
           standard deviation of the samples after zero-centering the
           columns (note: ideally each column will have comparable
           variance),
        2) set up a grid (of size gridsize) of bandwidth values to try,
           ranging from b / alpha to b * alpha in geometric progression,
           where alpha = sqrt(dynamic_range),
        3) compute the average likelihood of the estimator on the data
           using cv-fold cross-validation,
        4) select the bandwidth with the highest likelihood."""
        y = np.asarray(y, dtype=int)
        self.n0, self.n1 = (y == 0).sum(), (y == 1).sum()
        assert (self.n0 + self.n1 == len(y)), "y must be a vector of 1's and 0's."
        X0, X1 = X[y == 0], X[y == 1]
        if (self.n0 > 0):
            log_b0 = (np.log(1.06) + np.log((X0 - X0.mean(axis=0)).std())
                      - 0.2 * np.log(self.n0))
            grid0 = GridSearchCV(
                self.kde0,
                {'bandwidth': np.exp(np.linspace(
                    log_b0 - 0.5 * np.log(dynamic_range),
                    log_b0 + 0.5 * np.log(dynamic_range), gridsize))},
                cv=cv, verbose=verbose, n_jobs=n_jobs)
            grid0.fit(X0)
            self.kde0 = grid0.best_estimator_
        if (self.n1 > 0):
            log_b1 = (np.log(1.06) + np.log((X1 - X1.mean(axis=0)).std())
                      - 0.2 * np.log(self.n1))
            grid1 = GridSearchCV(
                self.kde1,
                {'bandwidth': np.exp(np.linspace(
                    log_b1 - 0.5 * np.log(dynamic_range),
                    log_b1 + 0.5 * np.log(dynamic_range), gridsize))},
                cv=cv, verbose=verbose, n_jobs=n_jobs)
            grid1.fit(X1)
            self.kde1 = grid1.best_estimator_

    def get_params(self, **kwargs):
        return self.kde0.get_params(**kwargs)

    def set_params(self, **params):
        self.kde0.set_params(**params)
        self.kde1.set_params(**params)
        return self

    def score_samples(self, X):
        """Evaluate the density model on the data.
        Returns a vector of log-likelihood ratios of class 1 over class 0."""
        p1_est = (self.n1 + 1) / (self.n0 + self.n1 + 2)
        class_log_odds = np.log(p1_est) - np.log(1 - p1_est)
        scores0 = (self.kde0.score_samples(X) if (self.n0 > 0)
                   else np.zeros(len(X), dtype=float))
        scores1 = (self.kde1.score_samples(X) if (self.n1 > 0)
                   else np.zeros(len(X), dtype=float))
        return scores1 - scores0 + class_log_odds

    def score(self, X, y=None):
        """Compute the overall log-likelihood ratio under the model."""
        return self.score_samples(X).sum()

    def predict_proba(self, X):
        """Probability estimates."""
        scores = self.score_samples(X)
        p0s = 1 / (1 + np.exp(scores))
        return np.array([p0s, 1 - p0s]).transpose()

    def predict_log_proba(self, X):
        """Log of probability estimates."""
        return np.log(self.predict_proba(X))
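A brief usage sketch for `TwoClassKDE`; the `_demo_` wrapper and the toy two-class data are invented, with `numpy as np` assumed in scope:

def _demo_two_class_kde():
    import numpy as np
    rng = np.random.RandomState(3)
    X = np.vstack([rng.normal(0, 1, (100, 2)), rng.normal(3, 1, (100, 2))])
    y = np.array([0] * 100 + [1] * 100)
    clf = TwoClassKDE(bandwidth=0.5)
    clf.fit(X, y)
    # Positive log-ratios favor class 1, negative favor class 0.
    log_ratios = clf.score_samples(X)
    probs = clf.predict_proba(X)  # columns: P(class 0), P(class 1)
    return log_ratios, probs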
def kde_sklearn(self, x_grid, bandwidth=0.2, **kwargs):
    """Kernel Density Estimation with Scikit-learn"""
    from sklearn.neighbors import KernelDensity

    kde_skl = KernelDensity(bandwidth=bandwidth, **kwargs)
    kde_skl.fit(self.data[:, np.newaxis])
    # score_samples() returns the log-likelihood of the samples
    log_pdf = kde_skl.score_samples(x_grid[:, np.newaxis])
    return np.exp(log_pdf)
def gaussian_pdf(column):
    x = column.values
    x_d = np.linspace(min(x), max(x), 10000)

    # instantiate and fit the KDE model
    kde = KernelDensity(bandwidth=0.01, kernel='gaussian')
    kde.fit(x[:, None])

    # score_samples returns the log of the probability density
    return kde.score_samples(x_d[:, None]), x_d
def calculate_spike_rate_kernel_smoothing(spike_times, end):
    if len(spike_times) == 0:
        return np.zeros(1000)
    x_d = np.linspace(0, end, 1000)
    spike_times = np.array(spike_times)
    model = KernelDensity(bandwidth=0.1, kernel='gaussian')
    model.fit(spike_times[:, None])
    log_dens = model.score_samples(x_d[:, None])
    whole_bin_spike_rate = len(spike_times) / 2
    return np.exp(log_dens) * whole_bin_spike_rate
def kde_estimator(x, y, x_grid, y_grid, bandwidth=0.2, **kwargs):
    """Kernel Density Estimation with Scikit-learn"""
    data = np.concatenate((x[:, None], y[:, None]), axis=1)
    data_grid = np.concatenate((x_grid[:, None], y_grid[:, None]), axis=1)
    kde_skl = KernelDensity(bandwidth=bandwidth, **kwargs)
    kde_skl.fit(data)
    # score_samples() returns the log-likelihood of the samples
    log_pdf = kde_skl.score_samples(data_grid)
    return np.exp(log_pdf)
def get_kde(file_path):
    data = pd.read_excel(file_path)
    features = np.array(data[[3]])
    features = scale(features)
    kde_ns = KernelDensity(kernel='gaussian', bandwidth=0.15)
    kde_ns.fit(features[:200])
    kde_ds = KernelDensity(kernel='gaussian', bandwidth=0.15)
    kde_ds.fit(features[-200:])
    return kde_ns, kde_ds
def kde2D(x, y, xbins=200, ybins=10, **kwargs):
    """ Estimate a 2 dimensional pdf via kernel density estimation """
    xx, yy = np.mgrid[x.min():x.max():(xbins * 1j),
                      y.min():y.max():(ybins * 1j)]
    xy_sample = np.vstack([yy.ravel(), xx.ravel()]).T
    xy_train = np.vstack([y, x]).T

    kde = KernelDensity(**kwargs)
    kde.fit(xy_train)

    # score_samples() returns the log-density, so exponentiate to get the pdf
    z = np.exp(kde.score_samples(xy_sample))
    return xx[:, 0], yy[0, :], np.reshape(z, yy.shape)
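A usage sketch for the `kde2D` above; the `_demo_` wrapper and the 2-D data are invented, with `numpy as np` assumed:

def _demo_kde2d():
    import numpy as np
    rng = np.random.RandomState(4)
    x = rng.normal(0, 1, 1000)
    y = rng.normal(0, 2, 1000)
    xs, ys, zz = kde2D(x, y, xbins=100, ybins=100, bandwidth=0.5)
    # zz[i, j] approximates the joint density at (xs[i], ys[j]).
    assert zz.shape == (100, 100)
    return zz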
def kde3d(x, y, z, data_point):
    values = np.vstack([x, y, z]).T
    # Use grid search cross-validation to optimize the bandwidth
    # params = {'bandwidth': np.logspace(-1, 1, 20)}
    kde = KernelDensity(bandwidth=0.3)
    kde.fit(values)
    kde_coords = kde.sample(10000)
    log_pdf = kde.score_samples(kde_coords)
    # score() expects a 2-D array with the point in a single row
    point_score = kde.score(np.reshape(data_point, (1, -1)))
    percentile = np.sum(log_pdf < point_score) / 10000.
    return percentile
def GetDensity(self, action='generate', samples=100, draws=None):
    """
    TODO: Check density calculations for multiple dimensions.

    Generate a density estimation of the positions at each time or sample
    positions from the generated density at a specified time.

    Parameters
    ----------
    action : (string) - Options: 'generate', 'sample'.
        'generate' : Generate a density estimation using kernel density
            estimation and save it.
        'sample' : Generate a density estimation at the final time and both
            draw and return samples from it equal to the number of points
            in position.
    samples : (int) - The number of sample points in each dimension at
        which to measure the density. Total number of points is
        samples ** dimensions.
    draws : (int) - The number of points to draw from the density
        distribution. If None, draw a number of points equal to the number
        of points in Positions.

    Returns
    -------
    'generate'
        DensitySamples : (np.array) - An array of the positions of the
            points used to sample the density.
        Density : (np.array) - The value of the density evaluated at each
            point in DensitySamples.
    'sample'
        samples : (np.array) - An array of the samples drawn from the
            density generated from the positions at the final time.
    """
    if action == 'generate':
        # A list of sample arrays ranging from the min value to the max
        # value in each dimension.
        minmax = [
            np.linspace(np.amin(self.Positions[:, i, :]),
                        np.amax(self.Positions[:, i, :]), samples)
            for i in range(self.Positions.shape[1])
        ]
        self.DensitySamples = np.array(list(product(*minmax)))
        self.Density = np.zeros(
            (self.DensitySamples.shape[0], self.Times.shape[0]))
        for i in range(self.Positions.shape[2]):
            bandwidth = 0.2 * np.mean(pdist(self.Positions[:, :, i]))
            KDE = KernelDensity(bandwidth=bandwidth, kernel='gaussian',
                                metric='euclidean')
            KDE.fit(self.Positions[:, :, i])
            self.Density[:, i] = np.exp(
                KDE.score_samples(self.DensitySamples))
    elif action == 'sample':
        bandwidth = min(pdist(self.Positions[-1, :][:, np.newaxis]))
        KDE = KernelDensity(bandwidth=bandwidth, kernel='gaussian',
                            metric='euclidean')
        KDE.fit(self.Positions[-1, :][:, np.newaxis])
        if draws is None:
            draws = self.Positions.shape[1]
        return KDE.sample(draws)
class Density:
    def __init__(self, params: DensityParams) -> None:
        self.__params = self.__params_type_checked(params)
        self.__data = Manager().dict()
        controller_params = ControllerParams(self.__params, self.__data)
        self.__controller = Controller(controller_params)
        self.__density = frombuffer(
            self.__controller.gridmap.output.get_obj())
        self.__kde = KernelDensity(bandwidth=self.__params.kernel.bandwidth,
                                   kernel=self.__params.kernel.name,
                                   **KDE_PARAMETERS)

    @property
    def control(self) -> Controller:
        return self.__controller

    @property
    def on_grid(self) -> ndarray:
        return self.__density.reshape(self.__params.grid.shape)

    def at(self, point: PointAt) -> float:
        if self.__data:
            point = self.__point_type_and_range_checked(point)
            n_points = len(self.__data)
            self.__kde.fit(self.__data.values())
            density = exp(self.__kde.score_samples(point.position[None]))
            return float(n_points * density)
        return 0.0

    @staticmethod
    def __params_type_checked(value: DensityParams) -> DensityParams:
        if type(value) is not DensityParams:
            raise TypeError('Parameters must be of type <DensityParams>!')
        return value

    def __point_type_and_range_checked(self, value: PointAt) -> PointAt:
        if type(value) is not PointAt:
            raise TypeError('Data point must be of type <PointAt>!')
        if not self.__params.bounds.contain(value):
            raise ValueError('Data point lies outside bounding box!')
        return value
def data_entropy(X, n_grid=1000, kernel=False, bandwidth=0.2, **kwargs):
    """
    Computes unidimensional entropy from data points.

    @param X Input matrix [n_samples, n_features].
    @param n_grid Number of grid points. Integer.
    @param kernel Boolean to set kernel method on/off.
    @param bandwidth Kernel bandwidth. Scalar.

    @return Vector of [n_features], with the corresponding feature entropy.
    """
    # Testing X dimension
    if len(X.shape) > 1:
        # Computing per axis
        ent_h = np.apply_along_axis(data_entropy, 0, X, n_grid, kernel,
                                    bandwidth, **kwargs)
        ent_h = np.array(ent_h)
    else:
        # Finding sample range
        x_max = X.max()
        x_min = X.min()

        # Testing for kernel method
        if kernel:
            # Computing random sampling
            rnd_idx = np.random.choice(X.shape[0], size=n_grid,
                                       replace=False)

            # Kernel density estimation
            kde = KernelDensity(bandwidth=bandwidth, **kwargs)
            kde.fit(X[rnd_idx, np.newaxis])

            # Computing distro
            x_grid = np.linspace(x_min, x_max, n_grid)
            pdf = kde.score_samples(x_grid[:, np.newaxis])

            # Log-likelihood to likelihood
            pdf = np.exp(pdf)

        # Distribution estimation via histogram
        else:
            # Computing grid
            x_grid = np.arange(x_min, x_max, bandwidth)

            # Computing histogram
            pdf, _ = np.histogram(X, bins=x_grid, density=True)

        # Computing entropy
        ent_h = entropy(pdf)

    # Return entropy
    return ent_h
class WhitenedKDE(BaseEstimator, DensityMixin):
    def __init__(self, **kwargs):
        self.kde = KernelDensity(**kwargs)
        self.pre_whiten = PCA(whiten=True)

    def fit(self, X, y=None, sample_weight=None):
        self.kde.fit(self.pre_whiten.fit_transform(X))
        return self

    def score_samples(self, X):
        return self.kde.score_samples(self.pre_whiten.transform(X))
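Whitening lets the single scalar bandwidth act isotropically even when features are correlated or on different scales. A usage sketch with invented data; the `_demo_` wrapper is illustrative, and `numpy as np` plus the class's own imports (`BaseEstimator`, `DensityMixin`, `PCA`, `KernelDensity`) are assumed:

def _demo_whitened_kde():
    import numpy as np
    rng = np.random.RandomState(5)
    # Strongly anisotropic data: one direction is 10x wider than the other.
    X = rng.multivariate_normal([0, 0], [[100, 0], [0, 1]], size=500)
    kde = WhitenedKDE(bandwidth=0.5)
    kde.fit(X)
    log_dens = kde.score_samples(X[:10])
    return log_dens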
def _build():
    data = make_example_date()
    data = pd.concat(
        [data[data.blobID != 2], data[data.blobID == 2].sample(frac=0.25)])
    d = data['feature0'].values
    density = KernelDensity(bandwidth=0.5, kernel='gaussian')
    density.fit(d[:, None])
    x_d = np.linspace(min(d), max(d), 1000)
    prob = np.exp(density.score_samples(x_d[:, None]))
    peaks = find_peaks(prob)[0]
    return prob, peaks, x_d
def calc_kerneldensity(df, aux_grid):
    hist_aux = []
    for i in range(0, df.shape[1]):
        kde_skl = KernelDensity(bandwidth=0.4)
        # aux = np.array(df_n['1000.0'])
        aux = np.copy(df[:, i])
        kde_skl.fit(aux[:, np.newaxis])
        log_pdf = kde_skl.score_samples(aux_grid[:, np.newaxis])
        hist_aux.append(np.exp(log_pdf) * 100)
    return hist_aux
def getKDF(data, ax=None, **kwargs):
    """This function ingests a 1d array or list and returns a plot object
    of a kernel density function"""
    data = data.values[:, None]
    ax = ax or plt.gca()
    xValues = np.linspace(data.min(), data.max(), 1000)[:, None]
    kdensity = KernelDensity(**kwargs)
    kdensity.fit(data)
    logscore = kdensity.score_samples(xValues)
    return ax.plot(xValues, np.exp(logscore), 'r--', linewidth=3)
def perform_kde(self, data):
    """Perform kernel density estimation (KDE)

    Parameters
    ----------
    data : np.array
        sample data (1D)

    Returns
    -------
    np.array, np.array, float
        x values, y values, bandwidth

    .. _scikit-learn tutorial 1:
        https://scikit-learn.org/stable/auto_examples/neighbors/plot_kde_1d.html
    .. _scikit-learn tutorial 2:
        https://scikit-learn.org/stable/auto_examples/neighbors/plot_digits_kde_sampling.html
    .. _A useful article:
        https://jakevdp.github.io/blog/2013/12/01/kernel-density-estimation/
    """
    bw = self.bandwidth
    scaler = StandardScaler()
    data_ = scaler.fit_transform(data[:, np.newaxis])
    estimator = KernelDensity(kernel='gaussian')

    # grid search optimization
    if bw == 'grid':
        kde = self.grid_kde(data_, estimator, self.bw_steps)
        bw = kde.bandwidth
        print('  Grid search-optimized bandwidth: {:g}.'.format(bw))
    # Silverman's rule-of-thumb
    elif bw == 'silverman':
        bw = self.silverman_bw(data)
        print('  Bandwidth by Silverman\'s rule-of-thumb: {:g}.'.format(bw))
        setattr(estimator, 'bandwidth', bw)
        kde = estimator.fit(data_)
    # fixed bandwidth value
    elif isinstance(bw, float) and 0.1 <= bw <= 1.0:
        setattr(estimator, 'bandwidth', bw)
        kde = estimator.fit(data_)
    else:
        raise ValueError('Invalid bandwidth: {}.'.format(bw))

    # get density function
    x, y = self.density_func(data_, kde)
    x = scaler.inverse_transform(x)
    y = scaler.inverse_transform(y)
    return x, y, bw
class DensityBasedOneClassClassifier:
    def __init__(self, threshold=0.95, kernel="gaussian", bandwidth=1.0,
                 metric="euclidean", should_downsample=False,
                 downsample_count=1000):
        self.should_downsample = should_downsample
        self.downsample_count = downsample_count
        self.threshold = threshold
        self.scaler = preprocessing.StandardScaler()
        if kernel == "really_linear":
            self.dens = NegativeMeanDistance(metric=metric)
        else:
            self.dens = KernelDensity(bandwidth=bandwidth, kernel=kernel,
                                      metric=metric)

    def fit(self, X):
        # scale
        self.scaler.fit(X)
        self.X = self.scaler.transform(X)

        # downsample?
        if self.should_downsample:
            self.X = self.downsample(self.X, self.downsample_count)

        # fit
        self.dens.fit(self.X)

        # transform relative threshold (eg 95%) to absolute
        dens = self.get_density(self.X, scale=False)  # no need to scale again
        self.abs_threshold = np.percentile(dens, 100 * (1 - self.threshold))

    def get_density(self, X, scale=True):
        if scale:
            X = self.scaler.transform(X)
        # log-density (for KDE), negative distance (for NegativeMeanDistance)
        return self.dens.score_samples(X)

    def predict(self, X):
        dens = self.get_density(X)
        # in both KDE and NMD, lower values are more anomalous
        return dens < self.abs_threshold

    def downsample(self, X, n):
        # We've already fit()ted, but we're worried that our X is so large
        # our classifier will be too slow in practice. We can downsample by
        # running a KDE on X and sampling from it (this will be slow, but
        # happens only once), and then using those points as the new X.
        if len(X) < n:
            return X
        kde = KernelDensity()
        kde.fit(X)
        return kde.sample(n)
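A usage sketch for `DensityBasedOneClassClassifier`; the `_demo_` wrapper and the inlier/outlier data are invented, and `numpy as np` plus the class's imports are assumed:

def _demo_one_class():
    import numpy as np
    rng = np.random.RandomState(6)
    X_train = rng.normal(0, 1, (500, 2))            # inliers only
    X_test = np.vstack([rng.normal(0, 1, (10, 2)),  # inliers
                        rng.normal(8, 1, (10, 2))])  # far-away outliers
    clf = DensityBasedOneClassClassifier(threshold=0.95, bandwidth=0.5)
    clf.fit(X_train)
    # True where the density falls below the fitted 95% threshold.
    is_anomalous = clf.predict(X_test)
    return is_anomalous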
def testAccuracy(X_train, y_train, X_test, y_test, output_result):
    for kernel in ['gaussian', 'tophat', 'epanechnikov', 'exponential',
                   'linear', 'cosine']:
        time_start = time.time()
        model = KernelDensity(kernel=kernel)
        # KernelDensity is unsupervised; fit() accepts y for API
        # consistency but ignores it.
        model.fit(X_train, y_train)
        time_training = time.time() - time_start
        output_result(model, X_train, y_train, X_test, y_test, time_training)
def kde2D(x, y, bandwidth, xbins=256j, ybins=256j, **kwargs):
    xx, yy = np.mgrid[x.min():x.max():xbins,
                      y.min():y.max():ybins]
    xy_sample = np.vstack([yy.ravel(), xx.ravel()]).T
    xy_train = np.vstack([y, x]).T

    kde_skl = KernelDensity(bandwidth=bandwidth, **kwargs)
    kde_skl.fit(xy_train)
    z = np.exp(kde_skl.score_samples(xy_sample))
    return xx, yy, np.reshape(z, xx.shape)
class KDEDist(object):
    def __init__(self, bw, kernel='gaussian'):
        self._bw = bw
        self._kernel = kernel
        self._kd = KernelDensity(bandwidth=bw, kernel=kernel)
        self._samples = None

    @staticmethod
    def bw_range(x, n=3):
        max_pwr = 2
        h_opt = np.std(x) * (4. / (3. * len(x)))**0.2
        pwrs = np.concatenate([
            np.linspace(-max_pwr, 0, n + 1),
            np.linspace(0, max_pwr, n + 1)[1:]
        ])
        return h_opt * 2**pwrs

    @property
    def name(self):
        return 'KDE({}, {:.5f})'.format(self._kernel, self._bw)

    @property
    def samples(self):
        if self._samples is None:
            self._samples = self.rvs(100000)
        return self._samples

    def dist(self):
        return self

    def fit(self, x):
        self._kd.fit(np.reshape(x, (len(x), 1)))
        return self

    def logpdf(self, x):
        return self._kd.score_samples(np.reshape(x, (len(x), 1)))

    def rvs(self, n):
        return self._kd.sample(n).reshape(n)

    def stats(self, moments='mv'):
        out = []
        if 'm' in moments:
            out.append(np.array([np.mean(self.samples)]))
        if 'v' in moments:
            out.append(np.array([np.var(self.samples)]))
        if 's' in moments:
            out.append(np.array([skew(self.samples)]))
        if 'k' in moments:
            out.append(np.array([kurtosis(self.samples)]))
        return tuple(out)

    def ppf(self, q):
        return np.percentile(self.samples, q)
def get_mode(vals):
    # Silverman's rule-of-thumb bandwidth
    h = 1.06 * np.std(vals) * len(vals)**(-1.0 / 5.0)
    kdf = KernelDensity(bandwidth=h)
    kdf.fit(np.array(vals).reshape(len(vals), 1))

    def neg_kdf(x):
        return -kdf.score(np.array((x, )))

    res = minimize(neg_kdf, x0=np.median(vals), method='Nelder-Mead')
    assert res.success, res
    return float(res.x)
def alternate_umi(x, y, k=5, density_estimation_method="kde",
                  k_density=5, bw=.2):
    assert len(x) == len(y), "Lists should have same length"
    assert k <= len(x) - 1, "Set k smaller than num. samples - 1"
    N = len(x)
    dx = len(x[0])
    dy = len(y[0])
    data = np.concatenate((x, y), axis=1)

    tree_xy = ss.cKDTree(data)
    tree_x = ss.cKDTree(x)
    tree_y = ss.cKDTree(y)

    if density_estimation_method.lower() == "kde":
        kernel = KernelDensity(bandwidth=bw)
        kernel.fit(x)
        kde = np.exp(kernel.score_samples(x))
        weight = (1 / kde) / np.mean(1 / kde)
    elif density_estimation_method.lower() == "knn":
        knn_dis = [
            tree_x.query(point, k_density + 1, p=np.inf)[0][k_density]
            for point in x
        ]
        density_estimate = np.array([
            float(k_density) / N / knn_dis[i]**dx
            for i in range(len(knn_dis))
        ])
        weight = (1 / density_estimate) / np.mean(1 / density_estimate)
    else:
        raise ValueError("The density estimation method is not recognized")

    knn_dis = [tree_xy.query(point, k + 1, p=2)[0][k] for point in data]
    ans = log(k) + 2 * log(N - 1) - digamma(N) + vd(dx) + vd(dy) - vd(dx + dy)

    # weight_y = np.zeros(N)
    # for i in range(N):
    #     weight_y[i] = np.sum(weight[j] for j in
    #         tree_y.query_ball_point(y[i], knn_dis[i], p=2)) - weight[i]
    # weight_y *= N/np.sum(weight_y)

    for i in range(N):
        nx = len(tree_x.query_ball_point(x[i], knn_dis[i], p=2)) - 1
        ny = len(tree_y.query_ball_point(y[i], knn_dis[i], p=2)) - 1
        ans += -weight[i] * log(nx) / N
        ans += -weight[i] * log(ny) / N
        # for j in tree_y.query_ball_point(y[i], knn_dis[i], p=2):
        #     ans += -weight[j] * log(weight[j]) / N / ny
        # ans += -weight[i] * log(weight[i]) / N
    return ans
def kdensity(x):
    if len(x.shape) != 1:
        raise ValueError("x must be a vector. found " + str(x.shape) +
                         " dimensions")
    # Silverman's rule-of-thumb bandwidth; fall back to 1 for constant data
    stdx = np.std(x)
    bw = 1.06 * stdx * len(x)**-.2 if stdx != 0 else 1.
    kd = KernelDensity(bandwidth=bw)
    kd.fit(x.reshape(-1, 1))
    func = lambda z: np.exp(kd.score_samples(np.array(z).reshape(-1, 1)))
    return func
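A usage sketch for `kdensity`; the `_demo_` wrapper and data are invented, with `numpy as np` assumed:

def _demo_kdensity():
    import numpy as np
    rng = np.random.RandomState(7)
    x = rng.exponential(2.0, size=1000)
    pdf = kdensity(x)
    # The returned callable evaluates the estimated pdf at new points.
    return pdf([0.5, 1.0, 2.0])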
class colorKDE(object):
    def __init__(self, data=np.array([])):
        self.data = data

    def runKDE(self, bandwidth=0.2, use_opt=False):
        '''
        Generate the KDE and run with given bandwidth
        If use_opt is specified, runCVSearch must have been run already
        '''
        if use_opt:
            self.kde = KernelDensity(bandwidth=self.optimal_bandwidth)
        else:
            self.kde = KernelDensity(bandwidth=bandwidth)
        self.kde.fit(self.data)

    def runCVSearch(self, search_range=np.linspace(0.01, 1.0, 50), folds=20):
        self.grid = GridSearchCV(KernelDensity(),
                                 {'bandwidth': search_range}, cv=folds)
        self.grid.fit(self.data)
        self.optimal_bandwidth = self.grid.best_params_['bandwidth']
        print('Optimal bandwidth: ' + str(self.optimal_bandwidth))

    def score_samples(self, x):
        '''
        Replicate score_samples functionality so both saves can be
        treated the same
        '''
        return self.kde.score_samples(x)

    def sample(self, n_samples):
        '''
        Replicate sample functionality so both saves can be treated
        the same
        '''
        return self.kde.sample(n_samples=n_samples)

    def save(self, filename, full=True):
        '''
        Save current state of the object
        If full is False, only save self.kde
        '''
        if full:
            # save the entire object, including data
            pickle.dump(self, open(filename, 'wb'), protocol=-1)
        else:
            # only save the .kde object
            pickle.dump(self.kde, open(filename, 'wb'), protocol=-1)
def __call__(self, **kwargs):
    """ Runs block of analysis """
    from sklearn.neighbors import KernelDensity

    kde = KernelDensity(bandwidth=self.bandwidth, **kwargs)
    kde.fit(self.coord.flatten()[:, np.newaxis])
    log_pdf = kde.score_samples(self.grid[:, np.newaxis])
    pdf = np.exp(log_pdf)
    self.datasets[self.outputs[0]]["kde"] = pdf
def kde_dist(ax, x, bw=None, color='k'):
    x_grid = np.linspace(np.min(x), np.max(x), 1000)
    if bw is None:
        # rule-of-thumb bandwidth ~ sigma * n^(-1/5)
        bw = np.std(x) * float(len(x))**(-1 / 5.)
    kde_skl = KernelDensity(bandwidth=bw)
    kde_skl.fit(x[:, np.newaxis])
    # score_samples() returns the log-likelihood of the samples
    log_pdf = kde_skl.score_samples(x_grid[:, np.newaxis])
    est = np.exp(log_pdf)
    ax.plot(x_grid, est, color=color, lw=0.5)
    return est
def kde_sklearn(x, x_grid, bandwidth=0.2, **kwargs):
    """Kernel Density Estimation with Scikit-learn"""
    # kde from https://jakevdp.github.io/blog/2013/12/01/kernel-density-estimation/
    kde_skl = KernelDensity(bandwidth=bandwidth, **kwargs)
    kde_skl.fit(x[:, np.newaxis])
    # score_samples() returns the log-likelihood of the samples
    log_pdf = kde_skl.score_samples(x_grid[:, np.newaxis])
    return np.exp(log_pdf)
def get_n_m_kde(magnitude, bin_centre, area, bandwidth=0.2):
    """Compute n(m)
    Density of sources per unit of area in a non-cumulative fashion,
    using a KDE.
    For this function we need the centre of the bins instead of the edges.
    **Note that the output is non-cumulative**
    """
    kde_skl = KernelDensity(bandwidth=bandwidth)
    kde_skl.fit(magnitude[:, np.newaxis])
    pdf = np.exp(kde_skl.score_samples(bin_centre[:, np.newaxis]))
    return pdf / area * len(magnitude) / np.sum(pdf)
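A sketch of calling `get_n_m_kde`; the `_demo_` wrapper, magnitudes, and survey area are invented for illustration, with `numpy as np` assumed:

def _demo_n_m():
    import numpy as np
    rng = np.random.RandomState(8)
    magnitude = rng.normal(20, 1.5, size=2000)  # invented source magnitudes
    bin_centre = np.linspace(15, 25, 50)
    area = 10.0                                 # survey area, arbitrary units
    n_m = get_n_m_kde(magnitude, bin_centre, area)
    # By construction, n_m sums to len(magnitude) / area over the bins.
    return n_m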
def fn(inst):
    if 'x' not in inst:
        raise Exception('no x')
    x = inst['x']
    kde = KernelDensity(*args, **margs)
    kde.fit(x)
    log_pdf = kde.score_samples(x)
    pdf = np.exp(log_pdf)
    return inst.set('model', kde).set('pdf', pdf)
def train_kde(xy, label):
    params = {"bandwidth": np.logspace(-5, 5, 20),
              "kernel": ["gaussian", "exponential"]}
    # do a grid search
    try:
        grid = GridSearchCV(
            KernelDensity(metric="haversine", algorithm="ball_tree"), params)
        grid.fit(xy)
        return grid.best_estimator_
    except ValueError:
        k = KernelDensity(
            metric="haversine",
            algorithm="ball_tree",
            bandwidth=best_global_bandwidths[label],
            kernel="exponential"
        )
        k.fit(xy)
        return k
def test_kde_algorithm_metric_choice(algorithm, metric):
    # Smoke test for various metrics and algorithms
    rng = np.random.RandomState(0)
    X = rng.randn(10, 2)  # 2 features required for haversine dist.
    Y = rng.randn(10, 2)

    if algorithm == 'kd_tree' and metric not in KDTree.valid_metrics:
        assert_raises(ValueError, KernelDensity,
                      algorithm=algorithm, metric=metric)
    else:
        kde = KernelDensity(algorithm=algorithm, metric=metric)
        kde.fit(X)
        y_dens = kde.score_samples(Y)
        assert_equal(y_dens.shape, Y.shape[:1])
def mode(x):
    x = np.array(x)

    # fit kde
    kde_skl = KernelDensity()
    kde_skl.fit(x[:, np.newaxis])

    # find max on log grid
    log_min = np.log(min(x)) / np.log(10)
    log_max = np.log(max(x)) / np.log(10)
    x_grid = np.logspace(log_min, log_max, 100000)
    log_pdf = kde_skl.score_samples(x_grid[:, np.newaxis])
    return x_grid[log_pdf.argmax()]
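A sketch of calling `mode` above on invented positive-valued data (the log grid requires strictly positive samples); the `_demo_` wrapper is illustrative, with `numpy as np` assumed:

def _demo_mode():
    import numpy as np
    rng = np.random.RandomState(9)
    x = rng.lognormal(mean=1.0, sigma=0.3, size=2000)
    m = mode(x)
    # For a lognormal, the true mode is exp(mu - sigma^2) ~ 2.48 here.
    return m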
def get_P_binary_v_tot(proj_sep, delta_v_tot, num_sys=100000):
    """ This function calculates the probability of a random star having
    the observed proper motion

    Parameters
    ----------
    proj_sep : float
        Projected separation between two stars
    delta_v_tot : float
        Total velocity difference between two stars

    Returns
    -------
    P(proj_sep, delta_v_tot) : float
        Probability that the angular separation and pm+RV difference are
        due to a genuine binary
    """

    # Catalog check
    global binary_set
    if binary_set is None:
        generate_binary_set(num_sys=num_sys)

    # Use a Gaussian KDE
    global binary_v_tot_kde
    # We work in log space for the set of binaries
    if binary_v_tot_kde is None:
        kwargs = {'kernel': 'tophat'}
        binary_v_tot_kde = KernelDensity(bandwidth=0.1, **kwargs)
        binary_v_tot_kde.fit(np.array([np.log10(binary_set['proj_sep']),
                                       np.log10(binary_set['delta_v_tot'])]).T)

    if isinstance(delta_v_tot, np.ndarray) and isinstance(proj_sep, np.ndarray):
        values = np.array([np.log10(proj_sep), np.log10(delta_v_tot)]).T
        prob_binary = np.exp(binary_v_tot_kde.score_samples(values))
    elif isinstance(delta_v_tot, np.ndarray):
        values = np.array([np.log10(proj_sep) * np.ones(len(delta_v_tot)),
                           np.log10(delta_v_tot)]).T
        prob_binary = np.exp(binary_v_tot_kde.score_samples(values))
    else:
        # score_samples() expects a 2-D array: one row for the single point
        prob_binary = np.exp(binary_v_tot_kde.score_samples(
            [[np.log10(proj_sep), np.log10(delta_v_tot)]]))

    # Convert back from log10-space to linear-space;
    # the log(10) terms convert from log10 to ln
    prob_binary = (prob_binary / (proj_sep * np.log(10.))
                   / (delta_v_tot * np.log(10.)))

    return prob_binary
def kde_sklearn(x, x_grid, bandwidth=0.2, **kwargs):
    """Kernel Density Estimation with Scikit-learn"""
    kde_skl = KernelDensity(bandwidth=bandwidth, **kwargs)
    x = ul.fnp(x)
    x_grid = ul.fnp(x_grid)
    print(x.shape)
    # if (x.shape[1] == 1):
    #     x = x[:, np.newaxis]
    #     x_grid = x_grid[:, np.newaxis]
    kde_skl.fit(x)
    # score_samples() returns the log-likelihood of the samples
    log_pdf = kde_skl.score_samples(x_grid)
    return np.exp(log_pdf)
def kde(data, rng, grid_size=10, **kwargs):
    """Kernel Density Estimation with Scikit-learn"""
    from numpy import linspace, meshgrid, vstack, ravel, exp

    n_samples = data.shape[0]
    n_dims = data.shape[1]
    # Silverman-style rule-of-thumb bandwidth
    bandwidth = (n_samples * (n_dims + 2) / 4.)**(-1. / (n_dims + 4.))
    kde_skl = KernelDensity(bandwidth=bandwidth, **kwargs)
    kde_skl.fit(data)
    space = [linspace(i[0], i[1], grid_size) for i in rng]
    grid = meshgrid(*tuple(space))
    # score_samples() returns the log-likelihood of the samples;
    # materialize the map() so vstack gets a sequence (Python 3)
    log_pdf = kde_skl.score_samples(vstack(list(map(ravel, grid))).T)
    return exp(log_pdf), space
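A final usage sketch for the grid-evaluating `kde` above; the `_demo_` wrapper, 2-D data, and ranges are invented, with `numpy as np` assumed:

def _demo_grid_kde():
    import numpy as np
    rng_state = np.random.RandomState(10)
    data = rng_state.normal(0, 1, (400, 2))
    ranges = [(-3, 3), (-3, 3)]  # one (min, max) pair per dimension
    pdf, space = kde(data, ranges, grid_size=25)
    # pdf is flattened over the 25x25 grid spanned by `space`.
    assert pdf.shape == (25 * 25,)
    return pdf, space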