def figure_6_14():
    """Reproduce figure 6.14 of ESLii.

    Left panel: Gaussian kernel density estimates (bandwidth 7.5) of the
    ``sbp`` column for the ``chd == 1`` and ``chd == 0`` groups of the data
    returned by ``eslii.read_sa_heart_data()``.
    Right panel: the posterior estimate of CHD given sbp, obtained from the
    two densities weighted by the empirical class proportion.
    """
    sa = eslii.read_sa_heart_data()
    sbp = sa["sbp"]

    # Work on sorted NumPy columns: ``Series.sort`` and ``Series.reshape``
    # no longer exist in pandas, and sorted abscissae give clean line plots.
    sbp_chd = np.sort(sa.loc[sa["chd"] == 1, "sbp"].values).reshape(-1, 1)
    sbp_no_chd = np.sort(sa.loc[sa["chd"] == 0, "sbp"].values).reshape(-1, 1)

    kde_chd = KernelDensity(kernel='gaussian', bandwidth=7.5).fit(sbp_chd)
    kde_no_chd = KernelDensity(kernel='gaussian',
                               bandwidth=7.5).fit(sbp_no_chd)

    # Left panel: each class density evaluated at its own observations.
    plt.subplot(121)
    plt.plot(sbp_chd[:, 0], np.exp(kde_chd.score_samples(sbp_chd)),
             label="CHD")
    plt.plot(sbp_no_chd[:, 0], np.exp(kde_no_chd.score_samples(sbp_no_chd)),
             label="no CHD")
    plt.legend(loc='best')

    # Right panel: posterior on an even grid spanning all observed sbp values.
    sbp_range = np.linspace(sbp.min(), sbp.max(), 100).reshape(-1, 1)
    chd_dens = np.exp(kde_chd.score_samples(sbp_range))
    no_chd_dens = np.exp(kde_no_chd.score_samples(sbp_range))
    p_chd = float(len(sbp_chd)) / (len(sbp_chd) + len(sbp_no_chd))
    weighted_chd = p_chd * chd_dens
    posterior_est = weighted_chd / (weighted_chd + (1 - p_chd) * no_chd_dens)

    plt.subplot(122)
    plt.plot(sbp_range[:, 0], posterior_est)
    plt.show()
def cistrans(args):
    """Plot and compare the score densities of cis and trans interactions.

    Reads ``args.cob`` (passed to ``co.COB``), ``args.cis_distance`` and
    ``args.out``. When ``args.out`` is None it is set to
    ``'<cob.name>_cistrans'``. The figure is written to ``args.out + '.png'``
    and the Mann-Whitney U p-value of cis vs. trans scores is printed.
    """
    cob = co.COB(args.cob)
    if args.out is None:
        args.out = '{}_cistrans'.format(cob.name)
    # np.newaxis adds an empty axis in that position of the slice
    # the sklearn module requires the values to be in the rows:
    # http://scikit-learn.org/stable/auto_examples/neighbors/plot_kde_1d.html
    # cis: pairs no further apart than args.cis_distance
    cis = cob.coex \
        .score[cob.coex.distance <= args.cis_distance]\
        .values[:, np.newaxis]
    # trans: pairs whose distance is infinite
    trans = cob.coex\
        .score[np.isinf(cob.coex.distance)]\
        .values[:, np.newaxis]
    X_plot = np.linspace(-10, 10, 1000)[:, np.newaxis]
    print(
        'Found {:,} cis interactions and {:,} trans interactions'.format(
            cis.shape[0],
            trans.shape[0]
        ))
    # Fit the kernel on the cis scores
    kd = KernelDensity(bandwidth=0.2)
    kd.fit(cis)
    cis_kde = np.exp(kd.score_samples(X_plot))
    plt.fill(X_plot, cis_kde, alpha=0.5, label='Cis Interactions')
    # Fit the trans scores; only the first 50000 rows are used for the fit
    kd.fit(trans[0:50000])
    trans_kde = np.exp(kd.score_samples(X_plot))
    plt.fill(X_plot, trans_kde, alpha=0.5, label='Trans Interactions')
    plt.legend()
    plt.title('Cis vs Trans Density: {}'.format(cob.name))
    # Calculate the mann whitney U test (on all cis and all trans scores)
    u, pval = sp.stats.mannwhitneyu(cis[:, 0], trans[:, 0])
    print('P-val: {}'.format(pval))
    plt.savefig(args.out + '.png')
def plot_samples(self, folder = '', title = ''):
    """Plot a histogram and KDE of the stored samples of every parameter.

    For each name in ``self.store_list`` (except 'D') one figure is saved to
    ``folder + p + "_" + title``; the MAP value returned by ``self.get_MAP``
    is stored in ``self.params['MAP'][p]`` and marked on the plot. When
    ``self.bkt`` is falsy, the same is done per problem for the 'D' samples,
    saved under ``folder + "Difficulty/problem<j>_" + title``.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source -- confirm against the original file.
    NOTE(review): ``normed`` was removed from ``plt.hist`` in newer
    Matplotlib releases (``density`` is the replacement) -- confirm the
    version this runs on.
    """
    print("Saving plots...")
    # Bin count grows with the log of the sample count of the first parameter.
    numbins = int(8 * math.log(len(self.samples[self.store_list[0]])))
    for p in self.store_list:
        if p == 'D':
            # 'D' is handled per problem further down.
            continue
        # Column vector (n_samples, 1); presumably KD is sklearn's
        # KernelDensity, which expects 2-D input -- TODO confirm.
        samples = np.array([self.samples[p]]).T
        a = np.min(samples)
        b = np.max(samples)
        # Bandwidth is 10% of the sample range; the offset keeps it non-zero.
        band = 0.1 * (b-a + 0.001)
        kde = KD(kernel='gaussian', bandwidth=band).fit(samples)
        n, bins, patches = plt.hist(self.samples[p], numbins, normed=1)
        # Evaluate the fitted density at the histogram bin edges.
        log_dens = kde.score_samples(np.array([bins]).T)
        plt.plot(bins, np.exp(log_dens), 'r-')
        MAP = self.get_MAP(kde, a, b)
        self.params['MAP'][p] = MAP
        # NOTE(review): ``[MAP]`` is 1-D if MAP is a scalar; recent
        # scikit-learn releases require 2-D input -- confirm what get_MAP
        # returns.
        plt.plot([MAP], np.exp(kde.score_samples([MAP])), 'go')
        plt.title(title + " MAP estimate: " + str(MAP))
        plt.ylabel("Posterior(" + p + ")")
        plt.xlabel(p)
        # Keep the automatic y-limits, force the x-limits.
        x1,x2,y1,y2 = plt.axis()
        plt.axis((-3,3,y1,y2))
        if p == 'L' or p == 'T':
            plt.axis((0,1,y1,y2))
        plt.savefig(folder + p + "_" + title)
        plt.clf()
    # The 'D' MAP entry starts out as the current 'D' parameter object.
    self.params['MAP']['D'] = self.params['D']
    if not self.bkt:
        print("Working on difficulty params...")
        p = 'D'
        data = np.array(self.samples[p])
        for j in range(self.data['num_problems']):
            # Column j of the 'D' samples, as a column vector.
            samples = np.array([data[:,j]]).T
            a = np.min(samples)
            b = np.max(samples)
            band = 0.1 * (b-a + 0.001)
            kde = KD(kernel='gaussian', bandwidth=band).fit(samples)
            n, bins, patches = plt.hist(samples, numbins, normed=1)
            log_dens = kde.score_samples(np.array([bins]).T)
            plt.plot(bins, np.exp(log_dens), 'r-')
            MAP = self.get_MAP(kde, a, b)
            # Writes through to self.params['D'][j] as well, since both names
            # were bound to the same object above.
            self.params['MAP']['D'][j] = MAP
            plt.plot([MAP], np.exp(kde.score_samples([MAP])), 'go')
            plt.title(title + " MAP estimate: " + str(MAP))
            plt.ylabel("Posterior(" + p + ")")
            plt.xlabel("Problem " + str(j))
            x1,x2,y1,y2 = plt.axis()
            plt.axis((-3,3,y1,y2))
            plt.savefig(folder + "Difficulty/problem" + str(j) + "_" + title)
            plt.clf()
    print("Plots saved!")
class TwoClassKDE(object):
    """Class for Kernel Density Estimator on two labels.

    Likelihood ratio at a point is ratio of class-1 likelihood estimate to
    class-0 likelihood estimate, times the class odds, where this is
    calculated as the posterior mean estimate under Beta(1, 1) prior, given
    the observations. If no points are observed for one of the classes, a
    default (improper) uniform prior is assumed for that class.
    """

    def __init__(self, **kwargs):
        """Takes same parameters as KernelDensity estimator."""
        self.kde0 = KernelDensity(**kwargs)
        self.kde1 = KernelDensity(**kwargs)

    def _split_by_label(self, X, y):
        """Record the class counts in n0/n1 and return (X0, X1)."""
        y = np.asarray(y, dtype = int)
        self.n0, self.n1 = (y == 0).sum(), (y == 1).sum()
        assert (self.n0 + self.n1 == len(y)), "y must be vector of 1's and 0's."
        return X[y == 0], X[y == 1]

    @staticmethod
    def _search_bandwidth(kde, X, n, gridsize, dynamic_range, cv, verbose,
                          n_jobs):
        """Return the best estimator of a CV grid search around Silverman's b."""
        log_b = (np.log(1.06) + np.log((X - X.mean(axis = 0)).std())
                 - 0.2 * np.log(n))
        half_span = 0.5 * np.log(dynamic_range)
        # Geometric grid from b / alpha to b * alpha, alpha = sqrt(dynamic_range).
        bandwidths = np.exp(np.linspace(log_b - half_span, log_b + half_span,
                                        gridsize))
        grid = GridSearchCV(kde, {'bandwidth' : bandwidths}, cv = cv,
                            verbose = verbose, n_jobs = n_jobs)
        grid.fit(X)
        return grid.best_estimator_

    def fit(self, X, y):
        """Fits KDE models on the data.

        X is array of data points, y is array of 0-1 labels. A class with no
        observations is left unfitted. Returns self.
        """
        X0, X1 = self._split_by_label(X, y)
        if self.n0 > 0:
            self.kde0.fit(X0)
        if self.n1 > 0:
            self.kde1.fit(X1)
        return self

    def fit_with_optimal_bandwidth(self, X, y, gridsize = 101,
                                   dynamic_range = 100, cv = 10, verbose = 0,
                                   n_jobs = 1):
        """Determines optimal bandwidth using the following strategy:

        For each subset (0 or 1) of the dataset,
        1) set b = 1.06 * sigma * n^(-1/5), the Silverman's rule of thumb
           estimate for the optimal bandwidth. sigma is the sample standard
           deviation of the samples after zero-centering the columns (note:
           ideally each column will have comparable variance),
        2) set up a grid (of size gridsize) of bandwidth values to try,
           ranging from b / alpha to b * alpha in geometric progression,
           where alpha = sqrt(dynamic_range),
        3) compute average likelihood of the estimator on the data using
           cv-fold cross-validation,
        4) select the bandwidth with the highest likelihood.

        Returns self.
        """
        X0, X1 = self._split_by_label(X, y)
        if self.n0 > 0:
            self.kde0 = self._search_bandwidth(
                self.kde0, X0, self.n0, gridsize, dynamic_range, cv, verbose,
                n_jobs)
        if self.n1 > 0:
            self.kde1 = self._search_bandwidth(
                self.kde1, X1, self.n1, gridsize, dynamic_range, cv, verbose,
                n_jobs)
        return self

    def get_params(self, **kwargs):
        """Return the parameters of the class-0 estimator."""
        return self.kde0.get_params(**kwargs)

    def set_params(self, **params):
        """Set the same parameters on both class estimators; returns self."""
        self.kde0.set_params(**params)
        self.kde1.set_params(**params)
        return self

    def score_samples(self, X):
        """Evaluate the density model on the data.

        Returns vector of log-likelihood ratios of class 1 over class 0
        (including the class log-odds). A class without observations
        contributes a log-density of zero.
        """
        # Float literals keep this a true division under Python 2 as well.
        p1_est = (self.n1 + 1.0) / (self.n0 + self.n1 + 2.0)
        class_log_odds = np.log(p1_est) - np.log(1 - p1_est)
        scores0 = self.kde0.score_samples(X) if (self.n0 > 0) else np.zeros(len(X), dtype = float)
        scores1 = self.kde1.score_samples(X) if (self.n1 > 0) else np.zeros(len(X), dtype = float)
        return scores1 - scores0 + class_log_odds

    def score(self, X, y = None):
        """Compute the overall log-likelihood ratio under the model."""
        return self.score_samples(X).sum()

    def predict_proba(self, X):
        """Probability estimates; columns are (class 0, class 1)."""
        scores = self.score_samples(X)
        # 1 / (1 + exp(s)) written via logaddexp so large s cannot overflow.
        p0s = np.exp(-np.logaddexp(0, scores))
        return np.array([p0s, 1 - p0s]).transpose()

    def predict_log_proba(self, X):
        """Log of probability estimates; columns are (class 0, class 1)."""
        scores = self.score_samples(X)
        # log p0 = -log(1 + e^s) and log p1 = -log(1 + e^-s): evaluated
        # directly in log space so the tails keep their precision instead of
        # collapsing to log(0) or log(1).
        return np.array([-np.logaddexp(0, scores),
                         -np.logaddexp(0, -scores)]).transpose()
def pdf_estimate(images, labels, W, method, t):
    """
    Uses kernel density estimation to compute the pdf of neural activation data.

    Args:
        images (numpy array): input images
        labels (numpy array): input labels associated with the neuron activations
        W (numpy array): weights of the hidden neurons
        method (str): method to approximate the pdf; one of 'full',
            'subsample' or 'fit'
        t (float): temperature of the softmax when the network was trained

    returns:
        (list of regressor or kde objects): list of marginal pdfs
        (regressor or kde object): pdf
        (numpy array): labels of the data points used to compute the pdf (useful to compute prior)

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    if method not in ('full', 'subsample', 'fit'):
        raise ValueError(
            "method must be 'full', 'subsample' or 'fit', got %r" % (method,))

    def _fit_kde(samples):
        # Single place for the KDE settings shared by every estimate below.
        return KernelDensity(bandwidth=5e-1, kernel='gaussian',
                             rtol=1e-100).fit(samples)

    classes = np.unique(labels)
    n_trials = len(labels)

    # computes the activation of the hidden neurons for the given input images
    activ = ex.propagate_layerwise(images, W, t=t)

    # Both index sets are always drawn, in this order, so the global NumPy
    # random stream advances the same way for every method. Sizes are capped
    # at n_trials because sampling without replacement cannot exceed it.
    n_subsample = min(1000, n_trials)  # points used by 'subsample' and 'fit'
    subsample_idx = np.random.choice(n_trials, size=n_subsample, replace=False)
    activ_subs = activ[subsample_idx, :]

    n_train_fit = min(500, n_trials)  # points the 'fit' regressors are trained on
    train_fit_idx = np.random.choice(n_trials, size=n_train_fit, replace=False)
    activ_fit = activ[train_fit_idx, :]

    if method == 'full':
        pdf_labels = np.copy(labels)
        pdf_evidence = _fit_kde(activ)
        pdf_marginals = [_fit_kde(activ[pdf_labels == c]) for c in classes]

    elif method == 'subsample':
        pdf_labels = labels[subsample_idx]
        pdf_evidence = _fit_kde(activ_subs)
        pdf_marginals = [_fit_kde(activ_subs[pdf_labels == c])
                         for c in classes]

    else:  # method == 'fit'
        # A nearest-neighbour regressor is trained to reproduce the KDE
        # log-density at the activ_fit points and returned in place of the KDE.
        pdf_labels = labels[subsample_idx]
        pdf_evidence_full = _fit_kde(activ_subs)
        pdf_evidence = KNeighborsRegressor().fit(
            activ_fit, pdf_evidence_full.score_samples(activ_fit))
        pdf_marginals = []
        for c in classes:
            pdf_marginal_full = _fit_kde(activ_subs[pdf_labels == c])
            pdf_marginals.append(KNeighborsRegressor().fit(
                activ_fit, pdf_marginal_full.score_samples(activ_fit)))

    return pdf_marginals, pdf_evidence, pdf_labels
def initialize_optimization_plot(self):
    """Precompute the evaluation set and its density curves for plotting.

    Does nothing (beyond printing a hint) when ``self.dataset`` or
    ``self.parameters`` is None. Otherwise fills ``self.evaluation_set``
    with the transposed decoy couplings and ``self.evaluation_set_kde`` with
    Gaussian KDE curves on a fixed grid for a fixed list of coupling indices,
    plus a curve for samples drawn from the regularizer.
    """
    if self.dataset is None:
        print("Set data first before initializing plot options!")
        return
    if self.parameters is None:
        print("Parameter needs to be set!")
        return

    pair_names = ('A-A', 'C-C', 'E-R', 'R-E', 'K-E', 'E-E', 'K-K', 'K-R',
                  'V-I', 'I-L', 'S-T', 'S-S', 'K-P', 'N-N', 'W-W', 'G-F')
    ab_list = [AB_INDICES[pair] for pair in pair_names]

    contacts, noncontacts, avg_lambda_pair = self.dataset.get_decoy_set(
        size=self.size_evaluationset)
    self.evaluation_set['contact'] = np.array(contacts).transpose()
    self.evaluation_set['bg'] = np.array(noncontacts).transpose()

    x_grid = np.linspace(-0.5, 0.5, 500)
    grid_column = x_grid.reshape(-1, 1)

    def density_on_grid(values, kernel_width):
        # Gaussian KDE of a 1-D sample, evaluated on the shared grid.
        estimator = KernelDensity(kernel='gaussian', bandwidth=kernel_width)
        estimator.fit(values.reshape(-1, 1))
        return np.exp(estimator.score_samples(grid_column))

    self.evaluation_set_kde = {'x_grid': x_grid, 'contact': {}, 'bg': {}}

    # kernel density estimate for couplings wijab
    bandwidth = 0.01
    for ab in ab_list:
        contact_curve = density_on_grid(self.evaluation_set['contact'][ab],
                                        bandwidth)
        bg_curve = density_on_grid(self.evaluation_set['bg'][ab], bandwidth)
        self.evaluation_set_kde['contact'][ab] = contact_curve
        self.evaluation_set_kde['bg'][ab] = bg_curve

    # sample points according to regularizer
    std_dev = np.sqrt(1.0/avg_lambda_pair)
    regularizer = np.random.normal(scale=std_dev, size=10000)
    self.evaluation_set_kde['regularizer'] = density_on_grid(regularizer, 0.1)
def test1():
    """Fit a 1-D Gaussian KDE (bandwidth 0.4) and print two density vectors.

    The first vector is the density at the training points themselves, the
    second the density of the same model at a different set of points.
    Written so that it runs unchanged on Python 2 and Python 3.
    """
    X = [[1], [2], [4], [3], [2], [8], [8], [9], [10], [12], [11], [9]]
    kde = KernelDensity(kernel='gaussian', bandwidth=0.4).fit(X)

    # score_samples returns log-densities; exponentiate them in place.
    scores = kde.score_samples(X)
    for i, log_density in enumerate(scores):
        scores[i] = math.exp(log_density)
    print(scores)

    Y = [[1], [2], [2], [1], [5], [6], [6], [7], [9], [10], [8], [7]]
    density = kde.score_samples(Y)
    for i, log_density in enumerate(density):
        density[i] = math.exp(log_density)
    print(density)
def get_P_binary_v_tot(proj_sep, delta_v_tot, num_sys=100000):
    """ Evaluate the density of the simulated binary population at the
    observed projected separation and total velocity difference.

    Parameters
    ----------
    proj_sep : float or 1-D array
        Projected separation between two stars
    delta_v_tot : float or 1-D array
        Total velocity difference between two stars
    num_sys : int
        Passed to ``generate_binary_set`` when the module-level
        ``binary_set`` has not been generated yet

    Scalars and arrays may be mixed; they are broadcast against each other.

    Returns
    -------
    P(proj_sep, delta_v_tot) : numpy array
        Density that angular separation, pm+RV difference is due to a
        genuine binary. Scalar inputs give an array of length 1.

    """
    global binary_set
    global binary_v_tot_kde

    # Catalog check
    if binary_set is None:
        generate_binary_set(num_sys=num_sys)

    # Lazily build a tophat KDE; it is cached in the module-level global.
    # We work in log space for the set of binaries
    if binary_v_tot_kde is None:
        binary_v_tot_kde = KernelDensity(bandwidth=0.1, kernel='tophat')
        binary_v_tot_kde.fit(
            np.array([np.log10(binary_set['proj_sep']),
                      np.log10(binary_set['delta_v_tot'])]).T)

    proj_sep = np.asarray(proj_sep, dtype=float)
    delta_v_tot = np.asarray(delta_v_tot, dtype=float)

    # score_samples needs an (n_points, 2) array; broadcasting covers every
    # scalar/array combination, including a lone scalar pair.
    log_sep, log_dv = np.broadcast_arrays(np.atleast_1d(np.log10(proj_sep)),
                                          np.atleast_1d(np.log10(delta_v_tot)))
    values = np.column_stack((log_sep, log_dv))
    prob_binary = np.exp(binary_v_tot_kde.score_samples(values))

    # Convert back from log10-space to linear-space
    # the log(10) terms convert from log10 to ln
    prob_binary = prob_binary / (proj_sep*np.log(10.)) / (delta_v_tot*np.log(10.))

    return prob_binary
def plot_agglomerative():
    """Draw the merge hierarchy of agglomerative clustering on a toy dataset.

    Sixteen blob points are scattered and numbered on the current axes. The
    clustering is then re-fitted for every cluster count from 16 down to 2,
    and each cluster with more than one member is outlined by a contour of a
    KDE fitted to its points.
    """
    from sklearn.datasets import make_blobs
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.neighbors import KernelDensity
    import matplotlib.pyplot as plt
    import numpy as np

    m = 16
    k = 3
    X, _ = make_blobs(n_samples=m, n_features=2, centers=k, cluster_std=1.3,
                      random_state=2255)

    agg = AgglomerativeClustering(n_clusters=k)

    # Evaluation grid covering the data plus a margin of half a std.
    eps = X.std() / 2.
    x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps
    y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))
    gridpoints = np.c_[xx.ravel(), yy.ravel()]

    ax = plt.gca()
    for i, x in enumerate(X):
        ax.text(x[0] + .1, x[1], "%d" % i, horizontalalignment='left',
                verticalalignment='center')

    ax.scatter(X[:, 0], X[:, 1], s=20, c='grey')
    ax.set_xticks(())
    ax.set_yticks(())

    for i in range(m - 1):
        agg.n_clusters = X.shape[0] - i
        agg.fit(X)

        bins = np.bincount(agg.labels_)
        for cluster in range(agg.n_clusters):
            if bins[cluster] > 1:
                points = X[agg.labels_ == cluster]
                other_points = X[agg.labels_ != cluster]

                kde = KernelDensity(bandwidth=0.9).fit(points)
                scores = kde.score_samples(gridpoints)
                # Contour level between the least dense member and the most
                # dense outsider, weighted 80/20 towards the member.
                score_inside = np.min(kde.score_samples(points))
                score_outside = np.max(kde.score_samples(other_points))
                levels = .80 * score_inside + .20 * score_outside
                ax.contour(xx, yy, scores.reshape(100, 100), levels=[levels],
                           colors='k', linestyles='solid', linewidths=0.8)

    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
def draw_posterior_kld_hist(X_kld, X_vae, f_name, bins=25):
    """
    Plot KDE-smoothed histograms.

    X_kld and X_vae are flattened, each smoothed with a Gaussian KDE whose
    bandwidth is (data range / bins), and drawn on one axis as 'ORK' (solid)
    and 'VAR' (dashed). The figure is written to f_name as a PDF.
    """
    import matplotlib.pyplot as plt
    # make a figure and configure an axis
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.set_xlabel('Posterior KLd Density')
    ax.set_title('Posterior KLds: Over-regularized vs. Standard')
    # No ax.hold(True): axes accumulate artists by default and the method
    # has been removed from Matplotlib.
    for (X, style, label) in [(X_kld, '-', 'ORK'), (X_vae, '--', 'VAR')]:
        X_samp = X.ravel()[:, np.newaxis]
        X_min = np.min(X_samp)
        X_max = np.max(X_samp)
        X_range = X_max - X_min
        sigma = X_range / float(bins)
        # Extend the plotted interval by a quarter of the range on each side.
        plot_min = X_min - (X_range/4.0)
        plot_max = X_max + (X_range/4.0)
        plot_X = np.linspace(plot_min, plot_max, 1000)[:, np.newaxis]
        # make a kernel density estimator for the data in X
        kde = KernelDensity(kernel='gaussian', bandwidth=sigma).fit(X_samp)
        ax.plot(plot_X, np.exp(kde.score_samples(plot_X)), linestyle=style,
                label=label)
    ax.legend()
    # ``papertype`` and ``frameon`` were only ever passed as None and are
    # rejected by current Matplotlib, so they are no longer forwarded.
    fig.savefig(f_name, dpi=None, facecolor='w', edgecolor='w',
                orientation='portrait', format='pdf',
                transparent=False, bbox_inches=None, pad_inches=0.1)
    plt.close(fig)
    return
def kde_opt4(df_cell_train_feats, y_train, df_cell_test_feats):
    """Score test rows per class with a product of per-feature 1-D KDEs.

    Returns an array of shape (n_test, n_class) whose column ``i`` is the
    product, over the prepared features, of the density of each test value
    under a KDE fitted to the training rows with ``y_train == i``.
    """
    def prepare_feats(df):
        # Feature frame: raw hour, fractional weekday, log10 accuracy, x, y.
        feats = pd.DataFrame()
        feats["hour"] = df["hour"]
        feats["weekday"] = df["weekday"] + df["hour"] / 24.
        feats["accuracy"] = np.log10(df["accuracy"])
        feats["x"] = df["x"]
        feats["y"] = df["y"]
        return feats

    logging.info("train kde_opt4 model")
    train_feats = prepare_feats(df_cell_train_feats)
    test_feats = prepare_feats(df_cell_test_feats)
    n_test = len(test_feats)
    n_class = len(np.unique(y_train))
    y_test_pred = np.zeros((n_test, n_class), "d")

    for label in range(n_class):
        class_rows = train_feats[y_train == label]
        likelihood = np.ones(n_test, "d")
        for feat in train_feats.columns.values:
            train_col = class_rows[feat].values
            bgk_result = kdeBGK10(train_col)
            if bgk_result is not None:
                # Bandwidth chosen by kdeBGK10; mesh and density are unused.
                bandwidth, _mesh, _density = bgk_result
                kde = KernelDensity(kernel='gaussian', metric='manhattan',
                                    bandwidth=bandwidth)
                kde.fit(train_col[:, np.newaxis])
                test_col = test_feats[feat].values[:, np.newaxis]
                likelihood *= np.exp(kde.score_samples(test_col))
            else:
                # Fallback: scipy KDE with a shrunken Scott factor.
                scott = gaussian_kde(train_col, "scott")
                kde = gaussian_kde(train_col, scott.factor * 0.741379)
                likelihood *= kde.evaluate(test_feats[feat].values)
        y_test_pred[:, label] += likelihood
    return y_test_pred
def kde_sklearn(data, grid, **kwargs):
    """
    Kernel Density Estimation with Scikit-learn

    Parameters
    ----------
    data : numpy.array
        Points the estimator is fitted on, one row per point.
    grid : numpy.array
        Points at which the density is evaluated, one row per point.
    **kwargs
        Forwarded unchanged to the ``KernelDensity`` constructor.

    Returns
    -------
    out : numpy.array
        Density estimate: the exponential of ``score_samples(grid)``.
    """
    estimator = KernelDensity(**kwargs)
    estimator.fit(data)
    # score_samples() yields log-densities, hence the exponential.
    return np.exp(estimator.score_samples(grid))
def find_kernel(data, numgrid = 1000, bw = 0.002):
    """Fit a Gaussian KDE to the first two columns of data and plot it.

    Parameters
    ----------
    data : 2-D numpy array; only columns 0 and 1 are used for the fit.
    numgrid : number of grid points along each axis.
    bw : kernel bandwidth.

    Returns
    -------
    Z : array of shape (numgrid, numgrid) with the density on the grid.
    """
    Xtrain = data[:, 0:2]
    # The label column is not needed: KernelDensity.fit ignores ``y``.

    # Set up the data grid for the contour plot
    xgrid = np.linspace(-74.1, -73.65, num=numgrid)
    ygrid = np.linspace(40.5, 40.8, num=numgrid)
    X, Y = np.meshgrid(xgrid, ygrid)
    # NOTE(review): evaluation rows are (y, x) pairs, so the first two data
    # columns are presumably ordered (y, x) as well -- confirm with caller.
    xy = np.vstack([Y.ravel(), X.ravel()]).T

    plt.figure()

    # construct a kernel density estimate of the distribution
    kde = KernelDensity(bandwidth=bw, kernel='gaussian')
    kde.fit(Xtrain)

    # evaluate the density on every grid point
    Z = np.exp(kde.score_samples(xy))
    Z = Z.reshape(X.shape)

    # plot contours of the density
    levels = np.linspace(0, Z.max(), 25)
    plt.contourf(X, Y, Z, levels=levels, cmap=plt.cm.Reds)
    plt.title('BK CRIME')
    plt.show()
    return Z
def sklearn_density(sample_points, evaluation_points):
    """
    Estimate the probability density function from which a set of sample
    points was drawn and return the estimated density at the evaluation
    points.

    Both inputs are standardized with the per-column mean and standard
    deviation of *sample_points*; the density found in standardized
    coordinates is divided by the product of the standard deviations to
    express it in the original coordinates.
    """
    from sklearn.neighbors import KernelDensity

    n_points, n_dims = sample_points.shape
    # Silverman bandwidth estimator (for unit-variance data)
    silverman_bw = (n_points * (n_dims + 2) / 4.)**(-1. / (n_dims + 4))

    # Standardize so that one bandwidth suits every dimension.
    centre = mean(sample_points, axis=0)
    spread = std(sample_points, axis=0)
    scaled_samples = (sample_points - centre)/spread
    scaled_targets = (evaluation_points - centre)/spread

    estimator = KernelDensity(kernel='gaussian', bandwidth=silverman_bw,
                              rtol=1e-6, atol=1e-6)
    estimator.fit(scaled_samples)
    density = exp(estimator.score_samples(scaled_targets))
    # undo the x scaling on the data points
    return density/np.prod(spread)
def sklearn_kde(data, points):
    """
    Gaussian KDE of *data* evaluated at *points*.

    Both arrays are standardized with the per-column mean and standard
    deviation of *data* before fitting. The returned values are the density
    in those standardized coordinates; no rescaling by the standard
    deviations is applied.
    """
    from sklearn.neighbors import KernelDensity

    n_points, n_dims = data.shape
    # Silverman bandwidth estimator (for unit-variance data)
    silverman_bw = (n_points * (n_dims + 2) / 4.)**(-1. / (n_dims + 4))

    # standardize data so that we can use uniform bandwidth
    centre = mean(data, axis=0)
    spread = std(data, axis=0)
    scaled_data = (data - centre)/spread
    scaled_points = (points - centre)/spread

    estimator = KernelDensity(kernel='gaussian', bandwidth=silverman_bw,
                              rtol=1e-6, atol=1e-6)
    estimator.fit(scaled_data)
    return exp(estimator.score_samples(scaled_points))
def get_density_based_best_sample(X, known_votes, possibilities):
    """Pick the next document index by its density under a vote-weighted KDE.

    Parameters: X is a sparse matrix (``toarray()`` is called on it) with one
    row per document; ``known_votes[i]`` is the collection of votes for
    document i; ``possibilities`` is an iterable whose items carry a
    candidate index in position 0.

    The KDE is fitted on every document row plus one extra copy of row i per
    known vote on document i. Returns the first candidate index in score
    order, or None when no candidate is found.
    """
    total_votes = sum(len(votes) for votes in known_votes)
    print(total_votes)
    X = X.toarray()
    original_docs = len(X)
    possibilities = set(x[0] for x in possibilities)

    # One extra copy of row i per vote on document i, built in a single call
    # instead of growing the array row by row.
    vote_counts = numpy.asarray([len(votes) for votes in known_votes],
                                dtype=int)
    voted_rows = numpy.repeat(X[:len(vote_counts)], vote_counts, axis=0)
    current_vectors = numpy.concatenate((X, voted_rows), axis=0)

    model = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(current_vectors)
    scores = model.score_samples(X)

    # NOTE(review): the original comments labelled these two branches the
    # other way round ("explore low density" here). The comments below
    # describe what the sort order actually does -- confirm the intent.
    if (total_votes % 3):
        # Vote count not divisible by 3: highest-density documents first.
        sorted_scores = sorted(enumerate(scores), key=lambda x: x[1],
                               reverse=True)
    else:
        # Vote count divisible by 3: lowest-density documents first.
        sorted_scores = sorted(enumerate(scores), key=lambda x: x[1])

    for i in range(original_docs):
        if sorted_scores[i][0] in possibilities:
            return sorted_scores[i][0]
    return None
def plot_kde_histogram2(X1, X2, f_name, bins=25):
    """
    Plot KDE-smoothed histogram of the data in X1/X2. Assume data is 1D.

    Each array is flattened and smoothed with a Gaussian KDE whose bandwidth
    is (data range / bins); X1 is drawn solid, X2 dashed. The figure is
    written to f_name.
    """
    import matplotlib.pyplot as plt
    # make a figure and configure an axis
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # No ax.hold(True): axes accumulate artists by default and the method
    # has been removed from Matplotlib.
    for (X, style) in [(X1, '-'), (X2, '--')]:
        X_samp = X.ravel()[:, np.newaxis]
        X_min = np.min(X_samp)
        X_max = np.max(X_samp)
        X_range = X_max - X_min
        sigma = X_range / float(bins)
        # Extend the plotted interval by a third of the range on each side.
        plot_min = X_min - (X_range/3.0)
        plot_max = X_max + (X_range/3.0)
        plot_X = np.linspace(plot_min, plot_max, 1000)[:, np.newaxis]
        # make a kernel density estimator for the data in X
        kde = KernelDensity(kernel='gaussian', bandwidth=sigma).fit(X_samp)
        ax.plot(plot_X, np.exp(kde.score_samples(plot_X)), linestyle=style)
    # ``papertype`` and ``frameon`` were only ever passed as None and are
    # rejected by current Matplotlib, so they are no longer forwarded.
    fig.savefig(f_name, dpi=None, facecolor='w', edgecolor='w',
                orientation='portrait', format=None,
                transparent=False, bbox_inches=None, pad_inches=0.1)
    plt.close(fig)
    return
def kde_sklearn(x, x_grid, bandwidth=0.2, **kwargs):
    """Kernel Density Estimation with Scikit-learn

    Fits a ``KernelDensity`` (extra keyword arguments are forwarded to its
    constructor) on the 1-D sample ``x`` and returns the density evaluated
    at the 1-D points ``x_grid``.
    """
    estimator = KernelDensity(bandwidth=bandwidth, **kwargs)
    # Both inputs are turned into single-column 2-D arrays for the estimator.
    estimator.fit(x[:, np.newaxis])
    log_density = estimator.score_samples(x_grid[:, np.newaxis])
    # score_samples() yields log-densities, hence the exponential.
    return np.exp(log_density)
def xy_kde(xy,bandwidth,N_grid=100,levels=[0.8,0.6,0.4,0.2]):
    """Gaussian KDE of 2-D points on a regular grid, with contour levels.

    Parameters
    ----------
    xy : array of shape (n, 2)
    bandwidth : kernel bandwidth
    N_grid : number of grid cells along each axis
    levels : cumulative-mass fractions for which density thresholds are
        computed (the default list is only read, never modified)

    Returns
    -------
    H : (N_grid, N_grid) density at the cell centres
    V : sorted density thresholds, one per entry of ``levels``
    x_grid, y_grid : meshgrid of the cell centres
    bandwidth : the bandwidth that was passed in
    """
    x_edges = np.linspace(np.min(xy[:,0]),np.max(xy[:,0]),N_grid+1)
    y_edges = np.linspace(np.min(xy[:,1]),np.max(xy[:,1]),N_grid+1)
    # Cell centres: left edge plus half the cell width.
    x_centres = x_edges[:-1] + (x_edges[1:] - x_edges[:-1])/2
    y_centres = y_edges[:-1] + (y_edges[1:] - y_edges[:-1])/2
    x_grid, y_grid = np.meshgrid(x_centres,y_centres)
    xy_grid = np.column_stack((np.ravel(x_grid), np.ravel(y_grid)))

    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(xy)
    H = np.exp(kde.score_samples(xy_grid).reshape(N_grid,N_grid))

    # this bit is taken from the corner_plot.py method.
    # Densities in descending order and their normalized running sum.
    Hflat = np.sort(H.ravel())[::-1]
    sm = np.cumsum(Hflat)
    sm /= sm[-1]
    V = np.empty(len(levels))
    for i, v0 in enumerate(levels):
        enclosed = Hflat[sm <= v0]
        # No cell below the requested fraction: fall back to the peak value.
        V[i] = enclosed[-1] if enclosed.size else Hflat[0]
    V = np.sort(V)

    return H, V, x_grid, y_grid, bandwidth
def plot_kde_histogram(X, f_name, bins=25):
    """
    Plot KDE-smoothed histogram of the data in X. Assume data is univariate.

    At most 1,000,000 randomly chosen values are used. The Gaussian KDE
    bandwidth is (data range / bins). The figure is written to f_name. The
    caller's array is left untouched.
    """
    import matplotlib.pyplot as plt
    # flatten() always copies, so the shuffle below cannot reorder the
    # caller's data (ravel() may return a view onto it).
    X = X.flatten()
    np.random.shuffle(X)
    X = X[0:min(X.shape[0], 1000000)]
    X_samp = X[:, np.newaxis]
    X_min = np.min(X_samp)
    X_max = np.max(X_samp)
    X_range = X_max - X_min
    sigma = X_range / float(bins)
    # Extend the plotted interval by a third of the range on each side.
    plot_min = X_min - (X_range/3.0)
    plot_max = X_max + (X_range/3.0)
    plot_X = np.linspace(plot_min, plot_max, 1000)[:, np.newaxis]
    # make a kernel density estimator for the data in X
    kde = KernelDensity(kernel='gaussian', bandwidth=sigma).fit(X_samp)
    # make a figure
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(plot_X, np.exp(kde.score_samples(plot_X)))
    # ``papertype`` and ``frameon`` were only ever passed as None and are
    # rejected by current Matplotlib, so they are no longer forwarded.
    fig.savefig(f_name, dpi=None, facecolor='w', edgecolor='w',
                orientation='portrait', format=None,
                transparent=False, bbox_inches=None, pad_inches=0.1)
    plt.close(fig)
    return
def plot_sklearn_kde(df, support, column='AirTime', bins=50):
    """
    Plots a KDE and a histogram using sklearn.KernelDensity.
    Uses Gaussian kernels.
    The bandwidth is the value returned by ``get_silverman_bandwidth``.

    Parameters
    ----------
    df: A pandas.DataFrame
    support: A 1-d numpy array.
             Input data points for the probability density function.
    column: Name of the column of df to plot.
    bins: Number of histogram bins.

    Returns
    -------
    A matplotlib.axes.Axes instance.
    """
    bw = get_silverman_bandwidth(df, column)
    kde = KernelDensity(kernel='gaussian', bandwidth=bw)

    # Plain ndarrays: ``Series[:, np.newaxis]`` is rejected by current pandas.
    x = np.asarray(df[column])
    support = np.asarray(support)

    kde.fit(x[:, np.newaxis])
    y = kde.score_samples(support[:, np.newaxis])

    fig, ax = plt.subplots(figsize=(8, 5))
    # ``density`` replaces the removed ``normed`` keyword.
    ax.hist(np.ravel(x), bins=bins, alpha=0.5,
            color=sns.xkcd_rgb["denim blue"], density=True)
    ax.plot(support, np.exp(y))
    ax.set_xlabel(column, fontsize=14)
    ax.set_ylabel('Density', fontsize=14)
    ax.set_title('Kernel Density Plot', fontsize=14)
    sns.despine(ax=ax, offset=5, trim=True)
    return ax
def max_prob(df):
    """Return, for every row of df, the location of its KDE density peak.

    Each row's non-NaN values are smoothed with a default ``KernelDensity``
    and evaluated on 50 evenly spaced points between the row minimum and
    maximum; the grid point with the highest density (densities rounded to 4
    decimals, first maximum wins) is reported, rounded to 4 decimals. Rows
    without any value yield NaN.

    Returns a list with one entry per row, in row order. The input frame is
    not modified.
    """
    centres = []
    # iterrows visits every row once, also when index labels repeat.
    for _, row in df.iterrows():
        d = row.dropna().values
        if len(d) == 0:
            # ``np.nan`` is the spelling available in every NumPy release.
            centres.append(np.nan)
            continue

        x_grid = np.linspace(d.min(), d.max(), 50).reshape(-1, 1)
        kde = KernelDensity().fit(d.reshape(-1, 1))
        vals = np.exp(kde.score_samples(x_grid)).round(4)

        centre = x_grid[vals.argmax()][0]
        # TODO first element adds unnecessary decimal places (use decimal
        # places class to fix)
        centres.append(round(centre, 4))
    return centres
def test2():
    """Plot the log-density of a Gaussian KDE fitted to three stacked ramps.

    The training sample is the concatenation of three evenly spaced
    10-point ranges; the curve shown is the raw ``score_samples`` output
    (log-density) on 1000 points in [0, 10].
    """
    ramps = (
        np.linspace(0, 10, 10),
        np.linspace(2, 4, 10),
        np.linspace(7, 10, 10),
    )
    training = np.concatenate(ramps)[:, np.newaxis]
    estimator = KernelDensity(kernel='gaussian', bandwidth=0.75)
    estimator = estimator.fit(training)

    grid = np.linspace(0, 10, 1000)[:, np.newaxis]
    plt.plot(grid, estimator.score_samples(grid))
    plt.show()
def surface_density(c, bandwidth=0.2, grid_step=0.02):
    """
    Given particle positions as a coordinate object, compute the
    surface density using a kernel density estimate.

    Parameters
    ----------
    c : coordinate object; ``c.l.degree`` and ``c.b.degree`` are read.
    bandwidth : bandwidth of the Epanechnikov kernel.
    grid_step : spacing of the fixed evaluation grid.

    Returns
    -------
    grid : array of shape (n_grid_points, 2) with the evaluation positions.
    log_dens : log10 of the density, shaped like the meshgrid; ``-inf``
        wherever the estimated density is exactly zero.

    Raises
    ------
    ImportError : if the module-level ``HAS_SKLEARN`` flag is false.
    """
    if not HAS_SKLEARN:
        raise ImportError("scikit-learn is required to use this function.")

    xgrid = np.arange(2., 9.+0.1, grid_step)  # deg
    ygrid = np.arange(26.5, 33.5+0.1, grid_step)  # deg
    meshies = np.meshgrid(xgrid, ygrid)
    # vstack needs a real sequence; a bare map() is an iterator on Python 3.
    grid = np.vstack([np.ravel(mesh) for mesh in meshies]).T

    x = c.l.degree
    y = c.b.degree
    skypos = np.vstack((x, y)).T

    kde = KernelDensity(bandwidth=bandwidth, kernel='epanechnikov')
    kde.fit(skypos)

    dens = np.exp(kde.score_samples(grid)).reshape(meshies[0].shape)
    # Zero density is expected away from the particles; keep the -inf result
    # but do not emit a divide-by-zero warning for it.
    with np.errstate(divide='ignore'):
        log_dens = np.log10(dens)

    return grid, log_dens
def kdescatter(xs, ys, log_color=False, atol=1e-4, rtol=1e-4, n_jobs=1,
               n_samp_scaling=100, n_samp_tuning=1000, ax=None, **kwargs):
    """Scatter plot of (xs, ys) coloured by a KDE estimate of point density.

    The bandwidth is picked by a grid search over ten log-spaced multiples of
    the root median squared pairwise distance of a random subsample. Points
    are coloured by density, or by log-density when ``log_color`` is true.
    ``ax`` defaults to ``matplotlib.pyplot``; remaining keyword arguments go
    to ``scatter``. ``atol`` and ``rtol`` are accepted but not used here.
    """
    if ax is None:
        import matplotlib.pyplot as plt
        ax = plt

    # Scatter defaults, only applied where the caller gave no value.
    for option, value in (('linewidths', 0), ('s', 20), ('cmap', 'winter')):
        kwargs.setdefault(option, value)

    points = np.asarray([xs, ys]).T
    n_points = points.shape[0]

    # Length scale from a random subsample of the points.
    scale_idx = np.random.choice(n_points, min(n_samp_scaling, n_points),
                                 replace=False)
    median_sqdist = np.median(
        euclidean_distances(points[scale_idx], squared=True))
    candidate_bws = np.logspace(-2, 2, num=10) * np.sqrt(median_sqdist)

    # Cross-validated bandwidth choice on a second random subsample.
    search = GridSearchCV(KernelDensity(), {'bandwidth': candidate_bws},
                          n_jobs=n_jobs)
    tune_idx = np.random.choice(n_points, min(n_samp_tuning, n_points),
                                replace=False)
    search.fit(points[tune_idx])

    estimator = KernelDensity(bandwidth=search.best_params_['bandwidth'])
    estimator.fit(points)
    colours = estimator.score_samples(points)
    if not log_color:
        colours = np.exp(colours)
    ax.scatter(xs, ys, c=colours, **kwargs)
def KDE_plt(categories,inter_arrivals):
    """Fit and plot one Gaussian KDE (bandwidth 4) per category.

    For category index i the samples come from
    ``extract_cat_samples(inter_arrivals, categories, i)``. The density curve
    and a normalized histogram are saved to
    ``./app/static/img/cat_result/kde/kdeplt_cat<i>.png``.

    Returns the list of fitted estimators, one per category, in order.
    """
    KDEs = []
    for i in range(len(categories)):
        # Extract once and reuse for both the fit and the histogram.
        cat_samples = extract_cat_samples(inter_arrivals,categories,i)
        X = np.asarray(cat_samples)  # for single inter-arrivals in a category
        kde = KernelDensity(kernel='gaussian', bandwidth=4).fit(X)
        KDEs.append(kde)  # to use for prob_return()

        max_sample = max_interarrival_mean(categories,inter_arrivals,i)
        X_plot = np.linspace(0,1.5*max_sample,2000)[:, np.newaxis]
        log_dens = kde.score_samples(X_plot)

        fig = plt.figure(i)
        plt.plot(X_plot[:, 0], np.exp(log_dens), '-',
                 label="kernel = '{0}'".format('gaussian'))
        # alpha, from 0 (transparent) to 1 (opaque); ``density`` replaces the
        # removed ``normed`` keyword.
        plt.hist(combine_inner_lists(cat_samples), bins=40, density=True,
                 color="cyan", alpha=.3, label="histogram")
        plt.xlabel("inter-arrival time (days)")
        plt.ylabel("PDF")
        plt.legend()
        # dump result into kde folder
        save_as = './app/static/img/cat_result/kde/kdeplt_cat'+str(i)+'.png'
        plt.savefig(save_as)
        plt.show(block=False)
        plt.close(fig)
    return KDEs
def EstimateDensity(self,name,df,histogram,f,s,ax):
    """Plot the distribution of column s for each value 0..4 of column f.

    Parameters
    ----------
    name : pair of strings; ``name[0]`` labels the groups, ``name[1]``
        starts the title.
    df : data frame holding columns f and s.
    histogram : if true, draw one grouped 2-bin normalized histogram via
        ``pl.hist``; otherwise draw one exponential-kernel KDE curve per
        group on ``ax``.
    f, s : names of the grouping column and the plotted column.
    ax : axes that receive the curves, the legend and the title.

    Groups without rows are skipped.

    NOTE(review): the legend/title calls are placed after both branches;
    the nesting was reconstructed from a whitespace-collapsed source --
    confirm against the original file.
    """
    # if the desired output is in Histogram format
    if histogram:
        finRes = []
        lab = []
        for i in range(5):
            res = np.array(df[df[f] == i][s])
            if res.shape[0] > 0:
                finRes.append(res)
                lab.append(name[0] + ' = ' + str(i))
        # ``density`` replaces the removed ``normed`` keyword.
        pl.hist(finRes, bins=2, density=True, histtype='bar', label=lab)

    # if the desired output is simple plot
    else:
        # Fixed evaluation grid shared by every group.
        X_plot = np.array(np.linspace(-1, 5, 20)).reshape(20, 1)
        for i in range(5):
            res = np.array(df[df[f] == i][s])
            if res.shape[0] > 0:
                res = res.reshape(res.shape[0], 1)
                kde = KernelDensity(kernel='exponential', bandwidth=0.05)
                kde.fit(res)
                log_dens = kde.score_samples(X_plot)
                ax.plot(X_plot, np.exp(log_dens),
                        label=name[0] + ' = ' + str(i))

    ax.legend()
    ax.set_title(name[1] + " distrubution for changing " + name[0])
def kde(self, term, bandwidth=2000, samples=1000, kernel='gaussian'):
    """
    Estimate the kernel density of the instances of term in the text.

    Args:
        term (str): Key into ``self.terms``, whose value holds the offsets
            of the term.
        bandwidth (int): The kernel bandwidth.
        samples (int): The number of evenly-spaced sample points.
        kernel (str): The kernel function.

    Returns:
        np.array: The density at each sample point, multiplied by
        ``len(self.tokens) / samples``.
    """
    # Offsets of the term, as a single-column array.
    offsets = np.array(self.terms[term])[:, np.newaxis]
    estimator = KernelDensity(kernel=kernel, bandwidth=bandwidth)
    estimator = estimator.fit(offsets)

    # Evenly spaced positions from 0 to the token count.
    n_tokens = len(self.tokens)
    x_axis = np.linspace(0, n_tokens, samples)[:, np.newaxis]
    log_density = estimator.score_samples(x_axis)

    # Scale by the spacing between sample positions.
    return np.exp(log_density) * (n_tokens / samples)
def art_qi2(img, airmask, min_voxels=int(1e3), max_voxels=int(3e5),
            save_plot=True):
    r"""
    Compute :math:`\text{QI}_2`: the goodness-of-fit of a centered
    :math:`\chi^2` distribution to the intensity distribution of the
    non-artifactual background (voxels inside the "hat" mask):

    .. math ::

        \chi^2_n = \frac{2}{(\sigma \sqrt{2})^{2n} \, (n - 1)!}x^{2n - 1}\,
        e^{-\frac{x}{2}}

    where :math:`n` is the number of coil elements.

    :param numpy.ndarray img: input data
    :param numpy.ndarray airmask: input air mask without artifacts
    :param int min_voxels: below this many positive background voxels the
        function returns ``0.0`` without fitting
    :param int max_voxels: at or above this many voxels a random subsample
        of this size is modelled instead of the full data
    :param bool save_plot: when true, ``plot_qi2`` is called and its return
        value replaces the placeholder file name
    :return: tuple ``(gof, out_file)``
    """
    from sklearn.neighbors import KernelDensity
    from scipy.stats import chi2
    from mriqc.viz.misc import plot_qi2

    # Fixed seed for the random subsample below (S. Ogawa was born).
    np.random.seed(1191935)

    # Keep strictly positive intensities inside the air mask.
    background = img[airmask > 0]
    background = background[background > 0]

    # Placeholder figure, written up front so out_file always exists.
    out_file = op.abspath('error.svg')
    with open(out_file, 'w') as ofh:
        ofh.write('<p>Background noise fitting could not be plotted.</p>')

    if len(background) < min_voxels:
        return 0.0, out_file

    if len(background) < max_voxels:
        modelx = background
    else:
        modelx = np.random.choice(background, size=max_voxels)

    x_grid = np.linspace(0.0, np.percentile(background, 99), 1000)

    # Estimate the data pdf with a KDE on the (possibly subsampled) data.
    bandwidth = 0.05 * np.percentile(background, 98)
    kde_skl = KernelDensity(bandwidth=bandwidth, kernel='gaussian')
    kde_skl.fit(modelx[:, np.newaxis])
    kde = np.exp(kde_skl.score_samples(x_grid[:, np.newaxis]))

    # Cutoff: index, counted from the right end of the grid, of the first
    # point whose density exceeds half of the maximum.
    kdethi = np.argmax(kde[::-1] > kde.max() * 0.5)

    # Fit the chi-squared distribution on the lower 95% of the data.
    fit_sample = modelx[modelx < np.percentile(background, 95)]
    param = chi2.fit(fit_sample, 32)
    chi_pdf = chi2.pdf(x_grid, *param[:-2], loc=param[-2], scale=param[-1])

    # Goodness-of-fit: mean absolute difference over the last kdethi points.
    tail_error = np.abs(kde[-kdethi:] - chi_pdf[-kdethi:])
    gof = float(tail_error.mean())

    if save_plot:
        out_file = plot_qi2(x_grid, kde, chi_pdf, modelx, kdethi)

    return gof, out_file
def test_kde_sample_weights():
    """Check how ``sample_weight`` affects ``KernelDensity`` scores and samples.

    For several dimensionalities, algorithms and metrics it verifies that
    constant weights change nothing, that integer weights match repeating
    the rows, that weights have a visible effect, and that rescaling the
    weights leaves the scores unchanged.
    """
    n_samples = 400
    size_test = 20
    weights_neutral = np.full(n_samples, 3.)

    def fit_score_sample(estimator, data, sample_weight=None):
        # Fit, then return (scores on the test points, one drawn sample).
        estimator.fit(data, sample_weight=sample_weight)
        scores = estimator.score_samples(test_points)
        drawn = estimator.sample(random_state=1234)
        return scores, drawn

    for d in [1, 2, 10]:
        rng = np.random.RandomState(0)
        X = rng.rand(n_samples, d)
        weights = 1 + (10 * X.sum(axis=1)).astype(np.int8)
        X_repetitions = np.repeat(X, weights, axis=0)
        n_samples_test = size_test // d
        test_points = rng.rand(n_samples_test, d)

        for algorithm in ['auto', 'ball_tree', 'kd_tree']:
            for metric in ['euclidean', 'minkowski', 'manhattan',
                           'chebyshev']:
                # Skip metrics that the KD-tree does not accept.
                if algorithm == 'kd_tree' and \
                        metric not in KDTree.valid_metrics:
                    continue

                kde = KernelDensity(algorithm=algorithm, metric=metric)

                # Test that adding a constant sample weight has no effect
                scores_const_weight, sample_const_weight = fit_score_sample(
                    kde, X, weights_neutral)
                scores_no_weight, sample_no_weight = fit_score_sample(kde, X)
                assert_allclose(scores_const_weight, scores_no_weight)
                assert_allclose(sample_const_weight, sample_no_weight)

                # Test equivalence between sampling and (integer) weights
                scores_weight, sample_weight = fit_score_sample(
                    kde, X, weights)
                scores_ref_sampling, sample_ref_sampling = fit_score_sample(
                    kde, X_repetitions)
                assert_allclose(scores_weight, scores_ref_sampling)
                assert_allclose(sample_weight, sample_ref_sampling)

                # Test that sample weights has a non-trivial effect
                diff = np.max(np.abs(scores_no_weight - scores_weight))
                assert diff > 0.001

                # Test invariance with respect to arbitrary scaling
                scale_factor = rng.rand()
                kde.fit(X, sample_weight=(scale_factor * weights))
                scores_scaled_weight = kde.score_samples(test_points)
                assert_allclose(scores_scaled_weight, scores_weight)
fig.subplots_adjust(left=0.05, right=0.95, wspace=0.05)

# One panel per class label (0 and 1).
for i in range(2):
    plt.subplot(1, 2, i + 1)

    # Fit a haversine-metric kernel density estimate on the training
    # points of the current class.
    print(" - computing KDE in spherical coordinates")
    kde = KernelDensity(bandwidth=0.04,
                        metric='haversine',
                        kernel='gaussian',
                        algorithm='ball_tree').fit(Xtrain[ytrain == i])

    # Only land cells are evaluated; every other cell keeps the -9999
    # marker, which indicates ocean.
    Z = np.full(land_mask.shape[0], -9999.0)
    Z[land_mask] = np.exp(kde.score_samples(xy))
    Z = Z.reshape(X.shape)

    # Filled contours of the density, 25 levels from 0 to the maximum.
    levels = np.linspace(0, Z.max(), 25)
    plt.contourf(X, Y, Z, levels=levels, cmap=plt.cm.Reds)

    if basemap:
        print(" - plot coastlines using basemap")
        m = Basemap(projection='cyl',
                    llcrnrlat=Y.min(),
                    urcrnrlat=Y.max(),
                    llcrnrlon=X.min(),
                    urcrnrlon=X.max(),
                    resolution='c')
        m.drawcoastlines()
# Fixed bandwidth is used here instead of the grid-search result:
# kde = KernelDensity(bandwidth=grid.best_params_['bandwidth'],
kde = KernelDensity(bandwidth=1, kernel='gaussian')
# The KDE is fitted on the full sample, before the plotting-only trimming
# applied to Zarr below.
kde.fit(Zarr[:, None])

### Plots
# Remove large values for ease of plotting
Zarr = Zarr[(Zarr < 100) & (Zarr > -100)]

# Common evaluation grid for every curve.
x_d = np.linspace(-100, 100, 1000)
cfit = cauchy.pdf(x_d, loc=loc, scale=sca)
nfit = norm.pdf(x_d, loc=locnorm, scale=scanorm)
tfit = t.pdf(x_d, df=dft, loc=loct, scale=scat)

# score_samples returns log-densities; they are exponentiated when plotted.
logprob_kde = kde.score_samples(x_d[:, None])

# Mixture pdf evaluated point by point and scaled by 1/ncomb.
pdf_cmb_array = []
for x in x_d:
    pdf_cmb_array.append(1 / ncomb * pdf_cmb(x))
pdf_cmb_array = np.array(pdf_cmb_array)

# `density=True` replaces the `normed` keyword, which newer Matplotlib
# releases no longer accept.
_ = plt.hist(Zarr, bins=100, density=True, histtype='step')
plt.plot(x_d, cfit, 'k-')  # Cauchy fit
plt.plot(x_d, nfit, 'k--')  # Normal fit
#plt.plot(x_d,tfit,'k-.')  # Student-t fit
plt.plot(x_d, pdf_cmb_array, 'r--')  # Mixture
plt.fill_between(x_d, np.exp(logprob_kde), alpha=0.5)
plt.savefig(output_dir + 'ws_mean_vs_mld_change.pdf') plt.figure() plt.scatter(mean_highest_max_ws[under_ice], mld_change[under_ice], c='k', s=2) plt.xlabel( 'Average of highest 48 hours of wind speeds between float profile pairs (m/s)' ) plt.ylabel('MLD change (m)') plt.savefig(output_dir + 'ws_highest_vs_mld_change.pdf') plt.figure() mld_axis = arange(-50, 50, 1) kde_without_storms \ = KernelDensity(kernel='gaussian',bandwidth=5.0)\ .fit(mld_change[logical_and(under_ice,num_periods_with_storms == 0)].reshape(-1,1)) log_dens_without_storms = kde_without_storms.score_samples( mld_axis.reshape(-1, 1)) kde_with_storms \ = KernelDensity(kernel='gaussian',bandwidth=5.0)\ .fit(mld_change[logical_and(under_ice,num_periods_with_storms > 0)].reshape(-1,1)) log_dens_with_storms = kde_with_storms.score_samples(mld_axis.reshape(-1, 1)) plt.fill_between(mld_axis, exp(log_dens_without_storms), color='k', alpha=0.5, label='Zero storms', zorder=2) plt.fill_between(mld_axis, exp(log_dens_with_storms), color='r', alpha=0.5, label='One or more storms',
class KDE(KernelDensity, BaseDetector):
    """Outlier detector built on a kernel density estimate.

    ``fit`` trains a separate ``KernelDensity`` stored in ``self.model_``;
    ``decision_function`` returns ``invert_order`` applied to that model's
    ``score_samples`` output.
    """

    def __init__(self,
                 bandwidth=1.0,
                 algorithm='auto',
                 kernel='gaussian',
                 metric="euclidean",
                 atol=0,
                 rtol=0,
                 contamination=0.1,
                 breadth_first=True,
                 leaf_size=40,
                 metric_params=None,
                 random_state=42):
        """Kernel density estimation (KDE)

        Parameters
        ----------
        bandwidth : float
            The bandwidth of the kernel. Must be positive.
        algorithm : str
            The tree algorithm to use. Valid options are
            ['kd_tree'|'ball_tree'|'auto']. Default is 'auto'.
        kernel : str
            The kernel to use. Valid kernels are
            ['gaussian'|'tophat'|'epanechnikov'|'exponential'|'linear'|'cosine']
            Default is 'gaussian'.
        metric : str
            The distance metric to use.
        atol : float
            The desired absolute tolerance of the result. A larger tolerance
            will generally lead to faster execution. Default is 0.
        rtol : float
            The desired relative tolerance of the result.
        contamination : float
            Stored on the instance; not otherwise used inside this class.
            Presumably consumed by ``BaseDetector`` -- TODO confirm.
        breadth_first : bool
            If true (default), use a breadth-first approach to the problem.
            Otherwise use a depth-first approach.
        leaf_size : int
            Specify the leaf size of the underlying tree.
        metric_params : dict
            Additional parameters to be passed to the tree for use with the
            metric.
        random_state : int
            Stored on the instance; not otherwise used inside this class.

        Raises
        ------
        ValueError
            If ``bandwidth`` is not positive or ``kernel`` is not in
            ``VALID_KERNELS``.
        """
        self.algorithm = algorithm
        self.bandwidth = bandwidth
        self.kernel = kernel
        self.metric = metric
        self.atol = atol
        self.rtol = rtol
        self.breadth_first = breadth_first
        self.leaf_size = leaf_size
        self.metric_params = metric_params
        self.contamination = contamination
        self.random_state = random_state

        # Run the algorithm-selection code now so that an invalid
        # algorithm/metric combination raises at construction time.
        # NOTE(review): the remark about clone() below contradicts running
        # this logic in __init__; it looks inherited from another code base.
        # we're using clone() in the GenerativeBayes classifier,
        # so we can't do this kind of logic in __init__
        self._choose_algorithm(self.algorithm, self.metric)

        if bandwidth <= 0:
            raise ValueError("bandwidth must be positive")
        if kernel not in VALID_KERNELS:
            raise ValueError("invalid kernel: '{0}'".format(kernel))

    def fit(self, X_train, y_train=None):
        """Fit KDE.

        Parameters
        ----------
        X_train : numpy array of shape (n_samples, n_features)
            The input samples.

        y_train : numpy array of shape (n_samples,), optional (default=None)
            The ground truth of the input samples (labels). Not used by
            this method.

        Returns
        -------
        self : object
            the fitted estimator.
        """
        X_train = _check_X(X_train)
        # The density model is a separate KernelDensity configured from this
        # instance's parameters; it is fitted instead of `self`.
        self.model_ = KernelDensity(bandwidth=self.bandwidth,
                                    algorithm=self.algorithm,
                                    kernel=self.kernel,
                                    metric=self.metric,
                                    atol=self.atol,
                                    rtol=self.rtol,
                                    breadth_first=self.breadth_first,
                                    leaf_size=self.leaf_size,
                                    metric_params=self.metric_params)
        self.model_.fit(X_train)
        return self

    def decision_function(self, X):
        """Predict raw anomaly scores of X using the fitted detector.

        After invert_order(): the higher the score, the more likely the
        sample is to be predicted as abnormal.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        # check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
        return invert_order(self.model_.score_samples(X))

    def predict_proba(self, X):
        """Not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError
{'bandwidth': bandwidths}, cv=5, verbose=1) grid.fit(Pdarr) print('Best params:', grid.best_params_) # Instantiate and fit the KDE model print("Instantiate and fit the KDE model") kde = KernelDensity(bandwidth=grid.best_params_['bandwidth'], kernel='gaussian') kde.fit(Pdarr) # Score_samples returns the log of the probability density x_d = np.linspace(0, 100, 1000) logprob = kde.score_samples(x_d[:, None]) ### CORRECT DOMONKOS for cat in ['SC012', 'EK6', 'AR3']: Ptorr = data.loc[data['cathode'] == cat, 'totalPressure'] do = data.loc[data['cathode'] == cat, 'orificeDiameter'] * 0.1 data.loc[data['cathode'] == cat, 'pressureDiameter'] = Ptorr * do Pdarr_corr = np.array(data.pressureDiameter) Pdarr_corr = Pdarr_corr[~np.isnan(Pdarr_corr)] ## KERNEL DENSITY # Calculate best kernel density bandwidth bandwidths = 10**np.linspace(-1, 1, 200) grid = GridSearchCV(KernelDensity(kernel='gaussian'), {'bandwidth': bandwidths},
def sample_and_visualize(tp):
    """Sample density and velocity fields on a lat/lon grid for index ``tp``.

    Reads the module-level arrays ``pos_dat``, ``dir_dat`` and ``vel_dat`` at
    index ``tp`` and also uses the module-level names ``sph_rad``,
    ``num_agents`` and ``v_max``.  It estimates a haversine-metric kernel
    density on a 360 x 180 grid, averages the east/north velocity components
    of the agents found within a fixed radius of each grid point, and then
    writes:

    * a three-panel map figure into ``map_folder``,
    * a 3D agent plot into ``abm_folder``,
    * a three-panel sphere overlay into ``overlay_folder``,
    * the sampled features as ``.npy`` into ``npy_folder`` and as ``.mat``
      into ``mat_folder``.

    Every output file is named ``repr(tp).zfill(3)`` plus its extension.

    Args:
        tp: Index into the last axis of the module-level data arrays;
            presumably a time point -- TODO confirm.
    """
    pos = pos_dat[:, :, tp]
    drn = dir_dat[:, :, tp]
    vel = vel_dat[:, tp]

    # convert to spherical coords
    sph_coords = appendSpherical_np(pos)
    theta = sph_coords[:, 4]  # latitude in [-pi/2, pi/2]
    phi = sph_coords[:, 5]  # longitude in [-pi, pi]

    # compute KD-Tree for fast neighbor search
    # note: KD-Tree does not support haversine metric
    kd = KDTree(pos)
    theta_idx = np.argsort(theta)
    theta_sorted = theta[theta_idx]
    phi_idx = np.argsort(phi)
    phi_sorted = phi[phi_idx]

    # compute velocity vector components
    # reference: https://stackoverflow.com/questions/707985/
    # calculate-3d-vector-perpendicular-to-plane-described-by-a-point-and-true-north-h
    origin = [0, 0, 0]
    north_vec = [0, 0, sph_rad]
    east_dir = np.cross(north_vec - pos, pos - origin)  # points due east, tangent to the sphere
    north_dir = np.cross(east_dir, pos - origin)  # points due north, tangent to the sphere
    east_dir = norm_vec_list(east_dir)
    north_dir = norm_vec_list(north_dir)
    # Row-wise dot product of the heading with the east / north unit
    # vectors, then scaled by the speed.
    vec_east = np.multiply(east_dir[:,0], drn[:,0]) + np.multiply(east_dir[:,1], drn[:,1]) + \
        np.multiply(east_dir[:,2], drn[:,2])
    vec_east = np.multiply(vel, vec_east)
    vec_north = np.multiply(north_dir[:,0], drn[:,0]) + np.multiply(north_dir[:,1], drn[:,1]) + \
        np.multiply(north_dir[:,2], drn[:,2])
    vec_north = np.multiply(vel, vec_north)

    # sampling mesh
    sample_theta = np.linspace(-np.pi / 2, np.pi / 2, 180)
    sample_phi = np.linspace(-np.pi, np.pi, 360)
    lats, lons = np.meshgrid(sample_theta, sample_phi)
    lats_flat = lats.ravel()
    lons_flat = lons.ravel()
    # Cartesian coordinates of the mesh points, for the KD-tree query below.
    sample_X = sph_rad * np.multiply(np.sin(lats_flat + np.pi / 2),
                                     np.cos(lons_flat + np.pi))
    sample_Y = sph_rad * np.multiply(np.sin(lats_flat + np.pi / 2),
                                     np.sin(lons_flat + np.pi))
    sample_Z = sph_rad * np.cos(lats_flat + np.pi / 2)

    # KDE
    kde = KernelDensity(bandwidth=0.02, metric='haversine')
    kde.fit(np.vstack([theta, phi]).T,
            sample_weight=np.divide(1.0, num_agents))
    latlon = np.vstack([lats_flat, lons_flat]).T
    density_est = np.exp(kde.score_samples(latlon))
    density_est = density_est.reshape((360, 180))

    # compute velocity at sample points
    vel_east = np.zeros(np.size(sample_X))
    vel_north = np.zeros(np.size(sample_X))
    nearest_neighbor_dist = 0.005
    ind, dist = kd.query_radius(np.vstack([sample_X, sample_Y, sample_Z]).T,
                                nearest_neighbor_dist,
                                count_only=False,
                                return_distance=True)
    # Mean velocity of the agents near each mesh point; points with no
    # neighbor keep the initial 0.
    for i in range(np.size(sample_X)):
        num_neighbors = np.size(ind[i])
        vec_east_sum = np.sum(vec_east[ind[i]])
        vec_north_sum = np.sum(vec_north[ind[i]])
        if num_neighbors > 0:
            vel_east[i] = np.divide(vec_east_sum, num_neighbors)
            vel_north[i] = np.divide(vec_north_sum, num_neighbors)
    vel_east = vel_east.reshape((360, 180))
    vel_north = vel_north.reshape((360, 180))

    # shift indices by pi/2 and pi
    vel_east = np.roll(vel_east, (180, 0), axis=(0, 1))
    vel_east = np.fliplr(vel_east)
    vel_north = np.roll(vel_north, (180, 0), axis=(0, 1))
    vel_north = np.fliplr(vel_north)

    ###################################
    ## plot maps
    ###################################
    ytic_vals = np.array(list(np.linspace(0, 180, 6) - 90))
    xtic_vals = np.array(list(np.linspace(-180, 180, 10)))
    plt.figure(figsize=(8, 3 * 3), dpi=300)

    plt.subplot(311)
    plt.imshow(density_est.T)
    cbar = plt.colorbar()
    cbar.ax.tick_params(labelsize=6)
    plt.clim(0, 1)
    plt.yticks(np.linspace(0, 180, 6), ytic_vals.astype(int), fontsize=7)
    plt.xticks(np.linspace(0, 360, 10), xtic_vals.astype(int), fontsize=7)
    plt.xlabel("Longitude", fontsize=8)
    plt.ylabel("Latitude", fontsize=8)
    plt.title("Local Density", fontsize=8)

    plt.subplot(312)
    plt.imshow(vel_north.T)
    cbar = plt.colorbar()
    cbar.ax.tick_params(labelsize=6)
    plt.clim(-v_max, v_max)
    plt.yticks(np.linspace(0, 180, 6), ytic_vals.astype(int), fontsize=7)
    plt.xticks(np.linspace(0, 360, 10), xtic_vals.astype(int), fontsize=7)
    plt.xlabel("Longitude", fontsize=8)
    plt.ylabel("Latitude", fontsize=8)
    plt.title("Velocity (North)", fontsize=8)

    plt.subplot(313)
    plt.imshow(vel_east.T)
    cbar = plt.colorbar()
    cbar.ax.tick_params(labelsize=6)
    plt.clim(-v_max, v_max)
    plt.yticks(np.linspace(0, 180, 6), ytic_vals.astype(int), fontsize=7)
    plt.xticks(np.linspace(0, 360, 10), xtic_vals.astype(int), fontsize=7)
    plt.xlabel("Longitude", fontsize=8)
    plt.ylabel("Latitude", fontsize=8)
    plt.title("Velocity (East)", fontsize=8)

    plt.subplots_adjust(wspace=0.2, hspace=0.4)
    plt.savefig(map_folder + os.sep + repr(tp).zfill(3) + ".png")
    plt.close()

    ###################################
    ## plot agents
    ###################################
    plot_quiver_flag = False
    R = sph_rad
    # Indexing the sorted arrays with the inverse permutation restores the
    # original agent order.
    theta_val = theta_sorted[
        theta_idx.argsort()] + np.pi / 2  # latitude in [0, pi]
    phi_val = phi_sorted[phi_idx.argsort()] + np.pi  # longitude in [0, 2*pi]
    X_coord = R * np.sin(theta_val) * np.cos(phi_val)
    Y_coord = R * np.sin(theta_val) * np.sin(phi_val)
    Z_coord = R * np.cos(theta_val)

    # sphere parameterization
    u, v = np.mgrid[0:2 * np.pi:36j, 0:np.pi:18j]
    xs = R * np.cos(u) * np.sin(v)
    ys = R * np.sin(u) * np.sin(v)
    zs = R * np.cos(v)

    fig = plt.figure(figsize=(3, 3), dpi=300)
    ax = fig.add_subplot(1, 1, 1, projection='3d')

    # plot empty plot, with points (without a line)
    points, = ax.plot([], [], [],
                      'ro',
                      markersize=0.3,
                      alpha=1.0,
                      fillstyle="full",
                      markerfacecolor="red",
                      markeredgecolor='red',
                      zorder=10)
    quivers = ax.quiver([], [], [], [], [], [],
                        color='lightblue',
                        linewidth=0.8,
                        normalize=False,
                        zorder=5)

    # set initial viewing angles
    azimuth, elev = 75, 21
    ax.set_xlim([-R, R])
    ax.set_ylim([-R, R])
    ax.set_zlim([-R, R])
    ax.view_init(elev, azimuth)
    plot_idx = plot_visible(azimuth, elev, points, X_coord, Y_coord, Z_coord,
                            quivers, drn[:, 0], drn[:, 1], drn[:, 2], vel,
                            plot_quiver_flag)
    if plot_quiver_flag:
        rndr = plt.gcf().canvas.get_renderer()
        quivers.draw(rndr)
    fig.canvas.draw_idle()
    ax.plot_surface(xs,
                    ys,
                    zs,
                    linewidth=0.1,
                    zorder=0,
                    edgecolor='gray',
                    color='white',
                    shade=False)
    plt.axis("off")
    plt.savefig(abm_folder + os.sep + repr(tp).zfill(3) + ".png")
    plt.close()

    ###################################
    ## plot overlays on sphere
    ###################################
    lats_vals = lats + np.pi / 2
    lons_vals = lons + np.pi
    x = R * np.sin(lats_vals) * np.cos(lons_vals)
    y = R * np.sin(lats_vals) * np.sin(lons_vals)
    z = R * np.cos(lats_vals)
    fig = plt.figure(figsize=(4 * 3, 3), dpi=300)

    # plot density
    ax1 = fig.add_subplot(1, 3, 1, projection='3d')
    points, = ax1.plot([], [], [],
                       'ro',
                       markersize=1.5,
                       alpha=0.5,
                       fillstyle="full",
                       markerfacecolor="red",
                       markeredgecolor='none',
                       zorder=10)
    azimuth, elev = 75, 21
    ax1.set_xlim([-R, R])
    ax1.set_ylim([-R, R])
    ax1.set_zlim([-R, R])
    ax1.view_init(elev, azimuth)
    ls = LightSource(75, 0)
    rho_colors = ls.shade(density_est,
                          cmap=cm.viridis,
                          blend_mode='soft',
                          vert_exag=1)
    rho_plt = ax1.plot_surface(x,
                               y,
                               z,
                               rstride=1,
                               cstride=1,
                               linewidth=0,
                               edgecolor='white',
                               facecolors=rho_colors,
                               antialiased=False,
                               shade=True)
    cbar1 = fig.colorbar(rho_plt, ax=ax1, shrink=0.75)
    cbar1.ax.tick_params(labelsize=7)
    cbar1.mappable.set_clim(0, 1)
    plt.title("Local Density", fontsize=8)
    plt.axis("off")

    # plot velocity (north)
    ax2 = fig.add_subplot(1, 3, 2, projection='3d')
    points, = ax2.plot([], [], [],
                       'ro',
                       markersize=1.5,
                       alpha=0.5,
                       fillstyle="full",
                       markerfacecolor="red",
                       markeredgecolor='none',
                       zorder=10)
    azimuth, elev = 75, 21
    ax2.set_xlim([-R, R])
    ax2.set_ylim([-R, R])
    ax2.set_zlim([-R, R])
    ax2.view_init(elev, azimuth)
    ls = LightSource(75, 0)
    vel_north_colors = ls.shade(vel_north,
                                cmap=cm.viridis,
                                blend_mode='soft',
                                vert_exag=1)
    vel_north_plt = ax2.plot_surface(x,
                                     y,
                                     z,
                                     rstride=1,
                                     cstride=1,
                                     linewidth=0,
                                     edgecolor='white',
                                     facecolors=vel_north_colors,
                                     antialiased=False,
                                     shade=True)
    cbar2 = fig.colorbar(vel_north_plt, ax=ax2, shrink=0.75)
    cbar2.ax.tick_params(labelsize=7)
    cbar2.mappable.set_clim(-v_max, v_max)
    plt.title("Velocity (North)", fontsize=8)
    plt.axis("off")

    # plot velocity (east)
    ax3 = fig.add_subplot(1, 3, 3, projection='3d')
    points, = ax3.plot([], [], [],
                       'ro',
                       markersize=1.5,
                       alpha=0.5,
                       fillstyle="full",
                       markerfacecolor="red",
                       markeredgecolor='none',
                       zorder=10)
    azimuth, elev = 75, 21
    ax3.set_xlim([-R, R])
    ax3.set_ylim([-R, R])
    ax3.set_zlim([-R, R])
    ax3.view_init(elev, azimuth)
    ls = LightSource(75, 0)
    vel_east_colors = ls.shade(vel_east,
                               cmap=cm.viridis,
                               blend_mode='soft',
                               vert_exag=1)
    vel_east_plt = ax3.plot_surface(x,
                                    y,
                                    z,
                                    rstride=1,
                                    cstride=1,
                                    linewidth=0,
                                    edgecolor='white',
                                    facecolors=vel_east_colors,
                                    antialiased=False,
                                    shade=True)
    cbar3 = fig.colorbar(vel_east_plt, ax=ax3, shrink=0.75)
    cbar3.ax.tick_params(labelsize=7)
    cbar3.mappable.set_clim(-v_max, v_max)
    plt.title("Velocity (East)", fontsize=8)
    plt.axis("off")

    # save figure
    plt.savefig(overlay_folder + os.sep + repr(tp).zfill(3) + ".png")
    plt.close()

    ###################################
    ## save results
    ###################################
    # One row per mesh point: lat, lon, density, north velocity, east velocity.
    feat_vec = np.vstack([
        lats.ravel(),
        lons.ravel(),
        density_est.ravel(),
        vel_north.ravel(),
        vel_east.ravel()
    ]).T
    output_fname = npy_folder + os.sep + repr(tp).zfill(3) + ".npy"
    savedict = {'feat_vec': feat_vec, 'theta': sample_theta, 'phi': sample_phi}
    np.save(output_fname, savedict)
    output_fname = mat_folder + os.sep + repr(tp).zfill(3) + ".mat"
    sio.savemat(output_fname, savedict)
# Imports come before the first use of the imported names; KernelDensity
# was previously imported only after the loop that calls it.
from collections import Counter
from sklearn.neighbors import KernelDensity

# Evaluation grid for the density curves, as a column vector.
X_plot = np.linspace(-5, 20, 1000)[:, np.newaxis]

fig, ax = plt.subplots()
n, bins, patches = ax.hist(X, density=1)
c = Counter(x)
y = c.values()
#ax.plot(bins, y, '--')

colors = ['navy', 'cornflowerblue', 'darkorange']
kernels = ['gaussian', 'epanechnikov']
lw = 2

# zip stops at the shorter list, so only the first two colors are used.
for color, kernel in zip(colors, kernels):
    kde = KernelDensity(kernel=kernel, bandwidth=0.5).fit(X)
    log_dens = kde.score_samples(X_plot)
    ax.plot(X_plot[:, 0],
            np.exp(log_dens),
            color=color,
            lw=lw,
            linestyle='-',
            label="kernel = '{0}'".format(kernel))

ax.legend(loc='upper left')
# Mark each sample just below the x-axis, with random vertical jitter.
ax.plot(X[:, 0], -0.005 - 0.01 * np.random.random(X.shape[0]), '+k')
ax.set_xlim(2, 20)
ax.set_ylim(-0.02, 1)
plt.show()
def ShowSingleComponentVariation(self, X, listOfComponents=[0, 1]):
    """Show how single model components vary, one figure per component.

    Each figure contains, top to bottom: the Gaussian-KDE distribution of
    the component's values in ``self.dataRepresentation`` with red dots at
    the displayed percentiles; one reconstruction per percentile (median
    representation with only this component replaced); and, per percentile,
    the 4 rows of ``X`` whose representation is closest to it on this
    component.

    Args:
        X: Data forwarded to ``self.RepresentUsingModel``; its rows are also
            drawn as the "Close Neighbor" examples.
        listOfComponents: Component indices to display; each must lie in
            ``range(self.numBasisFunctions)``.  The default list is never
            mutated here.
    """
    #matplotlib.rcParams['font.size'] = 14
    # Data is drawn as 1-D traces when the first dimension of objectPixels is 1.
    showAsTraces = (np.shape(self.objectPixels)[0] == 1)
    assert (all([(x in range(self.numBasisFunctions))
                 for x in listOfComponents]))

    X_rep = self.RepresentUsingModel(X)

    percentilesToShow = [1, 10, 30, 70, 90, 99]
    numReadDataSamplePerPercentile = 4
    representationPercentiles = []
    for percentile in percentilesToShow:
        representationPercentiles.append(
            np.percentile(self.dataRepresentation, percentile, axis=0))
    medianRepVec = np.percentile(self.dataRepresentation, 50, axis=0)

    for eigVecInd in listOfComponents:
        plt.figure()
        # Row 0: distribution, row 1: reconstructions, remaining rows: neighbors.
        gs = gridspec.GridSpec(numReadDataSamplePerPercentile + 2,
                               len(percentilesToShow))

        # calculate the Gaussian smoothed distribution of values along the eigenvector direction
        sigmaOfKDE = 0.12
        pdfStart = min(self.dataRepresentation[:, eigVecInd]) - 3 * sigmaOfKDE
        pdfStop = max(self.dataRepresentation[:, eigVecInd]) + 3 * sigmaOfKDE
        xAxis = np.linspace(pdfStart, pdfStop, 200)
        PDF_Model = KernelDensity(
            kernel='gaussian', bandwidth=sigmaOfKDE).fit(
                self.dataRepresentation[:, eigVecInd].reshape(-1, 1))
        logPDF = PDF_Model.score_samples(xAxis.reshape(-1, 1))
        percentileValuesToShow = [
            representationPercentiles[x][eigVecInd]
            for x in range(len(representationPercentiles))
        ]
        percentilesToShowLogPDF = PDF_Model.score_samples(
            np.array(percentileValuesToShow).reshape(-1, 1))

        # show distribution of current component and red dots at the list of percentiles to show
        plt.subplot(gs[0, :])
        plt.fill(xAxis, np.exp(logPDF), fc='b', alpha=0.9)
        plt.scatter(percentileValuesToShow,
                    np.exp(percentilesToShowLogPDF),
                    c='r',
                    s=300)
        plt.title(
            '%.3f%s explained' %
            (100 * self.PCAModel.explained_variance_ratio_[eigVecInd], '%'))

        for plotCol, currPrecentile in enumerate(percentilesToShow):
            # Median representation with only the current component replaced.
            currPrecentileRepVec = medianRepVec.copy()
            currPrecentileRepVec[eigVecInd] = representationPercentiles[
                plotCol][eigVecInd]
            currPrecentileImage = np.zeros(np.shape(self.objectPixels))
            currPrecentileImage[
                self.objectPixels] = self.ReconstructUsingModel(
                    currPrecentileRepVec).ravel()

            # show the median image with current percentile as activation of the current component
            plt.subplot(gs[1, plotCol])
            if showAsTraces:
                plt.plot(currPrecentileImage)
                plt.title('precentile: ' + str(percentilesToShow[plotCol]) +
                          '%')
            elif np.shape(self.objectPixels)[2] == 3:
                # Three channels in the last axis: clip to [0, 1] before imshow.
                currPrecentileImage[currPrecentileImage > 1] = 1.0
                currPrecentileImage[currPrecentileImage < 0] = 0.0
                plt.imshow(currPrecentileImage)
                plt.title('precentile: ' + str(percentilesToShow[plotCol]) +
                          '%')
                plt.axis('off')
            else:
                plt.imshow(currPrecentileImage, cmap='gray')
                plt.title('precentile: ' + str(percentilesToShow[plotCol]) +
                          '%')
                plt.axis('off')

            # find the most suitable candidates in X for current percentile
            distFromPercentile = abs(
                X_rep[:, eigVecInd] -
                representationPercentiles[plotCol][eigVecInd])
            X_inds = np.argpartition(distFromPercentile,
                                     numReadDataSamplePerPercentile
                                     )[:numReadDataSamplePerPercentile]
            for k, X_ind in enumerate(X_inds):
                currNearestPrecentileImage = np.zeros(
                    np.shape(self.objectPixels))
                currNearestPrecentileImage[self.objectPixels] = X[
                    X_ind, :].ravel()

                plt.subplot(gs[2 + k, plotCol])
                if showAsTraces:
                    plt.plot(currNearestPrecentileImage)
                    plt.title('Close Neighbor')
                else:
                    plt.imshow(currNearestPrecentileImage, cmap='gray')
                    plt.title('Close Neighbor')
                    plt.axis('off')
        plt.tight_layout()
def ShowModelVariations(self, numVariations=6):
    """Show the leading model components, at most six columns per figure.

    Each column is one component and has five rows: the Gaussian-KDE
    distribution of the component's values, the component itself
    (``self.PCAModel.components_``), and the reconstructions obtained from
    the median representation with this component set to its 2nd
    percentile, left at the median, and set to its 98th percentile.

    Args:
        numVariations: Number of components to show; capped at
            ``self.numBasisFunctions``.
    """
    #matplotlib.rcParams['font.size'] = 14
    # Data is drawn as 1-D traces when the first dimension of objectPixels is 1.
    showAsTraces = (np.shape(self.objectPixels)[0] == 1)
    numVariations = min(numVariations, self.numBasisFunctions)

    numVarsPerFigure = min(6, numVariations)
    numFigures = int(np.ceil(float(numVariations) / numVarsPerFigure))

    lowRepVec = np.percentile(self.dataRepresentation, 2, axis=0)
    medianRepVec = np.percentile(self.dataRepresentation, 50, axis=0)
    highRepVec = np.percentile(self.dataRepresentation, 98, axis=0)

    for figureInd in range(numFigures):
        plt.figure()
        for plotCol in range(numVarsPerFigure):
            eigVecInd = numVarsPerFigure * figureInd + plotCol
            # The last figure may have fewer columns than numVarsPerFigure.
            if eigVecInd >= self.numBasisFunctions:
                break

            # create the low and high percentile representation activation vectors
            currLowPrecentileRepVec = medianRepVec.copy()
            currLowPrecentileRepVec[eigVecInd] = lowRepVec[eigVecInd]
            currHighPrecentileRepVec = medianRepVec.copy()
            currHighPrecentileRepVec[eigVecInd] = highRepVec[eigVecInd]

            # create blank images
            deltaImage = np.zeros(np.shape(self.objectPixels))
            medianImage = np.zeros(np.shape(self.objectPixels))
            lowPrecentileImage = np.zeros(np.shape(self.objectPixels))
            highPrecentileImage = np.zeros(np.shape(self.objectPixels))

            # fill the object pixels with the relevant data
            deltaImage[self.objectPixels] = self.PCAModel.components_[
                eigVecInd, :].ravel()
            lowPrecentileImage[
                self.objectPixels] = self.ReconstructUsingModel(
                    currLowPrecentileRepVec).ravel()
            medianImage[self.objectPixels] = self.ReconstructUsingModel(
                medianRepVec).ravel()
            highPrecentileImage[
                self.objectPixels] = self.ReconstructUsingModel(
                    currHighPrecentileRepVec).ravel()

            # calculate the Gaussian smoothed distribution of values along the eigenvector direction
            sigmaOfKDE = 0.12
            pdfStart = min(
                self.dataRepresentation[:, eigVecInd]) - 3 * sigmaOfKDE
            pdfStop = max(
                self.dataRepresentation[:, eigVecInd]) + 3 * sigmaOfKDE
            xAxis = np.linspace(pdfStart, pdfStop, 200)
            PDF_Model = KernelDensity(
                kernel='gaussian', bandwidth=sigmaOfKDE).fit(
                    self.dataRepresentation[:, eigVecInd].reshape(-1, 1))
            logPDF = PDF_Model.score_samples(xAxis.reshape(-1, 1))

            # show distribution of current component
            plt.subplot(5, numVarsPerFigure,
                        0 * numVarsPerFigure + plotCol + 1)
            plt.fill(xAxis, np.exp(logPDF), fc='b', alpha=0.9)
            plt.title(
                '%.3f%s explained' %
                (100 * self.PCAModel.explained_variance_ratio_[eigVecInd],
                 '%'))

            # show variance direction (eigenvector)
            plt.subplot(5, numVarsPerFigure,
                        1 * numVarsPerFigure + plotCol + 1)
            if showAsTraces:
                plt.plot(deltaImage)
                plt.title('eigenvector ' + str(eigVecInd))
            elif np.shape(self.objectPixels)[2] == 3:
                # Rescale to std 0.1 around 0.5, then clip to [0, 1] for imshow.
                deltaImage = 0.1 / deltaImage.std() * deltaImage + 0.5
                deltaImage[deltaImage > 1] = 1.0
                deltaImage[deltaImage < 0] = 0.0
                plt.imshow(deltaImage)
                plt.title('eigenvector ' + str(eigVecInd))
                plt.axis('off')
            else:
                plt.imshow(deltaImage)
                plt.title('eigenvector ' + str(eigVecInd))
                plt.axis('off')

            # show 2nd percentile image
            plt.subplot(5, numVarsPerFigure,
                        2 * numVarsPerFigure + plotCol + 1)
            if showAsTraces:
                plt.plot(lowPrecentileImage)
                plt.title('2nd precentile')
            elif np.shape(self.objectPixels)[2] == 3:
                lowPrecentileImage[lowPrecentileImage > 1] = 1.0
                lowPrecentileImage[lowPrecentileImage < 0] = 0.0
                plt.imshow(lowPrecentileImage)
                plt.title('2nd precentile')
                plt.axis('off')
            else:
                plt.imshow(lowPrecentileImage, cmap='gray')
                plt.title('2nd precentile')
                plt.axis('off')

            # show median image
            plt.subplot(5, numVarsPerFigure,
                        3 * numVarsPerFigure + plotCol + 1)
            if showAsTraces:
                plt.plot(medianImage)
                plt.title('median')
            else:
                plt.imshow(medianImage, cmap='gray')
                plt.title('median')
                plt.axis('off')

            # show 98th percentile image
            plt.subplot(5, numVarsPerFigure,
                        4 * numVarsPerFigure + plotCol + 1)
            if showAsTraces:
                plt.plot(highPrecentileImage)
                plt.title('98th precentile')
            elif np.shape(self.objectPixels)[2] == 3:
                highPrecentileImage[highPrecentileImage > 1] = 1.0
                highPrecentileImage[highPrecentileImage < 0] = 0.0
                plt.imshow(highPrecentileImage)
                plt.title('98th precentile')
                plt.axis('off')
            else:
                plt.imshow(highPrecentileImage, cmap='gray')
                plt.title('98th precentile')
                plt.axis('off')
        plt.tight_layout()
def feature_distribution_plot_mult_modes(feats,
                                         feat_names,
                                         grp_col,
                                         grp_colors,
                                         grp_modes,
                                         ncol,
                                         plot_ranges=None,
                                         dens_num=100):
    """Create a lattice of per-feature plots where each group has its own plot type.

    Args:
        feats: A dataframe containing the features (on the columns) and a
            grouping variable.
        feat_names: A list of strings of features to plot, matching the
            column names in feats.
        grp_col: The name (string) of the column in feats used for grouping.
        grp_colors: A dict containing the colors (rgb strings) for each group
            in the grouping column. (E.g. {'group1':'rgb(228,26,28)'}).
        grp_modes: A dict containing the plotting mode for each group.
            Each mode is one of:
            'dens'    - A Gaussian kernel density curve (bandwidth 0.2)
            'markers' - Scatter plot with all points at y = 1.0
            anything else (e.g. 'hist') - A histogram
        ncol: The number of columns (int) in the lattice plot.
        plot_ranges: Optional mapping from feature name to a (min, max)
            pair. When given it fixes both the x-axis range and the span of
            the density evaluation grid; otherwise the data min/max is used.
        dens_num: Number of points at which the density curves are evaluated.

    Returns:
        A plotly figure, as assembled by ``subplot_helper_fig``.
    """
    grps = np.unique(feats[grp_col])
    figs = []
    for cnt, f in enumerate(feat_names):
        # Only the first panel shows legend entries.
        show_leg = cnt == 0

        # `is not None` instead of `!= None`, so array-like ranges are not
        # compared element-wise.
        if plot_ranges is not None:
            layout = go.Layout(title=f,
                               titlefont=dict(size=10),
                               autosize=False,
                               xaxis=dict(range=plot_ranges[f]))
            f_min = plot_ranges[f][0]
            f_max = plot_ranges[f][1]
        else:
            layout = go.Layout(title=f,
                               titlefont=dict(size=10),
                               autosize=False)
            f_min = min(feats.loc[:, f])
            f_max = max(feats.loc[:, f])
        xpts = np.linspace(f_min, f_max, dens_num)

        traces = []
        for g in grps:
            data = feats.loc[feats[grp_col] == g, f]
            mode = grp_modes[g]
            if mode == 'dens':
                y = np.array(data)
                y = y[~np.isnan(y)]
                if y.size == 0:
                    # Nothing to fit; KernelDensity would raise on empty input.
                    continue
                kde = KernelDensity(kernel='gaussian',
                                    bandwidth=0.2).fit(y[:, np.newaxis])
                log_dens = kde.score_samples(xpts[:, np.newaxis])
                trace = go.Scatter(x=xpts,
                                   y=np.exp(log_dens),
                                   mode='lines',
                                   line=dict(color=grp_colors[g], width=2),
                                   name=g,
                                   showlegend=show_leg)
            elif mode == 'markers':
                trace = go.Scatter(x=list(data),
                                   y=[1.0] * len(data),
                                   mode='markers',
                                   marker=dict(color=grp_colors[g], size=5),
                                   name=g,
                                   showlegend=show_leg)
            else:
                # Plain dict for the marker, matching the Scatter traces above.
                trace = go.Histogram(x=list(data),
                                     marker=dict(color=grp_colors[g]),
                                     name=g,
                                     showlegend=show_leg)
            traces.append(trace)
        figs.append(go.Figure(data=traces, layout=layout))

    nrow = int(np.ceil(float(len(feat_names)) / float(ncol)))
    return subplot_helper_fig(nrow, ncol, figs)
def feature_distribution_plot(feats,
                              feat_names,
                              grp_col,
                              grp_colors,
                              ncol,
                              plot_ranges=None,
                              dens_est=False,
                              dens_num=100,
                              title_font=12,
                              all_show_leg=True,
                              lattice=True,
                              titles=None,
                              highlight_samps=[],
                              hl_clr='black'):
    """Create a lattice of histogram or density plots for grouped features.

    Args:
        feats: A dataframe containing the features (on the columns) and a
            grouping variable.
        feat_names: A list of strings of features to plot, matching the
            column names in feats.
        grp_col: The name (string) of the column in feats used for grouping.
        grp_colors: A dict containing the colors (rgb strings) for each group
            in the grouping column. (E.g. {'group1':'rgb(228,26,28)'}).
        ncol: The number of columns (int) in the lattice plot.
        plot_ranges: Optional mapping from feature name to a (min, max)
            pair. When given it fixes the x-axis range and the density grid;
            otherwise the data range padded by 30% is used for the grid.
        dens_est: If truthy, draw Gaussian kernel density curves
            (bandwidth 0.2) instead of histograms.
        dens_num: Number of points at which the density curves are evaluated.
        title_font: Title font size applied to every figure.
        all_show_leg: If falsy, no panel shows legend entries; otherwise only
            the first panel does.
        lattice: If truthy, return the combined lattice figure; otherwise
            return the list of individual figures.
        titles: Optional mapping from feature name to panel title; feature
            names are used when omitted.
        highlight_samps: Row labels of feats to mark with a vertical line
            from y = 0 to y = 1 in every panel. Never mutated here.
        hl_clr: Color of the highlight lines.

    Returns:
        A plotly figure (``lattice`` truthy) or a list of plotly figures.
    """
    grps = np.unique(feats[grp_col])
    figs = []
    for cnt, f in enumerate(feat_names):
        # Only the first panel may show legend entries.
        show_leg = (cnt == 0) and bool(all_show_leg)
        t = f if titles is None else titles[f]

        if plot_ranges is not None:
            layout = go.Layout(title=t,
                               titlefont=dict(size=10),
                               autosize=False,
                               xaxis=dict(range=plot_ranges[f]))
            f_min = plot_ranges[f][0]
            f_max = plot_ranges[f][1]
        else:
            layout = go.Layout(title=t,
                               titlefont=dict(size=10),
                               autosize=False)
            lo = min(feats.loc[:, f])
            hi = max(feats.loc[:, f])
            # Pad the range by 30% away from the data. Scaling by 0.7 / 1.3
            # alone would shrink the range for negative bounds, so the
            # factor depends on the sign. Non-negative data yields exactly
            # the previous values.
            f_min = 0.7 * lo if lo >= 0 else 1.3 * lo
            f_max = 1.3 * hi if hi >= 0 else 0.7 * hi
        xpts = np.linspace(f_min, f_max, dens_num)

        traces = []
        for g in grps:
            data = feats.loc[feats[grp_col] == g, f]
            y = np.array(data)
            y = y[~np.isnan(y)]
            if len(y) == 0:
                # Groups without finite values contribute no trace.
                continue
            if dens_est:
                kde = KernelDensity(kernel='gaussian',
                                    bandwidth=0.2).fit(y[:, np.newaxis])
                log_dens = kde.score_samples(xpts[:, np.newaxis])
                trace = go.Scatter(x=xpts,
                                   y=np.exp(log_dens),
                                   mode='lines',
                                   line=dict(color=grp_colors[g], width=2),
                                   name=g,
                                   showlegend=show_leg)
            else:
                # Plain dict for the marker, matching the Scatter traces.
                trace = go.Histogram(x=list(data),
                                     marker=dict(color=grp_colors[g]),
                                     name=g,
                                     showlegend=show_leg)
            traces.append(trace)

        # Vertical marker line for every highlighted sample.
        for hs in highlight_samps:
            traces.append(
                go.Scatter(x=[feats.loc[hs, f]] * 2,
                           y=[0.0, 1.0],
                           line=dict(color=hl_clr, width=2),
                           mode='lines'))

        fig = go.Figure(data=traces, layout=layout)
        fig['layout'].update(titlefont=dict(size=title_font))
        figs.append(fig)

    if lattice:
        nrow = int(np.ceil(float(len(feat_names)) / float(ncol)))
        return subplot_helper_fig(nrow, ncol, figs)
    return figs
def plot_scatter(X, out_prefix, title, kde=True):
    """Draw a 2D scatter plot (png) of the core and accessory distances.

    Optionally also draws contours of a kernel density estimate.

    Args:
        X (numpy.array)
            n x 2 array of core and accessory distances for n samples.
            The array is not modified.
        out_prefix (str)
            Prefix for output plot file (.png will be appended)
        title (str)
            The title to display above the plot
        kde (bool)
            Whether to draw kernel density estimate contours

            (default = True)
    """
    # Plot results - max 1M for speed
    max_plot_samples = 1000000
    if X.shape[0] > max_plot_samples:
        X = utils.shuffle(X, random_state=random.randint(
            1, 10000))[0:max_plot_samples, ]

    # Kernel estimate uses scaled data 0-1 on each axis.
    # Out-of-place division: an in-place `X /= scale` would rescale the
    # caller's array and would fail outright for integer input.
    scale = np.amax(X, axis=0)
    scaled = X / scale

    plt.figure(figsize=(11, 8), dpi=160, facecolor='w', edgecolor='k')
    if kde:
        xx, yy, xy = get_grid(0, 1, 100)

        # KDE estimate
        estimator = KernelDensity(bandwidth=0.03, metric='euclidean',
                                  kernel='epanechnikov',
                                  algorithm='ball_tree')
        estimator.fit(scaled)
        z = np.exp(estimator.score_samples(xy))
        z = z.reshape(xx.shape).T
        levels = np.linspace(z.min(), z.max(), 10)

        # Rescale contours; the lowest level is skipped
        plt.contour(xx * scale[0], yy * scale[1], z, levels=levels[1:],
                    cmap='plasma')
        scatter_alpha = 1
    else:
        scatter_alpha = 0.1

    # Plot on correct scale
    plt.scatter(scaled[:, 0] * scale[0], scaled[:, 1] * scale[1],
                s=1, alpha=scatter_alpha)

    plt.title(title)
    plt.xlabel('Core distance (' + r'$\pi$' + ')')
    plt.ylabel('Accessory distance (' + r'$a$' + ')')
    plt.savefig(out_prefix + ".png")
    plt.close()
# Scratch script: nearest-centroid classification, a haversine KDE and
# pairwise distances between K-means cluster centres.
# NOTE(review): `X`, `np` and `NearestCentroid` must already be defined by
# code above this fragment - confirm.
y = np.array([1, 1, 1, 2, 2, 2])
clf = NearestCentroid()
clf.fit(X, y)
print(clf.predict([[-0.8, -1]]))

from sklearn.neighbors import KernelDensity

# The original call ended with a stray positional `n` after the keyword
# arguments, which is a SyntaxError; it has been dropped.
# NOTE(review): the haversine metric expects 2-column (lat, lon) data in
# radians - confirm that `X` has that form.
kde = KernelDensity(bandwidth=0.04, metric='haversine', kernel='gaussian',
                    algorithm='ball_tree')
#kde.fit(Xtrain[ytrain == i])
kde.fit(X)
kde.score_samples(X)

from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances

# From here on X and y refer to the iris data set.
X, y = load_iris(return_X_y=True)
km = KMeans(n_clusters=5, random_state=1).fit(X)
dists = euclidean_distances(km.cluster_centers_)

import numpy as np

# Upper triangle (diagonal excluded) = each pair of centres counted once.
tri_dists = dists[np.triu_indices(5, 1)]
max_dist, avg_dist, min_dist = tri_dists.max(), tri_dists.mean(
), tri_dists.min()
def get_kde(X_vals, dist, kernel='tophat', bandwidth=0.3):
    """Evaluate a kernel density estimate of log10(1 - score).

    The values in ``dist`` are transformed to ``log10(1 - score + 1e-15)``,
    a KernelDensity model is fit to the transformed values, and the density
    is returned at the points ``X_vals`` (already in the transformed space,
    shaped as KernelDensity.score_samples expects).
    """
    tiny = 1e-15  # keeps the logarithm finite when a score equals 1
    transformed = np.log10(1 - np.array(dist) + tiny)
    estimator = KernelDensity(kernel=kernel, bandwidth=bandwidth)
    estimator.fit(transformed[:, np.newaxis])
    # score_samples yields log-density; convert back to density.
    return np.exp(estimator.score_samples(X_vals))
def kl_divergence_error(y, y_hat):
    """Estimate the KL divergence between the distributions of y and y_hat.

    Both samples are smoothed with a KernelDensity model (bandwidth 0.75,
    default kernel). The two densities are then evaluated on one shared grid
    covering both samples and passed to ``entropy(p, q)``.

    The previous implementation passed *log*-densities to ``entropy`` and
    evaluated each model at its own sample points, so the two vectors were
    neither probabilities nor aligned on a common support.

    Args:
        y: Array of reference values (any shape; flattened).
        y_hat: Array of predicted values (any shape; flattened).

    Returns:
        The value of ``entropy(p, q)`` for the two gridded densities.
        NOTE(review): ``entropy`` is presumably scipy.stats.entropy, which
        normalises both inputs and returns KL(p || q) - confirm the import.
    """
    import numpy as np

    bandwidth = 0.75
    y = np.asarray(y, dtype=float).reshape(-1, 1)
    y_hat = np.asarray(y_hat, dtype=float).reshape(-1, 1)

    # Shared support: the union of both ranges, padded by three bandwidths
    # so that the kernel tails are captured.
    lo = min(y.min(), y_hat.min()) - 3 * bandwidth
    hi = max(y.max(), y_hat.max()) + 3 * bandwidth
    grid = np.linspace(lo, hi, 1000).reshape(-1, 1)

    # score_samples returns log-density; exponentiate to get densities.
    p = np.exp(KernelDensity(bandwidth=bandwidth).fit(y).score_samples(grid))
    q = np.exp(
        KernelDensity(bandwidth=bandwidth).fit(y_hat).score_samples(grid))
    return entropy(p, q)
def main(num_dim, periodic_signal, bw, step, r_fname='training.mat',
         w_fname="proba.mat"):
    """Estimate a joint density with a KDE and store it on a regular grid.

    Reads ``X_training`` and ``Y_training`` from the MATLAB file ``r_fname``,
    fits a KernelDensity model to the stacked columns, evaluates it on a
    regular grid and writes the result to ``w_fname``. Finally the density,
    summed over every axis beyond the first two, is passed to plotcolormap.

    Args:
        num_dim: Number of leading columns of ``Y_training`` to keep
            (converted with int()).
        periodic_signal: If non-zero, the data are replicated at X - 1 and
            X + 1 so the estimate behaves periodically in the first column
            (converted with int()).
        bw: KDE bandwidth (converted with float()).
        step: Number of grid samples per dimension (converted with float()).
        r_fname: Input .mat file name.
        w_fname: Output .mat file name; it receives ``Proba_XY`` plus one
            ``X_<i>`` mesh per dimension.
    """
    num_dim = int(num_dim)
    periodic_signal = int(periodic_signal)
    bw = float(bw)
    step = float(step)

    mat = scipy.io.loadmat(r_fname)
    keyto_concat = ['X_training', 'Y_training']
    to_concat = []
    for k in keyto_concat:
        print(mat[k].shape)
        to_concat.append(mat[k])
    to_concat[-1] = to_concat[-1][:, 0:num_dim]  # select coordinates of Y
    values = np.hstack(to_concat)
    print(values.shape)
    _, N = values.shape  # number of dimensions
    print('number of dimensions')
    print(N)

    # Grid bounds: data range widened by 10% of its extent on each side.
    mins = []
    maxs = []
    for i in range(N):
        min_tmp = values[:, i].min()
        max_tmp = values[:, i].max()
        delta = max_tmp - min_tmp
        mins.append(min_tmp - delta / 10.)
        maxs.append(max_tmp + delta / 10.)
    # The first dimension is always gridded over [0, 1].
    mins[0] = 0.
    maxs[0] = 1.
    print(values)

    # add the same values at X-1 and X+1 to make sure that the estimated pdf
    # is for a periodic signal
    if periodic_signal:
        Xp = values[:, 0] + 1
        Xm = values[:, 0] - 1
        Y = values[:, 1:]
        values = np.vstack([np.column_stack([Xp, Y]), values])
        values = np.vstack([np.column_stack([Xm, Y]), values])
    print(values.shape)

    kde = KernelDensity(bandwidth=bw)
    kde.fit(values)

    # generate grid
    # A complex step tells np.mgrid how many samples to take (endpoints
    # included). Slices are built directly instead of eval()-ing a formatted
    # string, which was unsafe and rounded the bounds to six decimals.
    grid_slices = tuple(
        slice(mins[i], maxs[i], complex(0, step)) for i in range(N))
    # Same description the eval()-based version used to print.
    print("np.mgrid[" + ",".join(
        "%f:%f:%fj" % (mins[i], maxs[i], step) for i in range(N)) + "]")
    meshes = np.mgrid[grid_slices]
    print(meshes.shape)
    size_grid = meshes[0].shape
    print('size_grid')
    print(size_grid)

    # One row per grid point, one column per dimension.
    Z = np.vstack([X.reshape(1, X.size) for X in meshes]).transpose()
    print(Z.shape)

    # score_samples() returns the log-likelihood of the samples
    log_pdf = kde.score_samples(Z)
    probas = np.exp(log_pdf)
    print(probas.shape)
    probas = probas.transpose().reshape(size_grid)
    print(probas.shape)

    mdict = {'Proba_XY': probas}
    for i, X in enumerate(meshes, 1):
        mdict['X_%d' % i] = X
    print(mdict.keys())
    scipy.io.savemat(w_fname, mdict)

    # Sum out every axis beyond the first two before plotting.
    proba_plot = probas
    for i in np.arange(N - 1, 1, -1):
        proba_plot = proba_plot.sum(i)
    plotcolormap(np.rot90(proba_plot),
                 extent=[mins[0], maxs[0], mins[1], maxs[1]])
def kde(data):
    """Return the log-density of every row of ``data`` under a top-hat KDE.

    The model (bandwidth 0.02) is fit to ``data`` and then evaluated at those
    same points; the values are log-densities, not densities.
    """
    estimator = KernelDensity(kernel='tophat', bandwidth=0.02)
    estimator.fit(data)
    return estimator.score_samples(data)
# Note that it is not universal for it to be first minimum. Sometimes # the second minimum is better! print("Positions of the minima: ", min_vals) print("Suggested threshold is the position of the first minimum: ", min_vals[0]) print( "Please verify with the graph. There is a chance subsequent minima may work better." ) print("Elapsed time in seconds:", round(10.0 * (end - beg)) / 10.0) # sklearn, with similar results if use_sklearn: beg2 = time.time() kernel = 'gaussian' kde2 = KernelDensity(kernel=kernel, bandwidth=10).fit(data[:, np.newaxis]) log_dens = kde2.score_samples(xvals[:, np.newaxis]) yvals2 = np.exp(log_dens).reshape(-1) min_pos2 = argrelextrema(yvals2, np.less) min_vals2 = xvals[min_pos2] end2 = time.time() print("Elapsed time for sklearn kernel estimation in seconds:", round(10.0 * (end2 - beg2)) / 10.0) print("Suggested threshold is the position of the first minimum2: ", min_vals2[0]) print("Positions of the minima2: ", min_vals2) # Plot the kernel-density estimate and highlight the minima if not options.no_plot: plt.figure(1) plt.hist(data, bins=100, density=True, label="Data histogram") plt.plot(xvals, yvals, label="KDE", c="red")
def run_benchmark(df_path, n, numScore, tol, cols, bwValue=None, bwMult=1.0, denorm=False, use_std=False):
    """Time sklearn's KernelDensity on a CSV data set and return the scores.

    Args:
        df_path: Path of the CSV file to read.
        n: Number of leading rows used for training.
        numScore: Number of leading training rows to score; None scores all.
        tol: Relative tolerance handed to KernelDensity as ``rtol``.
        cols: Column names to load; their count is the dimensionality.
        bwValue: Fixed bandwidth applied to every column. When None the
            bandwidth comes from ``bwMult * estimate_kde_bw(data, ...)``.
        bwMult: Multiplier for the estimated bandwidth.
        denorm: If true, the data are additionally scaled by 1/sqrt(2*pi),
            that value is used as the kernel bandwidth, and the scores are
            not divided by the bandwidth product afterwards.
        use_std: Forwarded to estimate_kde_bw.

    Returns:
        Array of densities for the scored rows with the value of
        get_self_density(dim, n_rows) subtracted, divided by prod(bw)
        unless ``denorm`` is set.
    """
    params = {
        "algorithm": "sklearn",
        "dataset": df_path,
        "dim": len(cols),
        "num_train": n,
        "num_test": numScore,
        "train_time": None,
        "test_time": None,
        # "num_kernels": None
    }
    print(params)
    data = pd.read_csv(df_path)[cols].iloc[:n].values

    # Bandwidth selection is deliberately included in the training time.
    trainstart = time.time()
    if bwValue is None:
        bw = bwMult * estimate_kde_bw(data, use_std=use_std)
        print("BW: {}".format(bw))
    else:
        bw = bwValue * np.ones(len(cols))
        print("BW: {}".format(bwValue))

    if numScore is None:
        numScore = len(data)
        # Keep the reported parameters in sync with what is actually scored;
        # previously the final report still showed num_test=None.
        params["num_test"] = numScore

    internal_bw = 1
    if denorm:
        internal_bw = 1.0 / (math.sqrt(2 * math.pi))
    # Dividing by the per-column bandwidth lets a single scalar bandwidth
    # be used inside sklearn.
    scaled_data = (data / bw) * internal_bw

    # Normalized Computations
    kde = KernelDensity(
        bandwidth=internal_bw,
        kernel='gaussian',
        algorithm='kd_tree',
        rtol=tol,
    )
    kde.fit(scaled_data)
    train_time = time.time() - trainstart
    params["train_time"] = 1000 * train_time  # milliseconds
    print("Trained in {}".format(train_time), flush=True)

    scorestart = time.time()
    scores = np.exp(kde.score_samples(scaled_data[:numScore]))
    score_time = time.time() - scorestart
    params["test_time"] = 1000 * score_time  # milliseconds
    print("Scored in {}".format(score_time), flush=True)
    print("Rate: {}".format(numScore / score_time))

    # Scored rows are also training rows; subtract the self term supplied
    # by get_self_density.
    self_density = get_self_density(data.shape[1], data.shape[0])
    scores_minus_self = scores - self_density

    # scale scores back
    if denorm:
        final_scores = scores_minus_self
    else:
        final_scores = scores_minus_self / np.prod(bw)

    q = np.percentile(final_scores, 1.0)
    print("Quantile: {}".format(q))
    print("Final Output:")
    print(params)
    return final_scores
# Overlay a kernel-density heat map of traps where WnvPresent > 0 on a
# background map image and save it as heatmap.png.

# Give the Reds colormap a variable alpha channel: the expression below
# runs from 0 (first entry) up to 0.9 (last entry), so low densities are
# transparent.
# NOTE(review): this relies on matplotlib's private _init()/_lut and edits
# the shared plt.cm.Reds object; the [:-3] slice presumably skips the three
# special under/over/bad rows - confirm against the matplotlib version used.
alpha_cm = plt.cm.Reds
alpha_cm._init()
alpha_cm._lut[:-3,-1] = abs(np.logspace(0, 1, alpha_cm.N) / 10 - 1)[::-1]

# Height/width ratio of the background image, forced to float division.
aspect = mapdata.shape[0] * 1.0 / mapdata.shape[1]
# Extent handed to imshow: (left, right, bottom, top).
lon_lat_box = (-88, -87.5, 41.6, 42.1)

# Keep rows with WnvPresent > 0, one row per (Date, Trap, Longitude, Latitude).
sigthings = traps[traps['WnvPresent'] > 0]
sigthings = sigthings.groupby(['Date', 'Trap','Longitude', 'Latitude']).max()['WnvPresent'].reset_index()
X = sigthings[['Longitude', 'Latitude']].values

# Fit the KDE on (longitude, latitude) pairs.
kd = KernelDensity(bandwidth=0.02)
kd.fit(X)

# Evaluate the density on a 100 x 100 grid spanning the same box as
# lon_lat_box (the bounds are repeated here as literals).
xv,yv = np.meshgrid(np.linspace(-88, -87.5, 100), np.linspace(41.6, 42.1, 100))
gridpoints = np.array([xv.ravel(),yv.ravel()]).T
# score_samples returns log-density; exponentiate to get density.
zv = np.exp(kd.score_samples(gridpoints).reshape(100,100))

plt.figure(figsize=(10,14))
plt.imshow(mapdata, cmap=plt.get_cmap('gray'), extent=lon_lat_box, aspect=aspect)
# origin='lower' so that grid row 0 (lowest latitude) is drawn at the bottom.
plt.imshow(zv, origin='lower', cmap=alpha_cm, extent=lon_lat_box, aspect=aspect)

# Mark every trap location, not only those with WnvPresent > 0.
locations = traps[['Longitude', 'Latitude']].values
plt.scatter(locations[:,0], locations[:,1], marker='x')
plt.savefig('heatmap.png')
# Summarise the samples of mu_1 / mu_2 and compare their histograms with
# Gaussian kernel density estimates.
print(samples.median())
print(samples.std())

plt.hist(samples['mu_1'], bins=15)
plt.hist(samples['mu_2'], bins=15)
plt.grid()
plt.show()

mu1_list = matrix[:, 0]
mu2_list = matrix[:, 1]

# One Gaussian KDE per parameter; sklearn expects a 2-D (n, 1) input.
kde1 = KernelDensity(kernel='gaussian',
                     bandwidth=0.5).fit(mu1_list.reshape(-1, 1))
kde2 = KernelDensity(kernel='gaussian',
                     bandwidth=0.5).fit(mu2_list.reshape(-1, 1))

# Evaluate both densities on [0, 10] with as many points as there are samples.
x_plot = np.linspace(0, 10, len(mu1_list)).reshape(-1, 1)
# score_samples returns log-density; exponentiate to get density.
prob1 = np.exp(kde1.score_samples(x_plot))
prob2 = np.exp(kde2.score_samples(x_plot))

plt.plot(x_plot, prob1)
plt.plot(x_plot, prob2)
plt.legend(['mu1', 'mu2'])
plt.ylabel('density')
plt.title('Kernel Density')
plt.grid()
plt.show()

# Pointwise maximum of the two density curves.
mixture = [max(p1, p2) for p1, p2 in zip(prob1, prob2)]

plt.plot(x_plot, mixture)
# `density=True` replaces the `normed=True` keyword, which current
# matplotlib no longer accepts; it matches the other hist calls in this file.
plt.hist(samples['mu_1'], density=True, bins='fd')
plt.hist(samples['mu_2'], density=True, bins='fd')
def plot(X:'array', threshold_freq:float = 0.0001, nbins:int = None, figsize:tuple = (15,8), supply:bool = False)->tuple:
    """
    Plot histogram with densities estimated by KDE using different kernels.
    X -- array of data with shape (n, 1): the code indexes X[:, 0] and passes
         X straight to KernelDensity.fit, which needs a 2D input.
    threshold_freq -- frequency (probability) limit to discard or not a spike
         as a local maximum (default 0.0001).
    nbins -- number of histogram bin edges (default None). When omitted it is
         estimated as range / (3.49 * sigma * n**(-1/3)), i.e. Scott's rule.
    figsize -- figure size (default (15, 8)).
    supply -- return or not ax object. (default False).
    return -- ((x/y values of histogram), (x/y values of density for gaussian
         kernel), list of values where there are local maximums)
         NOTE - if supply = True, the axis object ax is returned as a fourth
         element and the figure is not shown.
    """
    lo = truncate(np.min(X))
    hi = truncate(np.max(X))

    # estimate x limits: pad by 5% of the data range on both sides.
    # (The padding used to be applied with a sign-dependent direction, which
    # clipped the lower edge for non-negative minima and the upper edge for
    # negative maxima.)
    add = truncate(np.ptp(X)) * 0.5 / 10.
    xmin = lo - add
    xmax = hi + add

    # estimate local maximum
    steps_avg = local_maximums_kde_gaussian(X.copy(), threshold_freq)

    # number of bins
    if nbins is None:
        R = truncate(np.ptp(X))
        n = len(X)
        sigma = np.nanstd(X)
        # Scott's rule: bin width h = 3.49 * sigma * n**(-1/3), so the count
        # is R / h. The previous expression multiplied by sigma instead of
        # dividing by it (operator-precedence slip).
        # NOTE(review): sigma == 0 (constant data) is not handled.
        nbins = truncate((R * (n ** (1 / 3))) / (3.49 * sigma))
    bins = np.linspace(lo, hi, nbins)

    # x for plot
    X_plot = np.linspace(lo, hi, 1000)[:, np.newaxis]

    # create fig/axes
    fig, ax = plt.subplots(figsize = figsize)

    # plot the input data distribution
    h_y, h_x, _ = ax.hist(X, density = True, bins = bins, color = 'grey',
                          label = 'input distribution', alpha = 0.2)

    # settings
    colors = ['cornflowerblue', 'darkorange', 'navy']
    kernels = ['tophat', 'epanechnikov', 'gaussian']
    lw = 2

    # calculate kde and plot
    for color, kernel in zip(colors, kernels):
        kde = KernelDensity(kernel=kernel, bandwidth=0.5).fit(X)
        log_dens = kde.score_samples(X_plot)
        ax.plot(X_plot[:, 0], np.exp(log_dens), color=color, lw=lw,
                linestyle='-', label="kernel = '{0}'".format(kernel))
        # store gaussian results
        if kernel == 'gaussian':
            l_y = np.exp(log_dens)
            l_x = X_plot[:, 0]

    # set legend
    ax.legend(loc='upper left')

    # plot points on the bottom, with a random vertical jitter below zero
    if len(X) < 10000:
        ax.plot(X[:, 0], -0.005 - 0.01 * np.random.random(X.shape[0]), '+k')

    # plot estimated local maximums
    for avg in steps_avg:
        ax.axvline(avg, color='k', linestyle='--')

    # set chart limits
    ax.set_xlim(xmin, xmax)
    ax.set_ylim(-0.02, np.max(h_y) + 0.05)
    # set title
    ax.set_title("%s points / %s bins"%(len(X), nbins))
    # set labels
    ax.set_ylabel("freq")

    # display / return
    if supply:
        return ((h_x, h_y), (l_x, l_y), steps_avg, ax)
    plt.show()
    return ((h_x, h_y), (l_x, l_y), steps_avg)
]))
# The statement closed above begins outside this fragment.
# NOTE(review): some column labels contain a space ('x_ 1', 'x_ 10') while
# others do not ('x_11', 'x_12'); reindex presumably fills labels that are
# absent from train_in with NaN instead of raising - confirm the labels.
df2 = train_in.reindex(index=range(0, 5000), columns=list([ 'x_ 1', 'x_ 2', 'x_ 3', 'x_ 4', 'x_ 5', 'x_ 6', 'x_ 7', 'x_ 8', 'x_ 9', 'x_11', 'x_12' ]))
# Three-column subsets over the first 1000 and the first 5000 rows.
df5 = train_in.reindex(index=range(0, 1000), columns=list(['x_ 10', 'x_ 13', 'x_ 14']))
df6 = train_in.reindex(index=range(0, 5000), columns=list(['x_ 10', 'x_ 13', 'x_ 14']))
# All columns: rows 0-2499 are used to fit, rows 0-4999 are scored.
df3 = train_in.reindex(index=range(0, 2500), columns=list(train_in.columns))
df4 = train_in.reindex(index=range(0, 5000), columns=list(train_in.columns))
#print(len(df1))
#print(len(df2))
# Fit a Gaussian KDE on df3 and take the log-density of every row of df4.
# df2, df5 and df6 are not used in the visible part of this script.
kde = KernelDensity(kernel='gaussian', bandwidth=0.75).fit(df3)
log_dens = kde.score_samples(df4)
print(log_dens)
# 1-based identifiers for the 5000 scored rows.
k = np.arange(1, 5001)
df = pd.DataFrame({ 'Point_ID': k, 'Output': log_dens }, index=None, columns=['Point_ID', 'Output'])
df.to_csv('test_out1.csv', index=False)
def KDE(x, y):
    """Return the summed log-density of ``y`` under a Gaussian KDE fit to ``x``.

    A KernelDensity model (gaussian kernel, bandwidth 0.2) is fit to ``x``;
    the per-sample log-densities of ``y`` are added up with the builtin sum.
    """
    model = KernelDensity(kernel='gaussian', bandwidth=0.2)
    model.fit(x)
    per_sample_log_density = model.score_samples(y[:, ])
    return sum(per_sample_log_density)
def displayFamiliesDistribution(self, directory, label=None):
    """Plot the distribution of predictions for every family of a label.

    One figure per family plus one combined figure are saved as PNG files
    in ``directory``. Families whose predictions have a variance below
    1e-5 are drawn as a spike at their mean; the others are drawn as a
    Gaussian kernel density estimate (bandwidth 0.1).

    With ``label=None`` the method calls itself for
    ``labels_tools.MALICIOUS`` and returns.
    """
    if label is None:
        # NOTE(review): both calls pass MALICIOUS; the second one was
        # probably meant for the other label - confirm with the owner.
        self.displayFamiliesDistribution(directory,
                                         label=labels_tools.MALICIOUS)
        self.displayFamiliesDistribution(directory,
                                         label=labels_tools.MALICIOUS)
        return

    families = self.families[labels_tools.labelBooleanToString(label)]
    bandwidth = 0.1
    num_points = 50
    eps = 0.00001
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
    fig, (ax) = plt.subplots(1, 1)

    # Line styles cycle with a period of three families.
    linestyles = ('solid', 'dashed', 'dotted')
    for i, family in enumerate(families):
        predictions_np = np.asarray(families[family])
        linestyle = linestyles[i % 3]

        if np.var(predictions_np) < eps:
            # (Almost) constant predictions: draw a spike at the mean.
            linewidth = 4
            mean = np.mean(predictions_np)
            x = np.sort(np.append(np.arange(0, 1, 0.1),
                                  [mean, mean - eps, mean + eps]))
            density = [1 if v == mean else 0 for v in x]
        else:
            linewidth = 2
            kde.fit([[p] for p in predictions_np])
            # Computes the x axis
            delta = np.amax(predictions_np) - np.amin(predictions_np)
            x = np.arange(0, 1, 1.1 * delta / num_points)
            # kde.score_samples returns the 'log' of the density
            log_density = kde.score_samples([[v] for v in x]).tolist()
            density = list(map(math.exp, log_density))

        ax.plot(x, density, label=family, linewidth=linewidth,
                linestyle=linestyle)

        # Separate figure for this family alone.
        fig_f, (ax_f) = plt.subplots(1, 1)
        ax_f.plot(x, density, linewidth=4,
                  color=colors_tools.getLabelColor(label))
        ax_f.set_title(family)
        ax_f.set_xlabel('P(Malicious)')
        ax_f.set_ylabel('Density')
        filename = label + '_family_' + family + '_prediction_distributions.png'
        fig_f.savefig(path.join(directory, filename))
        plt.close(fig_f)

    ax.legend(bbox_to_anchor=(0., 0.95, 1., .102), loc=3, ncol=5,
              mode='expand', borderaxespad=0., fontsize='xx-small')
    ax.set_xlabel('P(Malicious)')
    ax.set_ylabel('Density')
    filename = label + '_families_prediction_distributions.png'
    fig.savefig(path.join(directory, filename))
    plt.close(fig)
class gaussian_kde_wrapper(object):
    """Gaussian KDE over observed hyperparameter values.

    Exposes ``pdf`` and ``rvs`` on top of sklearn's KernelDensity. For
    hyperparameters flagged as ``log`` the density is fit in log2 space.
    """

    def __init__(self, hyperparameter, param_name, data,
                 oob_strategy='resample', bandwith=0.4):
        """Fit the density.

        Args:
            hyperparameter: Hyperparameter description; this class reads
                its ``log``, ``lower`` and ``upper`` attributes and, when
                present, ``lower_hard`` / ``upper_hard``.
            param_name: Name of the hyperparameter (used in error messages).
            data: Sequence of observed values.
            oob_strategy: How ``rvs`` treats draws outside [lower, upper]:
                'resample' draws again, 'round' clips to the violated bound,
                'ignore' returns the draw unless it violates a hard bound.
            bandwith: Kernel bandwidth (the parameter name is kept for
                backward compatibility).

        Raises:
            ValueError: If ``oob_strategy`` is unknown, or the hyperparameter
                is a log-scaled UniformIntegerHyperparameter.
        """
        if oob_strategy not in ['resample', 'round', 'ignore']:
            raise ValueError(
                "oob_strategy must be 'resample', 'round' or 'ignore', "
                "got %r" % (oob_strategy,))
        self.oob_strategy = oob_strategy
        self.param_name = param_name
        self.hyperparameter = hyperparameter

        # sklearn expects a 2-D (n_samples, 1) array.
        reshaped = np.reshape(data, (len(data), 1))
        if self.hyperparameter.log:
            if isinstance(self.hyperparameter, UniformIntegerHyperparameter):
                raise ValueError(
                    'Log Integer hyperparameter not supported: %s' %
                    param_name)
            self.distrib = KernelDensity(kernel='gaussian',
                                         bandwidth=bandwith).fit(
                                             np.log2(reshaped))
        else:
            self.distrib = KernelDensity(kernel='gaussian',
                                         bandwidth=bandwith).fit(reshaped)

    def pdf(self, x):
        """Return the estimated density at each value in ``x``.

        For log-scaled hyperparameters ``x`` is moved to log2 space first, so
        the result is the density of log2(x).
        """
        x = np.reshape(x, (len(x), 1))
        if self.hyperparameter.log:
            x = np.log2(x)
        log_dens = self.distrib.score_samples(x)
        return np.exp(log_dens)

    def rvs(self, *args, **kwargs):
        """Draw one value from the fitted density.

        Keyword Args:
            random_state: None, an int seed, or a numpy RandomState.

        Returns:
            A single value, post-processed according to ``oob_strategy``.
        """
        # assumes a samplesize of 1, for random search
        random_state = kwargs.get('random_state')
        # An int seed is turned into ONE generator up front. Passing the raw
        # int to every sample() call would reproduce the same draw forever
        # and never leave the loop once that draw is rejected. The first draw
        # is unchanged, since sklearn builds the same RandomState from an int.
        if isinstance(random_state, (int, np.integer)):
            random_state = np.random.RandomState(random_state)

        while True:
            sample = self.distrib.sample(n_samples=1,
                                         random_state=random_state)[0][0]
            # Undo the log2 transform applied when fitting.
            if self.hyperparameter.log:
                value = np.power(2, sample)
            else:
                value = sample
            if isinstance(self.hyperparameter, UniformIntegerHyperparameter):
                value = int(round(value))

            if self.hyperparameter.lower <= value <= self.hyperparameter.upper:
                return value
            elif self.oob_strategy == 'ignore':
                # TODO: hacky fail safe for some hyperparameters
                if hasattr(self.hyperparameter, 'lower_hard'
                           ) and self.hyperparameter.lower_hard > value:
                    continue
                if hasattr(self.hyperparameter, 'upper_hard'
                           ) and self.hyperparameter.upper_hard < value:
                    continue
                return value
            elif self.oob_strategy == 'round':
                if value < self.hyperparameter.lower:
                    return self.hyperparameter.lower
                elif value > self.hyperparameter.upper:
                    return self.hyperparameter.upper
            # 'resample': fall through and draw again.
def exe_kde(x_lon, y_lat, year, month, violation_code, kernel, bandwidth, metric):
    """Render a KDE heat map of point locations over the neighborhood map.

    Draws the 'mke_nbhd' shapefile polygons, overlays the kernel density of
    the given points and the points themselves, and saves the figure as a
    PNG whose name is built from the arguments.

    Args:
        x_lon: Sequence of longitudes.
        y_lat: Sequence of latitudes (same length as x_lon).
        year: Used (via str()) in the output file name.
        month: Month name; its first three lower-cased letters go into the
            file name.
        violation_code: Used (via str()) in the output file name.
        kernel: KernelDensity kernel name (case-insensitive).
        bandwidth: KernelDensity bandwidth (converted with float()).
        metric: KernelDensity metric name (case-insensitive).
    """
    # Map window; also the extent over which the density is evaluated.
    xmin, xmax, ymin, ymax = -88.080736, -87.839722, 42.917670, 43.19712

    # Build shp
    shp = get_shp()

    # Build map
    fig = plt.figure(dpi = 1000)
    try:
        ax = fig.add_subplot(111)
        ax.axis('off')
        basemap = Basemap(projection = 'cyl', resolution = 'h',
                          lat_0 = 43.0389025, lon_0 = -87.9064736,
                          llcrnrlon = xmin, llcrnrlat = ymin,
                          urcrnrlon = xmax, urcrnrlat = ymax)
        basemap.readshapefile(shp, name = 'mke_nbhd')

        # Only shapes that carry a neighborhood name are drawn.
        patches_mke_nbhd = [
            Polygon(np.array(shape), closed = True)
            for info, shape in zip(basemap.mke_nbhd_info, basemap.mke_nbhd)
            if info['NEIGHBORHD'] is not None
        ]
        ax.add_collection(PatchCollection(patches_mke_nbhd,
                                          edgecolor = '#000000',
                                          facecolor = '#bfbfbf',
                                          linewidths = 0.45, zorder = 5))

        # Build KDE
        k, m, kde_bw = kernel.lower(), metric.lower(), float(bandwidth)
        xy = np.stack([x_lon, y_lat])
        kde = KernelDensity(kernel = k, bandwidth = kde_bw, metric = m)
        kde.fit(xy.T)

        # For all intents and purposes, this is grid size; in other words,
        # this affects resolution of the density plots
        X, Y = np.mgrid[xmin:xmax:1000j, ymin:ymax:1000j]
        positions = np.vstack([X.ravel(), Y.ravel()])
        # score_samples returns log-density; exponentiate to get density.
        Z = np.reshape(np.exp(kde.score_samples(positions.T)), X.shape)

        # Build save
        cmap = colors.ListedColormap(['#ffffff', '#ffebeb', '#ffd8d8',
                                      '#ffc4c4', '#ffb1b1', '#ff9d9d',
                                      '#ff8a8a', '#ff7676', '#ff6262',
                                      '#ff4f4f', '#ff3b3b', '#ff2828',
                                      '#ff1414'])
        # rot90 turns the (x, y)-indexed grid into image orientation.
        plt.imshow(np.rot90(Z), cmap = cmap,
                   extent = [xmin, xmax, ymin, ymax], alpha = 0.5,
                   zorder = 10)
        plt.scatter(x_lon, y_lat, c = '#0000ff', s = 1.75, alpha = 0.5,
                    linewidths = 0, edgecolors = None, zorder = 15)

        s_yr, s_mo, s_vc = str(year), month.lower()[:3], str(violation_code)
        s_ke, s_bw, s_me = (kernel.lower()[:3],
                            str(kde_bw).replace('.', ''),
                            metric.lower()[:3])
        plot_path_long = path.plot_path_long()
        plt.savefig(plot_path_long + '%s%s%s%s%s%s.png' %
                    (s_yr, s_mo, s_vc, s_ke, s_bw, s_me),
                    bbox_inches = 'tight', dpi = 1000)
    finally:
        # Release the dpi-1000 figure; repeated calls otherwise accumulate
        # open figures in pyplot's registry.
        plt.close(fig)
def feature_distribution_plot_users(feats, users, feat_names, grp_col,
                                    grp_colors, dens_est=False, dens_num=50):
    """Create a lattice of feature plots for grouped features for many users.

    Each column contains the feature plots for a single user and each row
    corresponds to one feature.

    Args:
        feats: A dictionary containing the features (on the columns) and a
            grouping variable for each user.
        users: A subset of the keys of 'feats' to plot
        feat_names: A list of strings of features to plot, matching the
            column names in feats.
        grp_col: The name (string) of the column in feats used for grouping.
        grp_colors: A dict containing the colors (rgb strings) for each group
            in the grouping column. (E.g. {'group1':'rgb(228,26,28)'}).
        dens_est: A boolean indicating whether or not to show density
            estimation plot instead of histogram.
        dens_num: Number of points at which each density is evaluated.

    Returns:
        A plotly figure with len(feat_names) rows and len(users) columns.
    """
    # Groups are taken from the first user only.
    grps = np.unique(feats[users[0]][grp_col])
    figs = []
    for feat_idx, f in enumerate(feat_names):
        # Common x-axis range for this feature across users: the 2nd to
        # 98th percentile of the pooled values.
        vals = []
        for u in users:
            vals.extend(feats[u][f])
        fmin = np.nanpercentile(vals, 2)
        fmax = np.nanpercentile(vals, 98)

        for user_idx, u in enumerate(users):
            # Legend only on the very first plot, user name as title only on
            # the first row, feature name as y-title only in the first column.
            show_leg = feat_idx == 0 and user_idx == 0
            plt_title = u if feat_idx == 0 else ''
            y_title = f if user_idx == 0 else ''
            layout = go.Layout(title=plt_title,
                               titlefont=dict(size=10),
                               yaxis=dict(title=y_title,
                                          titlefont=dict(size=18),
                                          color='black'),
                               xaxis=dict(range=[fmin, fmax]),
                               autosize=False)

            # nanmin/nanmax: the builtin min/max give order-dependent results
            # when the column contains NaN, which would poison the grid.
            user_vals = np.asarray(feats[u].loc[:, f], dtype=float)
            xpts = np.linspace(np.nanmin(user_vals), np.nanmax(user_vals),
                               dens_num)

            traces = []
            for g in grps:
                data = feats[u].loc[feats[u][grp_col] == g, f]
                if dens_est:
                    y = np.array(data)
                    y = y[~np.isnan(y)]
                    if len(y) == 0:
                        # KernelDensity cannot be fit to an empty sample;
                        # skip the group, as feature_distribution_plot does.
                        continue
                    kde = KernelDensity(kernel='gaussian',
                                        bandwidth=0.75).fit(y[:, np.newaxis])
                    log_dens = kde.score_samples(xpts[:, np.newaxis])
                    trace = go.Scatter(x=xpts, y=np.exp(log_dens),
                                       mode='lines',
                                       line=dict(color=grp_colors[g],
                                                 width=2),
                                       name=g, showlegend=show_leg)
                else:
                    trace = go.Histogram(x=list(data),
                                         marker=Marker(color=grp_colors[g]),
                                         name=g, showlegend=show_leg)
                traces.append(trace)

            figs.append(go.Figure(data=traces, layout=layout))

    ncol = len(users)
    nrow = len(feat_names)
    return subplot_helper_fig(nrow, ncol, figs)
def visualize(network, title,pos):
    """
    Visualize the network on a world map with an infection density overlay.

    Nodes and edges are coloured from each node's "color" attribute, a
    kernel density estimate of the nodes whose "status" is 'i' is drawn
    underneath, and the figure is saved as ``infection-<NNN>.png`` where NNN
    is the zero-padded number of entries in the current directory.

    network -- graph whose nodes carry "color", "status", "lon" and "lat".
    title -- not used by this function (kept for interface compatibility).
    pos -- not used by this function; positions are rebuilt from lon/lat.
    """
    import copy

    print("-- Starting to Visualize --")

    colors = []          # colours of the nodes in their start state
    colori = []          # colours of the infected-type nodes
    i_edge_colors = []
    d_edge_colors = []
    default = []         # edges drawn in grey
    infected = []        # edges drawn in red
    nstart = []
    ninfect = []

    for node in network.nodes():
        colorn = network.nodes[node]["color"]
        if colorn == "#A0C8F0":
            nstart.append(node)
            colors.append(colorn)
        elif colorn == "#30cc1f" or colorn == "red" or colorn == "purple":
            ninfect.append(node)
            colori.append(colorn)

    # An edge takes its style from the colour of its first endpoint.
    for i, j in network.edges():
        color = network.nodes[i]["color"]
        if color == "#A0C8F0" or color == "#30cc1f" or color == "purple":
            default.append((i, j))
            d_edge_colors.append("#A6A6A6")
        else:
            infected.append((i, j))
            i_edge_colors.append("red")

    plt.figure(figsize=(30,20))

    ax = plt.axes(projection=ccrs.PlateCarree())
    ax.coastlines()

    #make density plot of infection
    node_positions = {node[0]: (float(node[1]['lon']), float(node[1]['lat']))
                      for node in network.nodes(data=True)}

    xp = []
    yp = []
    for node in network.nodes():
        if network.nodes[node]["status"] == 'i':
            xp.append(network.nodes[node]['lon'])
            yp.append(network.nodes[node]['lat'])

    if len(xp) >= 1:
        # Builtin float replaces the np.float alias, which no longer exists
        # in current NumPy releases.
        m1 = np.array(xp).astype(float)
        m2 = np.array(yp).astype(float)
        xmin = -180
        xmax = 180
        ymin = -90
        ymax = 90

        # get the density estimation
        Xp, Yp = np.mgrid[xmin:xmax:250j, ymin:ymax:250j]
        XpYp = np.radians(np.vstack([Xp.ravel(), Yp.ravel()]).T)
        values = np.column_stack((m1, m2))  # one (lon, lat) row per node
        kernel = KernelDensity(bandwidth=0.035)
        kernel.fit(np.radians(values))
        # score_samples returns log-density; exponentiate to get density.
        Z = np.exp(kernel.score_samples(XpYp))
        Z = Z.reshape(Xp.shape)

        # plot the result
        # Work on a copy so that the shared 'jet' colormap is not modified
        # for every other plot in the process.
        cmap = copy.copy(plt.cm.jet)
        cmap.set_under('white')
        # Densities below 10% of the maximum fall under vmin (drawn white).
        plt.imshow(np.rot90(Z),
                   norm = plt.Normalize(vmin=(Z.max()-(Z.max()*0.9)),
                                        vmax=Z.max()),
                   cmap=cmap, extent=[xmin, xmax, ymin, ymax], alpha=0.3,
                   interpolation = 'gaussian')

    # First pass - Gray lines
    nx.draw_networkx_edges(network, pos=node_positions, edgelist=default,
                           width=0.005, edge_color=d_edge_colors,
                           alpha=0.005, arrows=False)

    # Second Pass - Colored lines
    nx.draw_networkx_edges(network, pos=node_positions, edgelist=infected,
                           width=0.1, edge_color=i_edge_colors, alpha=0.25,
                           arrows=False)

    # `with_labels` is not a draw_networkx_nodes parameter (that function
    # never draws labels) and is rejected by current networkx, so it is no
    # longer passed.
    # first Pass - small nodes
    nx.draw_networkx_nodes(network, pos=node_positions, nodelist=nstart,
                           linewidths=0.2, node_size=5, node_color = colors)

    # Second Pass - large nodes
    nx.draw_networkx_nodes(network, pos=node_positions, nodelist=ninfect,
                           linewidths=0.2, node_size=20, node_color = colori)

    plt.axis('off')

    # Sequence number: count of entries in the working directory, padded to
    # at least three digits.
    number_files = str(len(os.listdir())).zfill(3)

    plt.savefig("infection-{0}.png".format(number_files),
                bbox_inches='tight', dpi=72)

    plt.show()
    plt.close()