def fit(self, X, y):
    # Sort the 1D feature so the two KDE components can be split cleanly.
    sorted_idx = X.argsort(axis=0).flatten()
    kde_values = X.copy()[sorted_idx]
    kde_labels = y.copy()[sorted_idx]
    bin_counts = np.bincount(y).astype(float)
    mixture = 0.5
    old_ratios = np.zeros(kde_labels.shape)
    iter_count = 0
    if self.bandwidth is None:
        self.bandwidth = hscott(X)
    for i in range(self.n_iters):
        controls_kde = neighbors.KernelDensity(kernel=self.kernel,
                                               bandwidth=self.bandwidth)
        patholog_kde = neighbors.KernelDensity(kernel=self.kernel,
                                               bandwidth=self.bandwidth)
        controls_kde.fit(kde_values[kde_labels == 0])
        patholog_kde.fit(kde_values[kde_labels == 1])
        controls_score = np.exp(controls_kde.score_samples(kde_values)) * mixture
        patholog_score = np.exp(patholog_kde.score_samples(kde_values)) * (1 - mixture)
        ratio = controls_score / (controls_score + patholog_score)
        if np.all(ratio == old_ratios):
            break
        iter_count += 1
        old_ratios = ratio
        # Relabel each point by its posterior; cast to int so np.bincount
        # and integer assignment below work as intended.
        kde_labels = (ratio < 0.5).astype(int)
        diff_y = np.hstack(([0], np.diff(kde_labels)))
        if np.sum(diff_y != 0) == 2 and np.unique(kde_labels).shape[0] == 2:
            # One class is sandwiched inside the other along the sorted axis;
            # reassign the offending outer run so each class stays contiguous.
            split_y = int(np.all(np.diff(np.where(kde_labels == 0)) == 1))
            sizes = [x.shape[0]
                     for x in np.split(diff_y, np.where(diff_y != 0)[0])]
            split_prior_smaller = (
                np.mean(kde_values[kde_labels == split_y]) <
                np.mean(kde_values[kde_labels == (split_y + 1) % 2]))
            if split_prior_smaller:
                replace_idxs = np.arange(kde_values.shape[0])[-sizes[2]:]
            else:
                replace_idxs = np.arange(kde_values.shape[0])[:sizes[0]]
            kde_labels[replace_idxs] = (split_y + 1) % 2
        bin_counts = np.bincount(kde_labels).astype(float)
        mixture = bin_counts[0] / bin_counts.sum()
        if mixture < 0.10 or mixture > 0.90:
            break
    self.controls_kde = controls_kde
    self.patholog_kde = patholog_kde
    self.mixture = mixture
    self.iter_ = iter_count
    return self
def GridSearchKDE(data):
    # Coarse pass: search bandwidths over six decades.
    params = {'bandwidth': np.logspace(-3, 3, 50)}
    grid = GridSearchCV(neighbors.KernelDensity(), params)
    grid.fit(data)
    print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))
    # Fine pass: search within +/-50% of the coarse optimum
    # (bandwidths must stay strictly positive).
    params = {'bandwidth':
              np.linspace(0.5, 1.5, 50) * grid.best_estimator_.bandwidth}
    grid = GridSearchCV(neighbors.KernelDensity(), params)
    grid.fit(data)
    print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))
    return grid.best_estimator_.bandwidth
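# A minimal usage sketch of the two-stage search above; the bimodal sample
# here is an illustrative assumption, not part of the original code.
import numpy as np
from sklearn import neighbors
from sklearn.model_selection import GridSearchCV

rng = np.random.RandomState(0)
data = np.concatenate([rng.normal(0, 1, 200),
                       rng.normal(5, 0.5, 200)])[:, np.newaxis]
bw = GridSearchKDE(data)  # coarse log-spaced pass, then a +/-50% refinement
kde = neighbors.KernelDensity(bandwidth=bw).fit(data)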
def fit(self, data, indices, bandwidth=None):
    fitData = data[:, indices]
    # trans() is an external helper that rescales the data and returns the scaler.
    fitData, self.scaler = trans(fitData, scale=True)
    if bandwidth is None:
        bandwidth = GridSearchKDE(fitData)
    self.kde = neighbors.KernelDensity(bandwidth=bandwidth)
    self.kde.fit(fitData)
def KernelThresh(image, intens=[0, 40000], num=4000, bandwidth=2000,
                 kernel='gaussian'):
    """Determine a threshold using Gaussian kernel density estimation.

    Suitable for bimodal distributions: a Gaussian kernel density
    estimate (KDE) locates the two modes of the pixel distribution,
    and the threshold is chosen as the midpoint between them.
    """
    _max_count, _ax, _fig = PixDistribution(image)
    kde = skneighbor.KernelDensity(kernel=kernel, bandwidth=bandwidth)
    if len(image.shape) > 1:
        kde.fit(image.flatten()[:, np.newaxis])
    else:
        kde.fit(image[:, np.newaxis])
    x_pos = np.linspace(intens[0], intens[1], num=num)[:, np.newaxis]
    dens = np.exp(kde.score_samples(x_pos))
    maxima = LocalMaxima(dens, width=100, highPeak=False)
    if len(maxima) != 2:
        print('Non-bimodal detected')
        return None
    m1, m2 = maxima
    thres = 0.5 * (x_pos[m1, 0] + x_pos[m2, 0])
    _ax.plot([thres, thres], [0, _max_count], label='Otsu')
    plt.show()
    return thres
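# A self-contained sketch of the same mode-midpoint idea, without the
# PixDistribution/LocalMaxima plotting helpers; the synthetic "image" and
# the scipy-based peak finding are assumptions for illustration.
import numpy as np
from scipy.signal import argrelextrema
from sklearn import neighbors

rng = np.random.RandomState(1)
image = np.concatenate([rng.normal(5000, 800, 5000),
                        rng.normal(25000, 2000, 5000)])
kde = neighbors.KernelDensity(kernel='gaussian', bandwidth=2000)
kde.fit(image[:, np.newaxis])
x_pos = np.linspace(0, 40000, 4000)[:, np.newaxis]
dens = np.exp(kde.score_samples(x_pos))
peaks = argrelextrema(dens, np.greater)[0]
if len(peaks) == 2:
    threshold = 0.5 * (x_pos[peaks[0], 0] + x_pos[peaks[1], 0])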
def getDistanceDensity(self, data_set):
    self.distance = []
    kde = neighbors.KernelDensity(kernel='linear', bandwidth=0.75).fit(data_set)
    for i in range(len(data_set)):
        # score_samples expects a 2D array, so wrap the single sample.
        density = kde.score_samples([data_set[i]])
        self.distance.append(density)
def GetOptimalBandwidth(self, datalabel, bandlims, numbands):
    '''Optimize the bandwidth using leave-one-out cross-validation.
    Example follows that at jakevdp.github.io/PythonDataScienceHandbook.

    Args
        datalabel: string
            string describing which datalabel in the dataframe to find
            the bandwidth for
        bandlims: array (length 2)
            limits to search for the optimal bandwidth in
        numbands: int
            number of bandwidths to try between the limits
    '''
    if bandlims[1] <= 0 or bandlims[0] <= 0:
        print("Bandwidth must be greater than zero")
        return
    bandwidths = np.linspace(bandlims[0], bandlims[1], numbands)
    data = self.df[datalabel]
    if isinstance(self.df[datalabel][0], np.ndarray):
        # Entries are themselves arrays; flatten them into one sample.
        data_arr = []
        for i in range(len(self.df[datalabel])):
            data_arr = data_arr + list(self.df[datalabel][i])
        data = np.array(data_arr)
    if len(data) > 500:
        print("This may take some time depending on your data length.")
        print("numbands > 10 with len(data) > 500 starts to take a bit")
    grid = sgs.GridSearchCV(skn.KernelDensity(kernel='gaussian'),
                            {'bandwidth': bandwidths},
                            cv=cv.LeaveOneOut(len(data)))
    grid.fit(data[:, None])
    thebandwidth = grid.best_params_['bandwidth']
    return thebandwidth
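# A self-contained restatement of the same leave-one-out bandwidth search
# with the current scikit-learn API (sklearn.model_selection rather than
# the older cross_validation module); the synthetic data is an assumption.
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.model_selection import GridSearchCV, LeaveOneOut

rng = np.random.RandomState(42)
data = rng.normal(0, 1, 100)
bandwidths = np.linspace(0.05, 1.0, 20)
grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                    {'bandwidth': bandwidths},
                    cv=LeaveOneOut())
grid.fit(data[:, None])
best_bandwidth = grid.best_params_['bandwidth']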
def __init__(self, masks):
    n_class = 10
    self.maps_with_class = [[] for _ in range(n_class)]
    self.kde_samplers = []
    self.class_probs = np.ones(n_class) / n_class
    self.mask_size = None
    ts = time.time()
    for mask_i, mask in enumerate(masks):
        assert mask.shape[2] == n_class
        if not self.mask_size:
            self.mask_size = mask.shape[1]
        samplers = []
        for class_i in range(n_class):
            # Pixel coordinates where this class is present, as an (n, 2) array.
            X = np.nonzero(mask[:, :, class_i])
            X = np.stack(X, axis=1)
            if not X.size:
                samplers.append(None)
            else:
                self.maps_with_class[class_i].append(mask_i)
                # Bandwidth scales with the mask size.
                sampler = neighbors.KernelDensity(
                    bandwidth=self.mask_size * 0.02).fit(X)
                samplers.append(sampler)
        assert len(samplers) == n_class
        self.kde_samplers.append(samplers)
    print('sampler init time: {}'.format(time.time() - ts))
def KDEFit(data, bandwidth='thumb'):
    if bandwidth == 'thumb':
        # Rule-of-thumb (Silverman-style) bandwidth for d-dimensional data,
        # shrunk by a factor of 0.5.
        bandwidth = np.power((len(data) * (data.shape[-1] + 2.0) / 4.0),
                             -1.0 / (data.shape[-1] + 4.0)) * 0.5
    kde = neighbors.KernelDensity(bandwidth=bandwidth).fit(data)
    return kde
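# A brief usage sketch of KDEFit with the rule-of-thumb default. Note the
# formula omits the per-feature standard deviation, so it implicitly assumes
# roughly unit-variance data; the standardization step here is an assumption.
import numpy as np
from sklearn import neighbors

rng = np.random.RandomState(0)
data = rng.normal(size=(500, 2))
data = (data - data.mean(axis=0)) / data.std(axis=0)  # ~unit variance
kde = KDEFit(data)  # bandwidth = 0.5 * (n*(d+2)/4)^(-1/(d+4))
log_density = kde.score_samples(data[:5])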
def plot_and_save(scores_and_labels, xlabel, output, xmin=0.0, xmax=0.6,
                  smoothness=20., font_size=12, line_width=2):
    # create and save plot
    plt.figure()
    # create kernel density estimator
    kde = neighbors.KernelDensity(kernel='gaussian', bandwidth=xmax / smoothness)
    # need to add another dimension as required by sklearn;
    # arrays passed to kde must be 2-dimensional
    X_plot = np.reshape(np.linspace(xmin, xmax, 500), (-1, 1))
    styles = ['-', '--', '-.', ':']
    for i, (xs, label) in enumerate(scores_and_labels):
        # Subsample very large score arrays to keep the KDE fit tractable.
        scores = np.ravel(xs) if len(xs) < 1e5 else np.random.choice(
            np.ravel(xs), int(1e5))
        kde.fit(np.reshape(scores, (-1, 1)))
        densities = kde.score_samples(X_plot)
        plt.plot(X_plot[:, 0], np.exp(densities), lw=line_width, label=label,
                 ls=styles[i % len(styles)])
    plt.ylabel('Density', size=font_size)
    plt.xlabel(xlabel, size=font_size)
    plt.legend(loc='best', fontsize=font_size)
    plt.tight_layout()
    plt.savefig(output)
def fit_best_kde(data, steps=25, rtol=0.1, cv=3, fit_sample_size=None):
    '''
    This function determines a best-fitting kernel density estimate using
    scikit-learn's sklearn.neighbors.KernelDensity method along with
    scikit-learn's sklearn.model_selection.GridSearchCV method. In
    particular, the GridSearchCV method is used to try all supported
    kernel types with `steps` evenly spaced bandwidths between the minimum
    and maximum differences between values in the provided data.

    Arguments:
    data: a 1-dimensional list or NumPy array that includes the data
    steps: the number of evenly spaced bandwidths to try
    rtol: the relative tolerance passed to the
        sklearn.neighbors.KernelDensity method. Higher values offer faster
        computational times at the cost of accuracy.
    cv: the number of cross-validation splits the
        sklearn.model_selection.GridSearchCV method uses to identify the
        best KDE.
    fit_sample_size: a value that, if specified, denotes that a random
        sample of size fit_sample_size should be used to fit the kernel
        density estimate. This functionality is added to reduce the high
        computational times that may occur when the provided data is large.

    Returns:
    data: a dictionary that specifies the best bandwidth and kernel.
    '''
    import sklearn.neighbors as skneighbor
    from sklearn.model_selection import GridSearchCV
    import warnings
    import numpy as np
    data = np.array(data)
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
        if fit_sample_size is not None:
            data = np.random.choice(data.ravel(), size=fit_sample_size,
                                    replace=False)
        min_val, max_val = find_min_max_diff(data)
        # Kernels supported by sklearn.neighbors.KernelDensity
        # (mirrors the library's VALID_KERNELS list).
        params = {
            'bandwidth': np.linspace(min_val, max_val, steps),
            'kernel': ['gaussian', 'tophat', 'epanechnikov',
                       'exponential', 'linear', 'cosine']
        }
        grid = GridSearchCV(skneighbor.KernelDensity(rtol=rtol), params, cv=cv)
        grid.fit(data.reshape(-1, 1))
    return grid.best_params_
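# A usage sketch for fit_best_kde; it assumes the external helper
# find_min_max_diff returns the smallest and largest gaps between sorted
# data values, which bound the bandwidth grid.
import numpy as np

rng = np.random.RandomState(0)
samples = rng.exponential(scale=2.0, size=1000)
best = fit_best_kde(samples, steps=25, cv=3)
print(best)  # e.g. {'bandwidth': ..., 'kernel': ...}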
def fit(self, X, y, weights=[1, 1]):
    self.classes_ = np.sort(np.unique(y))
    training_sets = [X[y == yi] for yi in self.classes_]
    # One KDE per class.
    self.models_ = [
        neighbors.KernelDensity(bandwidth=self.bandwidth,
                                kernel=self.kernel).fit(Xi)
        for Xi in training_sets
    ]
    weights = np.array(weights)
    # Log class priors, reweighted by the user-supplied class weights.
    self.logpriors_ = [
        np.log(Xi.shape[0] / X.shape[0]) for Xi in training_sets
    ] + np.log(weights)
    return self
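# The fit above follows the generative KDE-classifier pattern from the
# Python Data Science Handbook; a matching prediction method would look
# roughly like this (a sketch, not necessarily the class's actual code):
def predict_proba(self, X):
    # Per-class log-likelihoods plus log-priors, normalized to probabilities.
    logprobs = np.array([model.score_samples(X)
                         for model in self.models_]).T
    result = np.exp(logprobs + self.logpriors_)
    return result / result.sum(axis=1, keepdims=True)

def predict(self, X):
    return self.classes_[np.argmax(self.predict_proba(X), axis=1)]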
def fit(self, X, y):
    self.ages_ = y
    degree = 4
    ages_t = y.reshape(-1, 1)
    # Polynomial regressions of white- and grey-matter features on age.
    wm_model = sklearn.pipeline.make_pipeline(
        sklearn.preprocessing.PolynomialFeatures(degree),
        sklearn.linear_model.LinearRegression())
    gm_model = sklearn.pipeline.make_pipeline(
        sklearn.preprocessing.PolynomialFeatures(1),
        sklearn.linear_model.LinearRegression())
    self.wm_model_ = wm_model.fit(ages_t, X[:, 0])
    self.gm_model_ = gm_model.fit(ages_t, X[:, 1])
    # Age prior from a Gaussian KDE over the training ages.
    self.ages_grid_ = np.arange(15, 100).reshape(-1, 1)
    ages_kde = skn.KernelDensity(kernel="gaussian", bandwidth=3)
    ages_kde.fit(ages_t)
    prior = np.exp(ages_kde.score_samples(self.ages_grid_))
    self.prior_ = prior / np.sum(prior)
    # Model the residual spread as a function of age with GPs.
    wm_residuals = np.abs(wm_model.predict(ages_t) - X[:, 0])
    gm_kernel = (
        44.7 ** 2 * skg.kernels.RBF(length_scale=30,
                                    length_scale_bounds=(10, 60))
        + skg.kernels.WhiteKernel(noise_level=1e4,
                                  noise_level_bounds=(1e3, 1e5)))
    wm_kernel = (
        44.7 ** 2 * skg.kernels.RBF(length_scale=30,
                                    length_scale_bounds=(10, 60))
        + skg.kernels.WhiteKernel(noise_level=1e4,
                                  noise_level_bounds=(1e3, 1e5)))
    self.wm_gp_ = skg.GaussianProcessRegressor(kernel=wm_kernel,
                                               n_restarts_optimizer=0)
    self.wm_gp_.fit(ages_t, wm_residuals)
    gm_residuals = np.abs(gm_model.predict(ages_t) - X[:, 1])
    self.gm_gp_ = skg.GaussianProcessRegressor(kernel=gm_kernel,
                                               n_restarts_optimizer=0)
    self.gm_gp_.fit(ages_t, gm_residuals)
    return self
def pdf_from_kde(data, min_val=None, max_val=None, bandwidth=1.0,
                 kernel='gaussian'):
    '''
    This function generates a probability density function (PDF) based on
    a kernel density estimate that is fit using scikit-learn's
    sklearn.neighbors.KernelDensity method. Specifically, it returns two
    objects, pdfx and pdfy, that contain the support and probability
    values that define the PDF, respectively.

    Arguments:
    data: a 1-dimensional list or NumPy array that includes the data
    min_val: the minimum value to include in the PDF support (default is
        data minimum - 0.10*[range between max and min values])
    max_val: the maximum value to include in the PDF support (default is
        data maximum + 0.10*[range between max and min values])
    bandwidth: the bandwidth for the kernel density estimate
    kernel: the kernel type, which is passed directly to scikit-learn's
        sklearn.neighbors.KernelDensity method

    Returns:
    data: a dictionary with two keys, x and y. The values are NumPy arrays
        for the support (x) and probability values (y) that define the PDF.
    '''
    import sklearn.neighbors as skneighbor
    import numpy as np
    data = np.array(data)
    if min_val is None:
        min_val = data.min() - 0.10 * (data.max() - data.min())
    if max_val is None:
        max_val = data.max() + 0.10 * (data.max() - data.min())
    pdfx = np.linspace(min_val, max_val, 1000)
    kde = skneighbor.KernelDensity(bandwidth=bandwidth, kernel=kernel,
                                   rtol=0.1).fit(data.reshape(-1, 1))
    pdfy = np.exp(kde.score_samples(pdfx.reshape(-1, 1)))
    pdfy = pdfy / pdfy.sum()
    return {'x': pdfx, 'y': pdfy}
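# A short usage sketch of pdf_from_kde on synthetic data; the sample and
# bandwidth are illustrative assumptions.
import numpy as np

rng = np.random.RandomState(7)
samples = np.concatenate([rng.normal(-2, 0.5, 500),
                          rng.normal(3, 1.0, 500)])
pdf = pdf_from_kde(samples, bandwidth=0.3)
mode = pdf['x'][np.argmax(pdf['y'])]  # location of the highest-density point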
def fit(self, x_target=np.random.random((100, 3)),
        y_target=np.random.binomial(1, 0.5, (100,))):
    '''
    fit: combine the stored source data with the target data and fit one
    KDE per class.

    param
        x_target: numpy array (n,d) of features in target distribution
        y_target: numpy array (n,) of labels in target distribution

    Stores the class variables
        n: # source + target data points
        d: # feature dimension
        prop_target, logpriors_, classes, models_
    '''
    x_target = np.array(x_target)
    y_target = np.array(y_target)
    # Checking shape consistency
    if len(x_target.shape) != 2:
        raise TypeError('x_target is not an array of shape (n,d)')
    if len(y_target.shape) != 1:
        raise TypeError('y_target is not an array of shape (n,)')
    # Checking dimension consistency
    if self.x_source.shape[1] != x_target.shape[1]:
        raise TypeError(
            "Dimensions don't match for source and target features")
    m, self.d = self.x_source.shape
    self.n, _ = x_target.shape
    self.n += m
    x = np.concatenate((self.x_source, x_target))
    y = np.concatenate((self.y_source, y_target))
    # Class priors from the target label proportions.
    self.prop_target = np.mean(y_target)
    weights = np.array([1 - self.prop_target, self.prop_target])
    self.logpriors_ = np.log(weights)
    self.classes = np.array([0, 1])
    training_sets = [x[y == i] for i in [0, 1]]
    self.models_ = [
        neighbors.KernelDensity(bandwidth=self.bandwidth,
                                kernel=self.kernel).fit(xi)
        for xi in training_sets
    ]
def KDEEstimate2D(self, bandwidth, datalabelx, datalabely, xbins=100j,
                  ybins=100j, x_range=[0, 1], y_range=[0, 1],
                  kern='gaussian'):
    '''
    Performs a 2D kernel density estimation using data from the two
    variables specified in datalabelx and datalabely. The x- and y-ranges
    assume the data has been normalized to the full range [0,1].
    '''
    try:
        datax = self.df[datalabelx]
        datay = self.df[datalabely]
    except KeyError:
        print("No data found for one of these datalabels.")
        return
    # Keep only the points that fall inside both the x- and y-range.
    range_x_ind = np.where((datax > x_range[0]) & (datax < x_range[1]))[0]
    range_y_ind = np.where((datay > y_range[0]) & (datay < y_range[1]))[0]
    range_indices = np.intersect1d(range_x_ind, range_y_ind)
    datax = datax[range_indices]
    datay = datay[range_indices]
    # If entries are themselves arrays, flatten them into one sample.
    if isinstance(self.df[datalabelx][0], np.ndarray):
        datax = np.concatenate(self.df[datalabelx])
    if isinstance(self.df[datalabely][0], np.ndarray):
        datay = np.concatenate(self.df[datalabely])
    xx, yy = np.mgrid[x_range[0]:x_range[1]:xbins,
                      y_range[0]:y_range[1]:ybins]
    xy_grid = np.vstack([yy.ravel(), xx.ravel()]).T
    xy_dataset = np.vstack([datay, datax]).T
    TwoDKDE = skn.KernelDensity(bandwidth=bandwidth, kernel=kern)
    TwoDKDE.fit(xy_dataset)
    z = np.exp(TwoDKDE.score_samples(xy_grid))
    return xx, yy, np.reshape(z, xx.shape)
def kde_pdf(spike_times, bandwidth=50.0, xgrid=None, kernel='gaussian'):
    """Compute the probability density function using KernelDensity
    estimation with the specified bandwidth, evaluated at positions in
    xgrid.

    Return (pdf, xgrid)
    """
    spike_times = np.asarray(spike_times)
    if len(spike_times) == 0:
        warnings.warn('No spikes in spike trains')
        return (None, None)
    kde = skn.KernelDensity(kernel=kernel, bandwidth=bandwidth)
    kde.fit(spike_times[:, np.newaxis])
    if xgrid is None:
        # Default grid spans the spike train at half-bandwidth resolution.
        xgrid = np.arange(min(spike_times), max(spike_times), bandwidth / 2.0)
    log_pdf = kde.score_samples(xgrid[:, np.newaxis])
    pdf = np.exp(log_pdf)
    return pdf, xgrid
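# A usage sketch for kde_pdf with a synthetic spike train (times in ms);
# the rate, duration, and rate conversion are illustrative assumptions.
import numpy as np

rng = np.random.RandomState(3)
spike_times = np.sort(rng.uniform(0, 10000, size=200))  # ~20 Hz over 10 s
pdf, xgrid = kde_pdf(spike_times, bandwidth=50.0)
rate = pdf * len(spike_times) * 1000.0  # density -> firing rate in spikes/s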
def selectWithKernalDensity(dist_top):
    """
    Model selection routine that returns a list of models based on the
    output of kernel density estimation.

    :param dist_top: list of sorted distances
    """
    dist_top = np.asarray(dist_top)
    dist_top_reshape = dist_top.reshape((len(dist_top), 1))
    kde = neighbors.KernelDensity(kernel='tophat',
                                  bandwidth=0.005).fit(dist_top_reshape)
    log_dens = kde.score_samples(dist_top_reshape)
    # Local minima of the log-density mark gaps between groups of models.
    minInd = signal.argrelextrema(log_dens, np.less)
    return minInd, log_dens
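# A usage sketch: split a sorted distance list at the density gaps the
# routine detects; the synthetic two-group distances are an assumption.
import numpy as np

rng = np.random.RandomState(5)
dist_top = np.sort(np.concatenate([rng.uniform(0.00, 0.02, 30),
                                   rng.uniform(0.10, 0.12, 30)]))
min_ind, log_dens = selectWithKernalDensity(dist_top)
# Indices in min_ind[0] split dist_top into density-separated groups.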
def _kernel_density_joint(estimations, ranges, bandwidth=1 / 25):
    ndims = len(ranges)
    # Rescale every dimension to [0, 100] so one bandwidth fits all.
    scaler = _min_max_scaler(ranges, feature_range=(0, 100))
    bandwidth = bandwidth * 100
    kd = neighbors.KernelDensity(bandwidth=bandwidth).fit(
        scaler.transform(estimations))
    locations1d = np.arange(0, 100, 1)
    locations = np.reshape(np.meshgrid(*[locations1d] * ndims), (ndims, -1)).T
    kd_probs = np.exp(kd.score_samples(locations))
    shape = (ndims,) + (len(locations1d),) * ndims
    locations = scaler.inverse_transform(locations)
    locations = np.reshape(locations.T, shape)
    kd_probs = np.reshape(kd_probs, shape[1:])
    return locations, kd_probs, kd
def _kernel_density_joint(samples, weights, ranges, bandwidth=1 / 25):
    ndims = len(ranges)
    scaler = _min_max_scaler(ranges, feature_range=(0, 100))
    bandwidth = bandwidth * 100
    kd = neighbors.KernelDensity(bandwidth=bandwidth)
    # Weighted variant: pass per-sample weights to the KDE fit.
    kd.fit(scaler.transform(samples), sample_weight=weights)
    grid_shape = [100] * ndims
    grid = np.indices(grid_shape)
    locations = np.reshape(grid, (ndims, -1)).T
    kd_probs = np.exp(kd.score_samples(locations))
    shape = (ndims, *grid_shape)
    locations = scaler.inverse_transform(locations)
    locations = np.reshape(locations.T, shape)
    kd_probs = np.reshape(kd_probs, grid_shape)
    return locations, kd_probs, kd
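# A self-contained sketch of the weighted-KDE-on-a-grid idea above, in 1D
# for clarity; the data, weights, and MinMaxScaler setup are illustrative
# assumptions (the _min_max_scaler helper and the N-D grid are omitted).
import numpy as np
from sklearn import neighbors
from sklearn.preprocessing import MinMaxScaler

rng = np.random.RandomState(0)
samples = rng.normal(3, 1, size=(300, 1))
weights = rng.uniform(0.1, 1.0, size=300)
scaler = MinMaxScaler(feature_range=(0, 100)).fit(samples)
kd = neighbors.KernelDensity(bandwidth=4.0)
kd.fit(scaler.transform(samples), sample_weight=weights)
grid = np.arange(0, 100, 1.0)[:, None]
probs = np.exp(kd.score_samples(grid))
locations = scaler.inverse_transform(grid)  # back to the original units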
def KDEEstimate2D(self, bandwidth, datalabelx, datalabely, xbins=100j,
                  ybins=100j, kern='gaussian'):
    try:
        datax = self.df[datalabelx]
        datay = self.df[datalabely]
    except KeyError:
        print("No data found for one of these datalabels.")
        return
    # If entries are themselves arrays, flatten them into one sample.
    if isinstance(self.df[datalabelx][0], np.ndarray):
        datax = np.concatenate(self.df[datalabelx])
    if isinstance(self.df[datalabely][0], np.ndarray):
        datay = np.concatenate(self.df[datalabely])
    xx, yy = np.mgrid[datax.min():datax.max():xbins,
                      datay.min():datay.max():ybins]
    xy_grid = np.vstack([yy.ravel(), xx.ravel()]).T
    xy_dataset = np.vstack([datay, datax]).T
    TwoDKDE = skn.KernelDensity(bandwidth=bandwidth, kernel=kern)
    TwoDKDE.fit(xy_dataset)
    z = np.exp(TwoDKDE.score_samples(xy_grid))
    return xx, yy, np.reshape(z, xx.shape)
def KDEEstimate1D(self, bandwidth, datalabel, x_range=[0, 1], bins=100,
                  kern='gaussian'):
    '''
    Performs a 1D kernel density estimation using data from the variable
    specified in datalabel. The x-range assumes the data has been
    normalized to the full range [0,1].
    '''
    try:
        data = self.df[datalabel]
    except KeyError:
        print("No data found for this datalabel.")
        return
    if isinstance(self.df[datalabel][0], np.ndarray):
        data = np.concatenate(self.df[datalabel])
    linspace = np.linspace(x_range[0], x_range[1],
                           int((x_range[1] - x_range[0]) * bins))
    kde = skn.KernelDensity(bandwidth=bandwidth, kernel=kern)
    kde.fit(np.asarray(data)[:, None])
    logp = kde.score_samples(linspace[:, None])
    return linspace, np.exp(logp)
def KDEEstimate1D(self, datalabel, xlims=None, kern='gaussian'):
    try:
        bandwidth = self.bandwidths[datalabel]
        data = self.df[datalabel]
    except KeyError:
        print("No bandwidth or data found for this datalabel.")
        return
    # If entries are themselves arrays, flatten them into one sample.
    if isinstance(self.df[datalabel][0], np.ndarray):
        data_arr = []
        for i in range(len(self.df[datalabel])):
            data_arr = data_arr + list(self.df[datalabel][i])
        data = np.array(data_arr)
    if xlims is None:
        xlims = [np.min(data), np.max(data)]
    linspace = np.linspace(xlims[0], xlims[1],
                           int((xlims[1] - xlims[0]) * 100.))
    kde = skn.KernelDensity(bandwidth=bandwidth, kernel=kern)
    kde.fit(np.asarray(data)[:, None])
    logp = kde.score_samples(linspace[:, None])
    return linspace, np.exp(logp)
""" Make a 'true' integration over 2 dims of the KDE to obtain a marginalized 1D distribtuion (here in logE). Use multiprocessing, because it takes time on a single processor. """ import numpy as np import scipy.integrate as scint import sklearn.neighbors as skn from multiprocessing import Pool exp = np.load("./data/IC86_I_data.npy") kde = skn.KernelDensity(bandwidth=0.1, kernel="gaussian", rtol=1e-8) # KDE sample must be cut in sigma before fitting, similar to range in hist _exp = exp[exp["sigma"] <= np.deg2rad(5)] fac_logE = 1.5 fac_dec = 2.5 fac_sigma = 2. _logE = fac_logE * _exp["logE"] _sigma = fac_sigma * np.rad2deg(_exp["sigma"]) _dec = fac_dec * _exp["dec"] kde_sample = np.vstack((_logE, _dec, _sigma)).T # Fit KDE best model to sample kde.fit(kde_sample)
def findClusters(self, data_set, data_set_training=None):
    self.clusters = []
    self.clusters_training = []
    self.indices = []
    self.distance = []
    self.labels = []
    kde = neighbors.KernelDensity(bandwidth=0.75).fit(data_set)
    if data_set_training is None:
        data_set_training = data_set
    else:
        # Train on the union of both sets.
        mergedlist = []
        mergedlist.extend(data_set)
        mergedlist.extend(data_set_training)
        data_set_training = mergedlist
    kmeans = MiniBatchKMeans(n_clusters=self.n_clusters,
                             random_state=0).fit(data_set_training)
    # Distance of every training point to every cluster centre.
    distance_clusters = kmeans.fit_transform(data_set_training)
    # Prepare per-cluster containers.
    for i in range(len(kmeans.cluster_centers_)):
        self.clusters.append([])
        self.clusters_training.append([])
        self.indices.append([])
    for i in range(len(data_set_training)):
        self.clusters_training[kmeans.labels_[i]].append(data_set_training[i])
    for i in range(len(data_set)):
        # Store the summed distance to all centres plus the KDE log-density.
        dist = np.sum(distance_clusters[i])
        self.distance.append([dist])
        self.indices[kmeans.labels_[i]].append(i)
        self.clusters[kmeans.labels_[i]].append(data_set[i])
        density = kde.score_samples([data_set[i]])
        self.distance[i].append(density[0])
    return distance_clusters
def plotattractors(report, reduction, figsize=None, labelsize=None,
                   connect_psets=False, contour=False, downsample=None,
                   density_downsample=None, focus=None, focus_osc=False,
                   hide_defocused=False, color_code=False, square=False):
    """
    Set up a hexbin or scatter-line plot in the current pyplot.

    Arguments:
    - report: full parameter sampling report
    - reduction: how to map concentration values to 2D space: an instance
      of e.g. PCA2D or AverageLog
    - figsize: figure size as a tuple of inches (width by height)
    - labelsize: font size for axis labels
    - connect_psets: whether to make a scatter-line plot instead of a
      hexbin plot
    - contour: proportion of density outside the lowest contour level, or
      False to not add contour lines
    - downsample: ruleset to downsample systems for display
    - density_downsample: ruleset to downsample systems for contour/density
      estimation
    - focus: Boolean-valued ruleset to focus systems (scatter-line only,
      default all focused)
    - focus_osc: whether to focus systems containing oscillators
      (scatter-line only, will defocus all others if focus not set)
    - hide_defocused: whether to hide all non-focused systems (scatter-line
      only)
    - color_code: whether to color lines by system type (scatter-line only)
    - square: whether to force a square plot
    """
    reduction.prepare(report)
    random.seed(1)
    summary_occurrences = categorizeattractors(report)
    filtered_psets = applydownsample(summary_occurrences, downsample)
    points = reduction.reduce(psets_matrix(filtered_psets))
    xlabel, ylabel = reduction.labels()
    fig, ax_main = plt.subplots(figsize=figsize)
    if connect_psets:
        distinct_summaries = list(categorizeattractors(filtered_psets).keys())
        default_cycle = cycler.cycler(color=[
            'tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple',
            'tab:brown', 'tab:gray', 'tab:olive', 'tab:cyan'
        ])
        default_cycler = default_cycle()
        defocus_default_cycler = plt.rcParams['axes.prop_cycle']()
        for i, pset in enumerate(filtered_psets):
            pset_matrix = np.array(caricatureattractors(pset['attractors']))
            pset_xy = reduction.reduce(pset_matrix)
            sorted_attractors = pset_xy[pset_matrix[:, 0].argsort(), :]
            point_mask = [not isoscillator(a) for a in pset['attractors']]
            has_oscillator = not all(point_mask)
            z = i
            linewidth = None
            oscwidth = 1.6
            dotsize = 36.0
            defocused = False
            summary = summarizeattractors(pset)
            if focus or focus_osc:
                if (focus_osc and has_oscillator) or (
                        focus and specificrulevalue(focus, summary,
                                                    default=False)):
                    z += len(filtered_psets) + 1
                elif hide_defocused:
                    continue
                else:
                    linewidth = 0.8
                    oscwidth = 1.1
                    dotsize = 10.0
                    defocused = True
            if color_code:
                hue, sat, lum, hue_vary_width = summaryhsl(
                    distinct_summaries, summary)
                hue += random.uniform(0, hue_vary_width)
                if not defocused:
                    lum *= random.uniform(0.85, 1.1)
                    sat *= random.uniform(0.8, 1.0)
            elif defocused:
                next_prop = next(defocus_default_cycler)
                color_spec = next_prop['color']
                r, g, b = mplcolors.to_rgb(color_spec)
                hue, sat, lum = colorsys.rgb_to_hls(r, g, b)
            if defocused:
                lum = min(1 - (1 - lum) * random.uniform(0.3, 0.5), 0.9)
                sat *= random.uniform(0.35, 0.45)
            if color_code or defocused:
                pset_color = colorsys.hls_to_rgb(hue, lum, sat)
            else:
                pset_color = next(default_cycler)['color']
            ax_main.plot(sorted_attractors[:, 0], sorted_attractors[:, 1],
                         lw=linewidth, color=pset_color, zorder=z)
            pointprops = {'s': dotsize} if defocused or not contour else {
                'linewidths': 1.0, 'edgecolors': 'white', 's': dotsize * 1.3}
            ax_main.scatter(pset_xy[point_mask, 0], pset_xy[point_mask, 1],
                            color=pset_color, zorder=z, **pointprops)
            for osc in (a for a in pset['attractors'] if isoscillator(a)):
                vertices = np.array(osc['orbit'])
                projected_vertices = reduction.reduce(vertices)
                if projected_vertices.shape[0] >= 3:
                    projected_vertices = np.vstack(
                        (projected_vertices, projected_vertices[0, :]))
                polygon = mplpatch.Polygon(projected_vertices,
                                           color=pset_color,
                                           linewidth=oscwidth,
                                           linestyle='--', fill=False,
                                           zorder=z)
                ax_main.add_patch(polygon)
    else:
        cmap = copy.copy(plt.get_cmap('viridis'))
        cmap.set_under('white', 1.0)
        hex_args = {'linewidths': 0.2, 'norm': mplcolors.LogNorm(vmin=2),
                    'cmap': cmap, 'gridsize': 40}
        bin_results = ax_main.hexbin(points[:, 0], points[:, 1], **hex_args)
        fig.colorbar(bin_results, ax=ax_main, label='Attractors')
    if contour:
        random.seed(1)
        density_filtered_psets = applydownsample(summary_occurrences,
                                                 density_downsample)
        density_points = reduction.reduce(
            psets_matrix(density_filtered_psets))
        kde = neighbors.KernelDensity(kernel='gaussian',
                                      bandwidth=0.1).fit(density_points)
        bin_x, bin_y = np.mgrid[
            (density_points[:, 0].min() - 0.15):(density_points[:, 0].max() + 0.15):80j,
            (density_points[:, 1].min() - 0.15):(density_points[:, 1].max() + 0.15):80j]
        density = np.exp(kde.score_samples(
            np.vstack((bin_x.flatten(), bin_y.flatten())).T))
        sorted_densities = np.sort(density.flatten())
        total_density = np.sum(sorted_densities)
        cdf = np.cumsum(sorted_densities) / total_density
        if connect_psets:
            cutoff_indices = [
                np.where(cdf > percentile)[0][0]
                for percentile in np.linspace(contour, 1, 5)[:-1]
            ]
            levels = [sorted_densities[c]
                      for c in cutoff_indices] + [total_density]
            colors = ['#c65ff560', '#af36e388', '#b300ff90', '#8500e2a0']
            ax_main.contourf(bin_x, bin_y, density.reshape(bin_x.shape),
                             levels, colors=colors,
                             zorder=len(filtered_psets))
        else:
            cutoff_indices = [
                np.where(cdf > percentile)[0][0]
                for percentile in np.linspace(contour, 0.9, 6)
            ]
            levels = [sorted_densities[c] for c in cutoff_indices]
            widths = np.linspace(0.5, 1.4, 6)
            ax_main.contour(bin_x, bin_y, density.reshape(bin_x.shape),
                            levels, linewidths=widths, colors='black',
                            zorder=(len(filtered_psets) * 3), alpha=0.6)
    if square:
        ax_main.axis('square')
    elif reduction.equalscale():
        ax_main.axis('equal')
    if reduction.zerobased('x'):
        ax_main.set_xlim(left=0)
    if reduction.zerobased('y'):
        ax_main.set_ylim(bottom=0)
    locator_base = reduction.locatorbase()
    if locator_base is not None:
        ax_main.xaxis.set_major_locator(
            mpltick.MultipleLocator(base=locator_base))
        ax_main.yaxis.set_major_locator(
            mpltick.MultipleLocator(base=locator_base))
    x_text = ax_main.set_xlabel(xlabel)
    if labelsize is not None:
        x_text.set_fontsize(labelsize)
    y_text = ax_main.set_ylabel(ylabel)
    if labelsize is not None:
        y_text.set_fontsize(labelsize)
print(f'collect trajs {time.time() - start:.0f}s', flush=True)

if v['obj'] in ["emd"]:
    if v['critic']["reinitialize"] or itr == 0:
        critic = Critic(len(state_indices), **v['critic'], device=device)
    start = time.time()
    critic_loss = critic.learn(expert_samples.copy(), agent_emp_states,
                               iter=v['critic']['iter'])
    print(f'train critic {time.time() - start:.0f}s', flush=True)
elif v['density']['model'] == 'kde':
    # Initialize a density model using KDE
    agent_density = neighbors.KernelDensity(
        bandwidth=v['density']['kde']['bandwidth'],
        kernel=v['density']['kde']['kernel'])
    agent_density.fit(agent_emp_states)
elif v['density']['model'] == "disc":
    start = time.time()
    # learn log density ratio
    disc = Disc(len(state_indices), **v['density']['disc'], device=device)
    disc_loss = disc.learn(expert_samples.copy(), agent_emp_states,
                           iter=v['density']['disc']['iter'])
    print(f'train disc {time.time() - start:.0f}s', flush=True)

old_reward = copy.deepcopy(reward_func)
def draw_fit_with_peaks(self, num_walkers, num_steps_to_include):
    num_dim = len(self.a_free_par_guesses)
    sPathToFile = self.s_directory_save_name + self.dict_filename
    if os.path.exists(sPathToFile):
        dSampler = pickle.load(open(sPathToFile, 'rb'))
        l_chains = []
        for sampler in dSampler[num_walkers]:
            l_chains.append(sampler['_chain'])
        a_sampler = np.concatenate(l_chains, axis=1)
        print('Successfully loaded sampler!')
    else:
        print(sPathToFile)
        print('Could not find file!')
        sys.exit()

    max_num_events_for_kde = 5e4
    assert num_steps_to_include * num_walkers < max_num_events_for_kde, \
        'Using KDE to estimate maximum in full space so must use less ' \
        'than %d events for time constraints.\n' % int(max_num_events_for_kde)

    a_sampler = a_sampler[:, -num_steps_to_include:, :].reshape((-1, num_dim))

    # Standardize the samples so one KDE bandwidth suits all parameters.
    scaler = preprocessing.StandardScaler()
    scaler.fit(a_sampler)
    a_scaled_samples = scaler.transform(a_sampler)

    # Find the best-fit bandwidth since this allows us to play with
    # bias vs variance.
    grid = grid_search.GridSearchCV(
        neighbors.KernelDensity(),
        {'bandwidth': np.linspace(0.01, 2., 20)},
        cv=4, verbose=1, n_jobs=4)
    print('\nDetermining best bandwidth...\n')
    grid.fit(a_scaled_samples)

    kde = neighbors.KernelDensity(**grid.best_params_)
    kde.fit(a_scaled_samples)

    def func_for_minimizing_for_plot(a_parameters):
        a_scaled_parameters = scaler.transform(a_parameters)
        return -kde.score(a_scaled_parameters)

    # Search for the posterior mode between the 2nd and 98th percentiles.
    a_bounds = [np.percentile(a_sampler[:, i], [2, 98])
                for i in range(num_dim)]
    result = op.differential_evolution(func_for_minimizing_for_plot,
                                       a_bounds, disp=True, maxiter=100,
                                       tol=0.01, popsize=20, polish=True)
    print(result.x)

    l_num_pe = [0, 1, 2, 3, 4, 5, 6]
    l_colors = ['r', 'b', 'g', 'c', 'y', 'm', 'brown']
    (prob_hit_first, mean_e_from_dynode, width_e_from_dynode,
     probability_electron_ionized, bkg_mean, bkg_std, mean_num_pe,
     scale_par) = result.x

    l_hists = [np.zeros(len(self.d_fit_files['bin_centers_plots']),
                        dtype=np.float32) for i in range(len(l_num_pe))]
    sum_hist = np.zeros(len(self.d_fit_files['bin_centers_plots']),
                        dtype=np.float32)

    mean_num_pe = np.asarray(mean_num_pe, dtype=np.float32)
    num_trials = np.asarray(self.num_mc_events, dtype=np.int32)
    prob_hit_first = np.asarray(prob_hit_first, dtype=np.float32)
    mean_e_from_dynode = np.asarray(mean_e_from_dynode, dtype=np.float32)
    width_e_from_dynode = np.asarray(width_e_from_dynode, dtype=np.float32)
    probability_electron_ionized = np.asarray(probability_electron_ionized,
                                              dtype=np.float32)
    bkg_mean = np.asarray(bkg_mean, dtype=np.float32)
    bkg_std = np.asarray(bkg_std, dtype=np.float32)
    bin_edges = np.asarray(self.d_fit_files['bin_edges_plots'],
                           dtype=np.float32)
    num_bins = np.asarray(len(bin_edges) - 1, dtype=np.int32)

    sum_of_hists = 0
    for i, num_pe in enumerate(l_num_pe):
        current_hist = l_hists[i]
        # Number of MC trials for this photoelectron count, Poisson-weighted.
        num_trials = np.asarray(
            int(self.num_mc_events *
                scipy.stats.poisson.pmf(num_pe, mean_num_pe)),
            dtype=np.int32)
        num_pe = np.asarray(num_pe, dtype=np.int32)
        l_args_gpu = [
            self.rng_states, drv.In(num_trials), drv.In(self.num_loops),
            drv.InOut(current_hist), drv.In(num_pe), drv.In(prob_hit_first),
            drv.In(mean_e_from_dynode), drv.In(width_e_from_dynode),
            drv.In(probability_electron_ionized), drv.In(bkg_mean),
            drv.In(bkg_std), drv.In(num_bins), drv.In(bin_edges)
        ]
        gpu_fixed_pe_cascade_spectrum(*l_args_gpu, **self.d_gpu_scale)
        sum_of_hists += np.sum(current_hist)
        l_hists[i] = current_hist

    for i, num_pe in enumerate(l_num_pe):
        current_hist = l_hists[i]
        current_hist = np.asarray(current_hist, dtype=np.float32) * np.sum(
            self.d_fit_files['hist']) / sum_of_hists * self.d_fit_files[
                'bin_width'] / self.d_fit_files['bin_width_plots'] * scale_par
        sum_hist += current_hist
        l_hists[i] = current_hist

    f1, (ax1) = plt.subplots(1)
    ax1.set_yscale('log', nonposx='clip')
    (a_x_values, a_y_values, a_x_err_low, a_x_err_high, a_y_err_low,
     a_y_err_high) = neriX_analysis.prepare_hist_arrays_for_plotting(
         self.d_fit_files['hist'], self.d_fit_files['bin_edges'])
    ax1.errorbar(a_x_values, a_y_values, xerr=[a_x_err_low, a_x_err_high],
                 yerr=[a_y_err_low, a_y_err_high], color='k', fmt='.')
    for i in range(len(l_num_pe)):
        ax1.plot(self.d_fit_files['bin_centers_plots'], l_hists[i],
                 color=l_colors[i])
    ax1.plot(self.d_fit_files['bin_centers_plots'], sum_hist,
             color='darkorange', linestyle='-')
    ax1.set_title('Integrated Charge Spectrum - %s' % (self.file_identifier))
    ax1.set_xlabel(r'Integrated Charge [$e^{-}$]')
    ax1.set_ylabel('Counts')

    f1.savefig('%s%s_pe_specs_%s.png' % (self.s_directory_save_plots_name,
                                         self.s_base_save_name,
                                         self.file_identifier))
fac_dec = 2.5
fac_sigma = 2.
# ###########################################################################
logE = fac_logE * exp["logE"]
sigma = fac_sigma * np.rad2deg(exp["sigma"])
dec = fac_dec * exp["dec"]
sample = np.vstack((
    fac_logE * exp["logE"],
    fac_dec * exp["dec"],  # Normal space to have no hard cuts at the edges
    fac_sigma * np.rad2deg(exp["sigma"]),  # In deg to match scale
)).T

# Optimize bandwidth in a cross validation.
kde_estimator = skn.KernelDensity(kernel="gaussian", rtol=1e-6)

# Scan grid. See comment on top on parameter ranges
SCAN = "followup_2nd_pass"
start = 0.1
step = 0.001
stop = 0.12 + step
bandwidths = np.arange(start, stop, step)
ncv = 20

param_grid = {"bandwidth": bandwidths}
model_selector = skms.GridSearchCV(estimator=kde_estimator, cv=ncv,
                                   param_grid=param_grid)
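# The fragment ends at the grid-search construction; a natural continuation
# (an assumption, mirroring the fit-then-extract pattern used elsewhere in
# this collection) would be:
model_selector.fit(sample)
best_bw = model_selector.best_params_["bandwidth"]
print("Best bandwidth for scan '{}': {}".format(SCAN, best_bw))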
n_event = len(fnames)
job_list = allocate_jobs(n_event, n_procs, rank)
for i in job_list:
    # read samples
    samples = np.genfromtxt(os.path.join(data_dir, fnames[i]))
    d_l_samples.append(samples)
    pkl_fname = os.path.join(data_dir, fnames[i].replace('.txt', '.pkl'))
    if recompute:
        # fit KDE to samples and pickle for later
        print('fitting ' + fnames[i])
        gs = skms.GridSearchCV(skn.KernelDensity(),
                               {'bandwidth': bw_grid})
        gs.fit(samples[:, None])
        kde = gs.best_estimator_
        pickle.dump(kde, open(pkl_fname, 'wb'))
        print('optimal bw: {:9.3e}'.format(kde.bandwidth))
    else:
        # load the previously fitted and pickled KDE
        kde = pickle.load(open(pkl_fname, 'rb'))
    # plot fits
    hist, bin_edges = np.histogram(samples, bins=50, density=True)
    bin_centres = (bin_edges[1:] + bin_edges[:-1]) / 2.0
    pdf = np.exp(kde.score_samples(d_l_grid[:, None]))
    if rank == 0:
        axes[0].plot(bin_centres, hist)
        axes[1].plot(d_l_grid, pdf)
def hkcmfind(self, magname='Kmag', dmagname='eKmag', niter=1000,
             kernel='epanechnikov'):
    # Set the data to find the TRGB
    mk = self.data.hmag.dropna() - self.data.kmag.dropna()
    dmk = np.sqrt(self.data.kerr.dropna()**2 + self.data.herr.dropna()**2)
    mk = -mk

    # Initialise stuff
    rtol = 1e-5  # Relative tolerance of the KDE result
    mx = np.linspace(max(mk) * 1.2, min(mk) * 0.8, 1000)
    trgbloc = np.zeros(niter)

    # ----------------------------------------
    # Generate NITER realisations of the kernel density estimate
    # (default kernel is the parabolic 'epanechnikov')
    for i in range(niter):
        # Add noise to the data -> different each loop -> more reliable TRGB
        msamp = np.random.normal(mk, dmk)

        # Find an ideal binwidth for the luminosity function.
        # PS: the Monte Carlo already smooths the distribution, so reduce
        # the ideal binwidth a bit.
        bandwidth_factor = 0.25
        bandwidth = bandwidth_factor * (np.std(msamp) * (len(msamp)**(-0.2)))

        # ----------------------------------------
        # Kernel density estimation using a KD tree for efficient queries
        kde = neighbors.KernelDensity(bandwidth=bandwidth, rtol=rtol,
                                      kernel=kernel)
        kde.fit(msamp[:, np.newaxis])  # fit the kernel density model

        # Evaluate the density model over the magnitude range
        # (score_samples returns ln(pdf); mx is the x-axis range over
        # which the PDF is computed/plotted)
        pdf = np.exp(kde.score_samples(mx[:, np.newaxis]))
        plt.plot(mx, pdf)

        # ----------------------------------------
        # Edge detection with a Savitzky-Golay first derivative
        smooth_window = 31
        poly_degree = 3
        dpdf = savgol_filter(pdf, smooth_window, poly_degree, deriv=1)
        # The most negative derivative marks the steepest drop: the TRGB.
        trgbloc[i] = mx[np.argmin(dpdf)]

    trgbloc_mean = np.mean(trgbloc)  # TRGB estimate
    trgbloc_sd = np.std(trgbloc)     # error on the TRGB estimate
    return [trgbloc_mean, trgbloc_sd]