def predict(self, X_in, y_in, X_te_in):
    X_tr = X_in.copy()
    Y_tr = y_in.copy()
    X_oos = X_te_in.copy()
    unique_class_vals = Y_tr.unique()
    best_params = self.best_params
    gmm_dict = {class_val: 0 for class_val in unique_class_vals}
    for class_val in unique_class_vals:
        # BayesianGaussianMixture is unsupervised, so fit() ignores any y
        # argument; fit one density model per class
        gmm = BayesianGaussianMixture(
            n_components=best_params[class_val]['n_components'],
            covariance_type=best_params[class_val]['covariance_type'],
            weight_concentration_prior=best_params[class_val]['weight_concentration_prior'],
            reg_covar=1,
        ).fit(X_tr[Y_tr == class_val])
        # score_samples returns log-likelihoods; exponentiate to get densities
        gmm_dict[class_val] = np.exp(gmm.score_samples(X_oos))
    gmm_df = pd.DataFrame.from_dict(gmm_dict)
    res_df = gmm_df.idxmax(axis=1)
    self.y_pred = res_df.values
    return res_df.values
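# A minimal, self-contained sketch of the per-class density classification used in
# predict() above: fit one BayesianGaussianMixture per class, then assign each
# out-of-sample point to the class with the highest density. The hyperparameters
# and synthetic data below are illustrative stand-ins for the tuned values that
# self.best_params would hold.
import numpy as np
import pandas as pd
from sklearn.mixture import BayesianGaussianMixture

rng = np.random.default_rng(0)
X_tr = pd.DataFrame(np.vstack([rng.normal(0, 1, (50, 2)),
                               rng.normal(4, 1, (50, 2))]))
Y_tr = pd.Series([0] * 50 + [1] * 50)
X_oos = pd.DataFrame(rng.normal(2, 2, (10, 2)))

densities = {}
for class_val in Y_tr.unique():
    gmm = BayesianGaussianMixture(n_components=2, covariance_type='full',
                                  reg_covar=1e-3, random_state=0)
    gmm.fit(X_tr[Y_tr == class_val])
    densities[class_val] = np.exp(gmm.score_samples(X_oos))

y_pred = pd.DataFrame(densities).idxmax(axis=1).values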
def run(self):
    args = self.args
    uniblock_path = self._get_uniblock_path()
    feature = load(os.path.join(uniblock_path, 'feature.dump'))
    X = feature.get_feature_matrix(args.corpus_path)
    legal, mask = self._infer_nonzero(X)
    dump(legal, os.path.join(uniblock_path, 'legal.dump'))
    dump(mask, os.path.join(uniblock_path, 'mask.dump'))
    X = X[:, legal]
    bgm = BayesianGaussianMixture(
        n_components=args.k,
        covariance_type=args.cov,
        max_iter=200,
        random_state=0,
        verbose=0 if not args.verbose else 2,
        verbose_interval=1,
        tol=args.tol,
        n_init=args.n_init,
        init_params=args.init_params,
    )
    bgm.fit(X)
    dump(bgm, os.path.join(uniblock_path, 'bgm.dump'))
    scores = bgm.score_samples(X)
    self._log_scores(scores, args.corpus_path)
    self._log_stats(scores, args.corpus_path)
def __init__(self, **kwargs):
    super().__init__(data_set=kwargs.pop('data_set', None), **kwargs)
    self.clf_ = kwargs.get('clf', None)
    if self.clf_ is None:
        raise ValueError("missing required keyword-only argument 'clf'")
    if not callable(getattr(self.clf_, 'fit', None)) or not callable(
            getattr(self.clf_, 'predict_proba', None)):
        raise TypeError(
            "'clf' must be an instance with the methods 'fit' and 'predict_proba'"
        )
    n_components = int(
        kwargs.pop('n_components', np.min([20, len(self.data_set_)])))
    if n_components < 1 or n_components > len(self.data_set_):
        raise ValueError(
            "'n_components' must be an integer in the interval [1, n_samples]"
        )
    # fit Gaussian mixture model for pre-clustering
    gmm = BayesianGaussianMixture(n_components=n_components,
                                  covariance_type='spherical',
                                  max_iter=1000,
                                  random_state=self.random_state_)
    gmm.fit(self.data_set_.X_)
    self.y_cluster_ = gmm.predict(self.data_set_.X_)
    # marginal density p(x) of each sample under the fitted mixture
    self.p_x_ = np.exp(gmm.score_samples(self.data_set_.X_))
def train_gmm(self, samples):
    co_type = 'tied'
    gmm = BayesianGaussianMixture(n_components=2,
                                  covariance_type=co_type,
                                  n_init=10,
                                  random_state=0,
                                  max_iter=500,
                                  verbose=1)
    # convert the BGR pixel samples to YCrCb before fitting the colour model
    samples = cv2.cvtColor(samples.reshape(1, -1, 3), cv2.COLOR_BGR2YCrCb)
    samples = samples.reshape(-1, 3)
    gmm.fit(samples)
    # note: score_samples returns log-densities, so this is a max log-density,
    # not a probability
    max_prob = np.max(gmm.score_samples(samples))
    return gmm, max_prob
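# Hedged usage sketch for train_gmm() above, assuming OpenCV (cv2) is installed.
# The uint8 BGR pixel block is synthetic; in practice it would be sampled from a
# region of interest in an image.
import cv2
import numpy as np
from sklearn.mixture import BayesianGaussianMixture

pixels = np.random.default_rng(0).integers(0, 256, (500, 3), dtype=np.uint8)
ycrcb = cv2.cvtColor(pixels.reshape(1, -1, 3), cv2.COLOR_BGR2YCrCb).reshape(-1, 3)
gmm = BayesianGaussianMixture(n_components=2, covariance_type='tied',
                              random_state=0).fit(ycrcb)
# score_samples returns log-densities; the maximum over the training pixels can
# serve as a reference level when scoring new pixels against this colour model
max_log_density = np.max(gmm.score_samples(ycrcb))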
def bayesianGMM(points, allPoints, numComponents):
    if len(allPoints) < numComponents:
        numComponents = len(allPoints)
    clf = BayesianGaussianMixture(
        weight_concentration_prior_type="dirichlet_process",
        weight_concentration_prior=100000,
        n_components=2 * numComponents,
        reg_covar=0,
        init_params='random',
        max_iter=1500,
        mean_precision_prior=.8,
        random_state=2)
    clf.fit(allPoints)
    return np.exp(clf.score_samples(points))
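# Example call for bayesianGMM() above: estimate the density of a 2-D point cloud
# and evaluate it at two query points, one near the bulk of the data and one far
# away. The data here is synthetic and purely illustrative.
import numpy as np

allPoints = np.random.default_rng(1).normal(size=(200, 2))
queries = np.array([[0.0, 0.0], [8.0, 8.0]])
dens = bayesianGMM(queries, allPoints, numComponents=5)
# the density near the bulk of the data should exceed the density at the outlier
assert dens[0] > dens[1]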
def fit(self, X, y):
    """ y must be composed of 0 and 1 """
    self.gmms_ = {0: [], 1: []}
    ll_list = []
    for n in self.n_clusters_list:
        gmm0 = BayesianGaussianMixture(n_components=n,
                                       covariance_type='full',
                                       random_state=self.random_state).fit(X[y == 0])
        gmm1 = BayesianGaussianMixture(n_components=n,
                                       covariance_type='full',
                                       random_state=self.random_state).fit(X[y == 1])
        self.gmms_[0].append(gmm0)
        self.gmms_[1].append(gmm1)
        # per-sample log-likelihood ratio between the two class densities
        ll = gmm1.score_samples(X) - gmm0.score_samples(X)
        ll_list.append(ll)
    ll_arr = np.stack(ll_list, axis=1)
    self.comb_ = LogisticRegression(solver='lbfgs').fit(ll_arr, y)
    return self
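# The class above only shows fit(); a matching predict_proba() would rebuild the
# per-pair log-likelihood-ratio features and defer to the fitted logistic
# regression combiner. This is a hedged sketch of such a method, not the
# original author's code.
def predict_proba(self, X):
    ll_list = [gmm1.score_samples(X) - gmm0.score_samples(X)
               for gmm0, gmm1 in zip(self.gmms_[0], self.gmms_[1])]
    ll_arr = np.stack(ll_list, axis=1)
    return self.comb_.predict_proba(ll_arr)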
def denoise(times, waveforms):
    threshold = np.log(0.001)
    pcaed = PCA(n_components=2).fit_transform(waveforms)
    pcaed = scipy.stats.zscore(pcaed, axis=0)
    mix = BayesianGaussianMixture(n_components=2).fit(pcaed)
    logprob = mix.score_samples(pcaed)
    times = times[logprob > threshold]
    waveforms = waveforms[logprob > threshold]
    denoised = denoising_sort(times, waveforms)
    # denoised = denoised.select([isi(n) < 0.05 for n in denoised.nodes])
    denoised = cluster_step(denoised,
                            dpoints=200,
                            n_components=10,
                            min_cluster_size=3,
                            mode="kmeans")
    # denoised = denoised.select([isi(n) < 0.05 for n in denoised.nodes])
    return denoised
def embed_mixture_variational(xmaps_np, n_components):
    sample_by_feature = np.vstack([np_map.flatten()
                                   for dtag, np_map in xmaps_np.items()])

    begin = time.time()

    # reduce dimensionality before fitting the mixture
    pca = PCA(n_components=50)
    sample_by_feature_pca = pca.fit_transform(sample_by_feature)
    print("shape is: {}".format(sample_by_feature_pca.shape))

    mixture = BayesianGaussianMixture(n_components=n_components,
                                      covariance_type="spherical",
                                      verbose=10,
                                      verbose_interval=2)
    mixture.fit(sample_by_feature_pca)

    finish = time.time()
    print(mixture)
    print("Finished in {}".format(finish - begin))
    print(mixture.predict(sample_by_feature_pca))
    print(mixture.weights_)

    clusters = mixture.predict(sample_by_feature_pca)
    probabilities = mixture.score_samples(sample_by_feature_pca)

    return mixture, pca, clusters, probabilities
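# Hedged usage sketch for embed_mixture_variational() above: xmaps_np maps a
# dataset tag to an array that can be flattened into one feature vector. Note
# that at least 50 maps are needed for the PCA step to succeed.
import numpy as np

xmaps_np = {"dtag_{}".format(i): np.random.rand(8, 8, 8) for i in range(60)}
mixture, pca, clusters, probabilities = embed_mixture_variational(xmaps_np,
                                                                  n_components=4)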
def gmm_entropy(points, n_est=None, n_components=None):
    #from sklearn.mixture import GaussianMixture as GMM
    from sklearn.mixture import BayesianGaussianMixture as GMM
    n, d = points.shape

    # Default to the full set
    if n_est is None:
        n_est = n

    # reduce size of draw to n_est
    if n_est >= n:
        x = points
    else:
        x = points[permutation(n)[:n_est]]
        n = n_est

    if n_components is None:
        n_components = int(5*sqrt(d))

    ## Standardization doesn't seem to help
    ## Note: sigma may be zero
    #x, mu, sigma = standardize(x)   # if standardized
    predictor = GMM(n_components=n_components, covariance_type='full',
                    #verbose=True,
                    max_iter=1000)
    predictor.fit(x)
    eval_x, _ = predictor.sample(n_est)
    weight_x = predictor.score_samples(eval_x)
    H = -np.mean(weight_x)
    #with np.errstate(divide='ignore'):
    #    H = H + np.sum(np.log(sigma))   # if standardized
    dH = 0.

    ## cross-check against own calcs
    #alt = GaussianMixture(predictor.weights_, mu=predictor.means_, sigma=predictor.covariances_)
    #print("alt", H, alt.entropy())
    #print(np.vstack((weight_x[:10], alt.logpdf(eval_x[:10]))).T)

    return H / LN2, dH / LN2
class BayesianGMMOutlierDetector(OutlierMixin, BaseEstimator):
    """
    The GMMDetector trains a Bayesian Gaussian Mixture Model on a dataset X.
    Once a density is trained we can evaluate the likelihood scores to see if
    it is deemed likely. By giving a threshold this model might then label
    outliers if their likelihood score is too low.

    :param threshold: the limit at which the model thinks an outlier appears,
        must be between (0, 1)
    :param method: the method that the threshold will be applied to, possible
        values = [stddev, default=quantile]

    If you select method="quantile" then the threshold value represents the
    quantile value to start calling something an outlier.

    If you select method="stddev" then the threshold value represents the
    number of standard deviations before calling something an outlier.

    There are other settings too, these are best described in the
    BayesianGaussianMixture documentation found here:
    https://scikit-learn.org/stable/modules/generated/sklearn.mixture.BayesianGaussianMixture.html.
    """

    def __init__(
        self,
        threshold=0.99,
        method="quantile",
        n_components=1,
        covariance_type="full",
        tol=0.001,
        reg_covar=1e-06,
        max_iter=100,
        n_init=1,
        init_params="kmeans",
        weight_concentration_prior_type="dirichlet_process",
        weight_concentration_prior=None,
        mean_precision_prior=None,
        mean_prior=None,
        degrees_of_freedom_prior=None,
        covariance_prior=None,
        random_state=None,
        warm_start=False,
        verbose=0,
        verbose_interval=10,
    ):
        self.threshold = threshold
        self.method = method
        self.allowed_methods = ["quantile", "stddev"]
        self.n_components = n_components
        self.covariance_type = covariance_type
        self.tol = tol
        self.reg_covar = reg_covar
        self.max_iter = max_iter
        self.n_init = n_init
        self.init_params = init_params
        self.weight_concentration_prior_type = weight_concentration_prior_type
        self.weight_concentration_prior = weight_concentration_prior
        self.mean_precision_prior = mean_precision_prior
        self.mean_prior = mean_prior
        self.degrees_of_freedom_prior = degrees_of_freedom_prior
        self.covariance_prior = covariance_prior
        self.random_state = random_state
        self.warm_start = warm_start
        self.verbose = verbose
        self.verbose_interval = verbose_interval

    def fit(self, X: np.array, y=None) -> "BayesianGMMOutlierDetector":
        """
        Fit the model using X, y as training data.

        :param X: array-like, shape=(n_samples, n_columns) training data.
        :param y: ignored but kept in for pipeline support
        :return: Returns an instance of self.
        """

        # GMM sometimes throws an error if you don't do this
        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
        if len(X.shape) == 1:
            X = np.expand_dims(X, 1)

        if (self.method == "quantile") and (
            (self.threshold > 1) or (self.threshold < 0)
        ):
            raise ValueError(
                f"Threshold {self.threshold} with method {self.method} needs to be 0 < threshold < 1"
            )
        if (self.method == "stddev") and (self.threshold < 0):
            raise ValueError(
                f"Threshold {self.threshold} with method {self.method} needs to be threshold > 0"
            )
        if self.method not in self.allowed_methods:
            raise ValueError(
                f"Method not recognised. Method must be in {self.allowed_methods}"
            )

        self.gmm_ = BayesianGaussianMixture(
            n_components=self.n_components,
            covariance_type=self.covariance_type,
            tol=self.tol,
            reg_covar=self.reg_covar,
            max_iter=self.max_iter,
            n_init=self.n_init,
            init_params=self.init_params,
            weight_concentration_prior_type=self.weight_concentration_prior_type,
            weight_concentration_prior=self.weight_concentration_prior,
            mean_precision_prior=self.mean_precision_prior,
            mean_prior=self.mean_prior,
            degrees_of_freedom_prior=self.degrees_of_freedom_prior,
            covariance_prior=self.covariance_prior,
            random_state=self.random_state,
            warm_start=self.warm_start,
            verbose=self.verbose,
            verbose_interval=self.verbose_interval,
        )
        self.gmm_.fit(X)
        score_samples = self.gmm_.score_samples(X)

        if self.method == "quantile":
            self.likelihood_threshold_ = np.quantile(score_samples,
                                                     1 - self.threshold)

        if self.method == "stddev":
            density = gaussian_kde(score_samples)
            max_x_value = minimize_scalar(lambda x: -density(x)).x
            mean_likelihood = score_samples.mean()
            new_likelihoods = score_samples[score_samples < max_x_value]
            new_likelihoods_std = np.std(new_likelihoods - mean_likelihood)
            self.likelihood_threshold_ = mean_likelihood - (
                self.threshold * new_likelihoods_std)

        return self

    def score_samples(self, X):
        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
        check_is_fitted(self, ["gmm_", "likelihood_threshold_"])
        if len(X.shape) == 1:
            X = np.expand_dims(X, 1)
        return self.gmm_.score_samples(X) * -1

    def decision_function(self, X):
        # We subtract self.offset_ to make 0 be the threshold value for being an outlier:
        return self.score_samples(X) + self.likelihood_threshold_

    def predict(self, X):
        """
        Predict if a point is an outlier.

        :param X: array-like, shape=(n_samples, n_columns) training data.
        :return: array, shape=(n_samples,) the predicted data. 1 for inliers, -1 for outliers.
        """
        predictions = (self.decision_function(X) >= 0).astype(int)
        predictions[predictions == 1] = -1
        predictions[predictions == 0] = 1
        return predictions
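# Hedged usage sketch for BayesianGMMOutlierDetector: fit on mostly-inlier data
# and flag low-likelihood points. With method="quantile" and threshold=0.99, the
# points whose log-likelihood falls in the bottom 1% are labeled -1 (outlier).
import numpy as np

X = np.vstack([np.random.default_rng(0).normal(0, 1, (500, 2)),
               [[8.0, 8.0]]])  # one obvious outlier appended
det = BayesianGMMOutlierDetector(threshold=0.99, method="quantile",
                                 n_components=1).fit(X)
labels = det.predict(X)  # +1 for inliers, -1 for outliers
print(labels[-1])  # expected: -1 for the appended point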
def execute(self, namespace):
    from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
    from PYME.IO import MetaDataHandler

    points = namespace[self.input_points]
    X = np.stack([points['x'], points['y'], points['z']], axis=1)

    if self.mode == 'n':
        gmm = GaussianMixture(n_components=self.n,
                              covariance_type=self.covariance,
                              max_iter=self.max_iter,
                              init_params=self.init_params)
        predictions = gmm.fit_predict(X) + 1  # PYME labeling scheme
        log_prob = gmm.score_samples(X)
        if not gmm.converged_:
            logger.error('GMM fitting did not converge')
            predictions = np.zeros(len(points), int)
            log_prob = -np.inf * np.ones(len(points))

    elif self.mode == 'bic':
        n_components = range(1, self.n + 1)
        bic = np.zeros(len(n_components))
        for ind in range(len(n_components)):
            gmm = GaussianMixture(n_components=n_components[ind],
                                  covariance_type=self.covariance,
                                  max_iter=self.max_iter,
                                  init_params=self.init_params)
            gmm.fit(X)
            bic[ind] = gmm.bic(X)
            logger.debug('%d BIC: %f' % (n_components[ind], bic[ind]))
        best = n_components[np.argmin(bic)]
        if best == self.n or (self.n > 10 and best > 0.9 * self.n):
            logger.warning(
                'BIC optimization selected n components near n max')

        gmm = GaussianMixture(n_components=best,
                              covariance_type=self.covariance,
                              max_iter=self.max_iter,
                              init_params=self.init_params)
        predictions = gmm.fit_predict(X) + 1  # PYME labeling scheme
        log_prob = gmm.score_samples(X)
        if not gmm.converged_:
            logger.error('GMM fitting did not converge')
            predictions = np.zeros(len(points), int)
            log_prob = -np.inf * np.ones(len(points))

    elif self.mode == 'bayesian':
        bgm = BayesianGaussianMixture(n_components=self.n,
                                      covariance_type=self.covariance,
                                      max_iter=self.max_iter,
                                      init_params=self.init_params)
        predictions = bgm.fit_predict(X) + 1  # PYME labeling scheme
        log_prob = bgm.score_samples(X)
        if not bgm.converged_:
            logger.error('GMM fitting did not converge')
            predictions = np.zeros(len(points), int)
            log_prob = -np.inf * np.ones(len(points))

    out = tabular.MappingFilter(points)
    try:
        out.mdh = MetaDataHandler.DictMDHandler(points.mdh)
    except AttributeError:
        pass
    out.addColumn(self.label_key, predictions)
    out.addColumn(self.label_key + '_log_prob', log_prob)
    avg_log_prob = np.empty_like(log_prob)
    for label in np.unique(predictions):
        mask = label == predictions
        avg_log_prob[mask] = np.mean(log_prob[mask])
    out.addColumn(self.label_key + '_avg_log_prob', avg_log_prob)
    namespace[self.output_labeled] = out
class GeolocationInference(object):

    """
    Geolocation Inference Model (GMM)
    """

    def __init__(self,
                 vocabulary,
                 n_time_bins=10,
                 time_norm=None,
                 time_as_percentile=True,
                 time_model_kwargs={"Cs": 10, "solver": 'lbfgs', "n_jobs": -1},
                 mixture_kwargs={"n_components": 5, "covariance_type": "diag"},
                 random_state=42):
        """
        Geolocation inference model based on gaussian mixture model density estimates

        Args:
            vocabulary (object): Learned vocabulary
            n_time_bins (int): Number of temporal bins to use for training
            time_norm (str or None): How to normalize temporal features
            time_as_percentile (bool): If True, creates bins using percentile distribution
                                       over longitude instead of linear spacing.
            time_model_kwargs (dict): Arguments for logistic regression temporal estimator
            mixture_kwargs (dict): Parameters for the gaussian mixture model estimators
            random_state (int): Random seed for fitting estimators
        """
        ## Class Attributes/Parameters
        self._vocabulary = vocabulary
        self._n_time_bins = n_time_bins
        self._time_norm = time_norm
        self._time_as_percentile = time_as_percentile
        self._time_model_kwargs = time_model_kwargs
        self._mixture_kwargs = mixture_kwargs
        self._random_state = random_state
        ## Check Arguments
        if "random_state" not in self._mixture_kwargs:
            self._mixture_kwargs["random_state"] = self._random_state
        if "random_state" not in self._time_model_kwargs:
            self._time_model_kwargs["random_state"] = self._random_state

    def __repr__(self):
        """
        Human-readable string describing the class

        Args:
            None

        Returns:
            desc (str): Description of class
        """
        return "GeolocationInference()"

    def _create_coordinate_grid(self, cell_size=1):
        """
        Create a grid of lon/lat points based on training boundaries

        Args:
            cell_size (float): Degrees contained within each grid cell

        Returns:
            coordinate_grid (array): Coordinate grid (land-mass only)
        """
        ## Boundaries
        xmin = int(self._coord_bounds[0][0] - 1)
        xmax = int(self._coord_bounds[0][1] + 1)
        ymin = int(self._coord_bounds[1][0] - 1)
        ymax = int(self._coord_bounds[1][1] + 1)
        ## Coordinates
        lon_coord = [xmin]
        lat_coord = [ymin]
        while lon_coord[-1] < xmax:
            lon_coord.append(min(lon_coord[-1] + cell_size, xmax))
        while lat_coord[-1] < ymax:
            lat_coord.append(min(lat_coord[-1] + cell_size, ymax))
        coordinate_grid = []
        for x in lon_coord:
            for y in lat_coord:
                if globe.is_land(y, x):
                    coordinate_grid.append([x, y])
        coordinate_grid = np.array(coordinate_grid)
        return coordinate_grid

    def _fit_mixture(self, i, X, y):
        """
        Fit a single mixture model, update cache in place

        Args:
            i (int): Index of the feature being trained on
            X (csr matrix): Input feature matrix
            y (2d-array): Lon/Lat coordinates for training

        Returns:
            None
        """
        ## Construct Training Sample
        S = []
        nonzero = np.nonzero(X[:, i])[0]
        for n in nonzero:
            x_s = int(X[n, i])
            y_s = y[n]
            S.extend([y_s] * x_s)
        S = np.vstack(S)
        ## Initialize Model
        n_components = min(self._mixture_kwargs["n_components"], len(nonzero))
        args = self._mixture_kwargs.copy()
        args["n_components"] = n_components
        model = BayesianGaussianMixture(**args)
        ## Fit Model
        model = model.fit(S)
        ## Cache Model
        self._models[i] = model

    def fit(self, X, y):
        """
        Args:
            X (csr matrix): Sparse feature matrix
            y (2d-array): Training coordinates

        Returns:
            self
        """
        ## Coordinate Grid Boundaries
        self._coord_bounds = [[y[:, 0].min(), y[:, 0].max()],
                              [y[:, 1].min(), y[:, 1].max()]]
        ## Initialize Mixture Model Cache
        m = len(self._vocabulary._feature_inds["text"]) + \
            len(self._vocabulary._feature_inds["subreddit"])
        self._models = [None for _ in range(m)]
        ## Feature Probabilities
        self._pu = np.array((X > 0).sum(axis=0) / X.shape[0])[0]
        ## Fit Prior
        self.prior = BayesianGaussianMixture(**self._mixture_kwargs)
        self.prior.fit(y)
        ## Fit Mixture Models
        for i in tqdm(range(m), desc="GMM Fit", total=m, file=sys.stdout):
            self._fit_mixture(i, X, y)
        ## Temporal Model
        if self._vocabulary._use_time:
            ## Isolate Time Data
            X_time = X[:, self._vocabulary._feature_inds["time"]]
            ## Normalize
            if self._time_norm is not None:
                X_time = normalize(X_time, self._time_norm, axis=1, copy=True)
            ## Create Time Bins
            y_lon = y[:, 0]
            if self._time_as_percentile:
                percentiles = np.linspace(0, 100, self._n_time_bins + 1)[:-1]
                self._time_bins = np.percentile(y_lon, percentiles)
            else:
                lon_bounds = int(y_lon.min() - 1), int(y_lon.max() + 1)
                self._time_bins = np.linspace(lon_bounds[0],
                                              lon_bounds[1],
                                              self._n_time_bins + 1)
            y_lon = np.array(list(map(lambda v: assign_value_to_bin(v, self._time_bins), y_lon)))
            ## Train Model
            self._time_model_kwargs["scoring"] = f1_score
            self._time_model_kwargs["max_iter"] = 1000
            self.time_classifier = LogisticRegressionCV(**self._time_model_kwargs)
            self.time_classifier.fit(X_time, y_lon)
        return self

    def predict_proba(self, X, coordinates=None):
        """
        Args:
            X (csr matrix): Sparse feature matrix
            coordinates (2d-array): Coordinates to make predictions on. If None,
                                    creates grid around the world (land-masses only)

        Returns:
            coordinates (2d-array): Coordinate array associated with prediction columns
            P (2d-array): Predicted probabilities per coordinate
        """
        ## Coordinate Grid
        if coordinates is None:
            coordinates = self._create_coordinate_grid()
        ## Transform X Shape
        X = X.toarray()
        ## Compute Prior for Elements With Missing
        prior = np.exp(self.prior.score_samples(coordinates))
        ## Initialize Probability Array
        P = np.zeros((X.shape[0], len(coordinates)))
        ## Cycle Through Models
        for ind, model in tqdm(enumerate(self._models),
                               total=len(self._models),
                               file=sys.stdout,
                               desc="GMM Posterior"):
            if model is None:
                continue
            u = X[:, [ind]]
            nonzero = np.nonzero(u)[0]
            if len(nonzero) == 0:
                continue
            p_c_u = np.exp(model.score_samples(coordinates)).reshape(1, -1)
            P[nonzero] += np.matmul(u[nonzero], p_c_u * self._pu[ind])
        ## Default to Prior for Users Without Features
        P[np.all(P == 0, axis=1)] += prior
        ## Time Adjustment
        if self._vocabulary._use_time:
            ## Isolate Time Data
            X_time = X[:, self._vocabulary._feature_inds["time"]]
            ## Normalize
            if self._time_norm is not None:
                X_time = normalize(X_time, self._time_norm, axis=1, copy=True)
            ## Make Probability Predictions
            y_lon_pred_prob = self.time_classifier.predict_proba(X_time)
            ## Get Time Bins
            time_bins_classifier = self._time_bins[self.time_classifier.classes_]
            coordinate_lon_bins = np.array(list(map(lambda v: assign_value_to_bin(v, self._time_bins), coordinates[:, 0])))
            ## Update Probabilities
            P_time = y_lon_pred_prob[:, coordinate_lon_bins]
            P_time = normalize(P_time, axis=1, norm="l1")
            P = np.multiply(P, P_time)
        ## Normalize Posterior
        P = np.divide(P,
                      np.nansum(P, axis=1).reshape(-1, 1),
                      out=np.ones_like(P) * np.nan,
                      where=np.nansum(P, axis=1).reshape(-1, 1) > 0)
        return coordinates, P

    def predict(self, X, coordinates=None):
        """
        Args:
            X (csr matrix): Feature Matrix
            coordinates (2d-array or None): Coordinates to make predictions over

        Returns:
            y_pred (2d-array): Coordinate Predictions (argmax)
        """
        ## Get Probability Prediction
        coordinates, P = self.predict_proba(X, coordinates)
        ## Get Argmax over Coordinates
        y_pred = coordinates[P.argmax(axis=1)]
        return y_pred

    def plot_model_posterior(self, feature, coordinates=None):
        """
        Plot the probability distribution over coordinates for a given feature name

        Args:
            feature (str): Name of the feature to plot distribution for
            coordinates (2d-array or None): If desired, a subset of coordinates to use
                                            for plotting a distribution. By default,
                                            creates a coordinate grid

        Returns:
            fig, ax (matplotlib objects): Figure object
        """
        ## Check for Feature/Model
        if feature not in self._vocabulary.feature_to_idx:
            raise KeyError(f"Feature=`{feature}` not found")
        if self._models[self._vocabulary.feature_to_idx[feature]] is None:
            raise ValueError(f"Model for Feature={feature} is null")
        ## Coordinate Grid
        if coordinates is None:
            coordinates = self._create_coordinate_grid(cell_size=2)
        ## Make Posterior Predictions
        m = self._models[self._vocabulary.feature_to_idx[feature]]
        posterior = np.exp(m.score_samples(coordinates))
        ## Plot
        fig, ax = plt.subplots(figsize=(10, 5.8))
        s = ax.scatter(coordinates[:, 0],
                       coordinates[:, 1],
                       c=posterior,
                       cmap=plt.cm.coolwarm,
                       alpha=.8,
                       s=5)
        cbar = fig.colorbar(s)
        ax.set_xlabel("Longitude")
        ax.set_ylabel("Latitude")
        ax.set_title(feature, loc="left")
        fig.tight_layout()
        return fig, ax
def colour(img, scale=1.0, samples=10000):
    '''Model the distribution of colours in an image.

    The method models the distribution of the values in the chroma channels of
    an image after being converted from RGB to HSV. This decouples the luma
    (intensity) values from the chroma (colour) information, making it easier
    to visualize how the colours themselves appear. The resulting
    visualization is the same size as the original image.

    Parameters
    ----------
    img : numpy.ndarray
        input image
    scale : float
        image scaling factor
    samples : int
        number of samples to draw when generating the density estimate

    Returns
    -------
    numpy.ndarray
        a new image, same dimensions as the input, visualizing the colour
        distribution

    Raises
    ------
    ValueError
        if the input image is not an RGB image
    '''
    if img.ndim != 3:
        raise ValueError('Require RGB image to compute colour distribution.')

    img = skimage.transform.rescale(img, 1.0 / scale, anti_aliasing=True,
                                    mode='constant', multichannel=True)
    img = skimage.color.rgb2hsv(img)
    height, width = img.shape[0:2]

    # Extract the colour vectors and sample from them.
    ind = generate_samples(width, height, samples)
    X = np.squeeze(img[ind[1, :], ind[0, :], 0:2])

    # Perform a polar to cartesian coordinate conversion (will make the
    # visualization easier).
    mag = X[:, 1]
    ang = 2 * np.pi * X[:, 0]

    X[:, 0] = mag * np.cos(ang)
    X[:, 1] = mag * np.sin(ang)

    # Perform a density estimation using a GMM.
    gmm = BayesianGaussianMixture(
        n_components=25,
        weight_concentration_prior_type='dirichlet_distribution',
        weight_concentration_prior=1e-3)
    gmm.fit(X)

    # Generate the output array.
    x, y = np.meshgrid(np.linspace(-1, 1, width), np.linspace(-1, 1, height))
    X = np.c_[x.flatten(), y.flatten()]
    scores = np.exp(gmm.score_samples(X))
    max_score = np.max(scores)

    # Apply a gamma correction to make the image look a bit nicer.
    val = np.reshape(scores, (height, width)) / max_score
    val = skimage.exposure.adjust_gamma(val, gamma=0.3)

    # Convert back from HSV to RGB. The saturation needs to be clamped so that
    # it doesn't produce invalid values during the HSV->RGB conversion.
    mag = x**2 + y**2
    sat = np.sqrt(mag)
    sat[sat > 1] = 1

    # The hue also needs to be adjusted since atan2() returns a value between
    # -pi and pi, but the hue needs to be between 0 and 1.
    hue = np.arctan2(y, x)
    hue[hue < 0] = hue[hue < 0] + 2 * np.pi
    hue /= 2 * np.pi

    output = np.dstack((hue, sat, val))
    output = skimage.color.hsv2rgb(output)
    output = skimage.transform.rescale(output, scale, anti_aliasing=True,
                                       mode='constant', multichannel=True)

    return output
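# Possible usage of colour() above with a sample image from scikit-image. This
# assumes the generate_samples() helper from the same module is in scope, along
# with the numpy and skimage imports the function relies on.
import matplotlib.pyplot as plt
import skimage.data

img = skimage.data.astronaut() / 255.0  # float RGB image
vis = colour(img, scale=2.0, samples=5000)
plt.imshow(vis)
plt.show()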
def gmm_entropy(points, n_est=None, n_components=None):
    r"""
    Use sklearn.mixture.BayesianGaussianMixture to estimate entropy.

    *points* are the data points in the sample.

    *n_est* are the number of points to use in the estimation; default is
    10,000 points, or 0 for all the points.

    *n_components* are the number of Gaussians in the mixture. Default is
    $5 \sqrt{d}$ where $d$ is the number of dimensions.

    Returns estimated entropy and uncertainty in the estimate.

    This method uses BayesianGaussianMixture from scikit-learn to build a
    model of the point distribution, then uses Monte Carlo sampling to
    determine the entropy of that distribution. The entropy uncertainty is
    computed from the variance in the MC sample scaled by the number of
    samples. This does not incorporate any uncertainty in the sampling that
    generated the point distribution or the uncertainty in the GMM used to
    model that distribution.
    """
    #from sklearn.mixture import GaussianMixture as GMM
    from sklearn.mixture import BayesianGaussianMixture as GMM
    n, d = points.shape

    # Default to the full set
    if n_est is None:
        n_est = 10000
    elif n_est == 0:
        n_est = n

    # reduce size of draw to n_est
    if n_est >= n:
        x = points
        n_est = n
    else:
        x = points[permutation(n)[:n_est]]
        n = n_est

    if n_components is None:
        n_components = int(5 * sqrt(d))

    ## Standardization doesn't seem to help
    ## Note: sigma may be zero
    #x, mu, sigma = standardize(x)   # if standardized
    predictor = GMM(
        n_components=n_components,
        covariance_type='full',
        #verbose=True,
        max_iter=1000)
    predictor.fit(x)
    eval_x, _ = predictor.sample(n_est)
    weight_x = predictor.score_samples(eval_x)
    H = -np.mean(weight_x)
    #with np.errstate(divide='ignore'):
    #    H = H + np.sum(np.log(sigma))   # if standardized
    dH = np.std(weight_x, ddof=1) / sqrt(n)

    ## cross-check against own calcs
    #alt = GaussianMixture(predictor.weights_, mu=predictor.means_, sigma=predictor.covariances_)
    #print("alt", H, alt.entropy())
    #print(np.vstack((weight_x[:10], alt.logpdf(eval_x[:10]))).T)

    return H / LN2, dH / LN2
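# Sanity check for gmm_entropy() above: the differential entropy of a
# d-dimensional standard normal is 0.5 * d * log2(2 * pi * e) bits, about 2.05
# bits per dimension, so the estimate for d=2 should land near 4.09 bits. This
# assumes np, permutation, sqrt, and LN2 are in scope as in the original module.
import numpy as np
from numpy.random import permutation
from numpy import sqrt

LN2 = np.log(2)
points = np.random.default_rng(0).normal(size=(5000, 2))
H, dH = gmm_entropy(points, n_est=2000)
print(H, 0.5 * 2 * np.log2(2 * np.pi * np.e))  # estimate vs analytic value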
for l, s, w in zip(mu, sigma, weights):
    y_axis += ss.norm.pdf(x_axis, loc=l, scale=s) * w

K = 11

y_axis_gmm = np.zeros_like(x_axis)
gmm = GaussianMixture(n_components=K)
gmm.fit(samples.reshape(-1, 1))
for l, s, w in zip(gmm.means_.flatten(), gmm.covariances_.flatten(), gmm.weights_):
    y_axis_gmm += ss.norm.pdf(x_axis, loc=l, scale=np.sqrt(s)) * w

y_axis_bgmm = np.zeros_like(x_axis)
bgmm = BayesianGaussianMixture(n_components=K)
bgmm.fit(samples.reshape(-1, 1))
y_axis_bgmm = np.exp(bgmm.score_samples(x_axis.reshape(-1, 1)))

# plt.plot(x_axis, y_axis, label='True density')
# plt.hist(samples, density=True, bins="fd", alpha=1, label='Samples')
# plt.plot(samples, np.zeros(len(samples)), marker='.', label='Samples')
# plt.plot(x_axis, y_axis_gmm, label='ML estimate')
# plt.plot(x_axis, y_axis_bgmm, label='Bayesian estimate')
# plt.xlabel("x")
# plt.ylabel("f(x)")
# plt.xlim([-2, 10])
# plt.ylim([0, 0.32])
# plt.legend()
# plt.savefig('figures/bayes_soluciones')
# plt.savefig('figures/ml_problemas')
# plt.show()
model3.fit(X_std)

# Inspect the fitted model
pprint(vars(model3))

# 3. Visualization for each number of clusters -------------------------------------------------------------------------

# Set the plot size
plt.figure(figsize=(8, 4))

# VBGMM clustering over the scatter plot of color intensity and proline
x = np.linspace(X_std[:, 0].min(), X_std[:, 0].max(), 100)
y = np.linspace(X_std[:, 1].min(), X_std[:, 1].max(), 100)
X, Y = np.meshgrid(x, y)
XX = np.array([X.ravel(), Y.ravel()]).T
Z = -model3.score_samples(XX)  # negative log-likelihood over the grid
Z = Z.reshape(X.shape)

plt.contour(X, Y, Z, levels=[0.5, 1, 2, 3, 4, 5])  # contour plot
plt.scatter(X_std[:, 0], X_std[:, 1], c=model3.predict(X_std))
plt.title('VBGMM(covariance_type=full)')
plt.show()

# 4. Predicting cluster assignments -------------------------------------------------------------------------

# Predictions
model3.predict(X_std)

# Mixture coefficients --- these are weights, so they sum to 1
model3.weights_