Example #1
    def predict(self, X_in, y_in, X_te_in):
        X_tr = X_in.copy()
        Y_tr = y_in.copy()
        X_oos = X_te_in.copy()

        unique_class_vals = Y_tr.unique()

        best_params = self.best_params

        gmm_dict = {}
        for class_val in unique_class_vals:
            # Fit one mixture per class; fit() ignores y, so only the
            # class-conditional rows of X are needed.
            params = best_params[class_val]
            gmm = BayesianGaussianMixture(
                n_components=params['n_components'],
                covariance_type=params['covariance_type'],
                weight_concentration_prior=params['weight_concentration_prior'],
                reg_covar=1).fit(X_tr[Y_tr == class_val])

            # Per-class likelihood of each out-of-sample point.
            gmm_dict[class_val] = np.exp(gmm.score_samples(X_oos))

        gmm_df = pd.DataFrame.from_dict(gmm_dict)
        res_df = gmm_df.idxmax(axis=1)

        self.y_pred = res_df.values

        return res_df.values
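A minimal standalone sketch of the same technique (class-conditional density classification), assuming synthetic data and illustrative hyperparameters rather than the class's tuned best_params:

import numpy as np
import pandas as pd
from sklearn.mixture import BayesianGaussianMixture

rng = np.random.RandomState(0)
X_tr = np.vstack([rng.normal(0, 1, (100, 2)), rng.normal(4, 1, (100, 2))])
y_tr = pd.Series([0] * 100 + [1] * 100)
X_te = rng.normal(2, 2, (10, 2))

dens = {}
for c in y_tr.unique():
    # one mixture per class, evaluated as a density on the test points
    gmm = BayesianGaussianMixture(n_components=2,
                                  random_state=0).fit(X_tr[(y_tr == c).values])
    dens[c] = np.exp(gmm.score_samples(X_te))

y_pred = pd.DataFrame(dens).idxmax(axis=1).values  # densest class wins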
Example #2
    def run(self):
        args = self.args
        uniblock_path = self._get_uniblock_path()
        feature = load(os.path.join(uniblock_path, 'feature.dump'))
        X = feature.get_feature_matrix(args.corpus_path)
        legal, mask = self._infer_nonzero(X)
        dump(legal, os.path.join(uniblock_path, 'legal.dump'))
        dump(mask, os.path.join(uniblock_path, 'mask.dump'))
        X = X[:, legal]
        bgm = BayesianGaussianMixture(
            n_components=args.k,
            covariance_type=args.cov,
            max_iter=200,
            random_state=0,
            verbose=2 if args.verbose else 0,
            verbose_interval=1,
            tol=args.tol,
            n_init=args.n_init,
            init_params=args.init_params,
        )
        bgm.fit(X)
        dump(bgm, os.path.join(uniblock_path, 'bgm.dump'))
        scores = bgm.score_samples(X)
        self._log_scores(scores, args.corpus_path)
        self._log_stats(scores, args.corpus_path)
Example #3
    def __init__(self, **kwargs):
        super().__init__(data_set=kwargs.pop('data_set', None), **kwargs)

        self.clf_ = kwargs.get('clf', None)
        if self.clf_ is None:
            raise ValueError("missing required keyword-only argument 'clf'")
        if not callable(getattr(self.clf_, 'fit', None)) or not callable(
                getattr(self.clf_, 'predict_proba', None)):
            raise TypeError(
                "'clf' must be an instance with the methods 'fit' and 'predict_proba'"
            )

        n_components = int(
            kwargs.pop('n_components', np.min([20, len(self.data_set_)])))
        if n_components < 1 or n_components > len(self.data_set_):
            raise ValueError(
                "'n_components' must be an integer in the interval [1, n_samples]"
            )

        # fit Gaussian mixture model for pre-clustering
        gmm = BayesianGaussianMixture(n_components=n_components,
                                      covariance_type='spherical',
                                      max_iter=1000,
                                      random_state=self.random_state_)
        gmm.fit(self.data_set_.X_)
        self.y_cluster_ = gmm.predict(self.data_set_.X_)
        self.p_x_ = np.exp(gmm.score_samples(self.data_set_.X_))
Example #4
    def train_gmm(self, samples):
        co_type = 'tied'
        gmm = BayesianGaussianMixture(n_components=2, covariance_type=co_type, n_init=10, random_state=0, max_iter=500,
                                      verbose=1)

        samples = cv2.cvtColor(samples.reshape(1, -1, 3), cv2.COLOR_BGR2YCrCb)
        samples = samples.reshape(-1, 3)
        gmm.fit(samples)

        # score_samples returns log-densities, so this is the max log-likelihood
        max_prob = np.max(gmm.score_samples(samples))
        return gmm, max_prob
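A standalone sketch of train_gmm's core on synthetic pixels, assuming OpenCV (cv2) is available:

import cv2
import numpy as np
from sklearn.mixture import BayesianGaussianMixture

bgr = np.random.RandomState(0).randint(0, 256, (500, 3), dtype=np.uint8)
# convert to YCrCb, as above, then fit a 2-component tied-covariance mixture
ycrcb = cv2.cvtColor(bgr.reshape(1, -1, 3), cv2.COLOR_BGR2YCrCb).reshape(-1, 3)
gmm = BayesianGaussianMixture(n_components=2, covariance_type='tied',
                              random_state=0).fit(ycrcb)
max_log_prob = np.max(gmm.score_samples(ycrcb))  # best in-sample log-density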
Example #5
def bayesianGMM(points, allPoints, numComponents):
    if len(allPoints) < numComponents:
        numComponents = len(allPoints)
    clf = BayesianGaussianMixture(
        weight_concentration_prior_type="dirichlet_process",
        weight_concentration_prior=100000,
        n_components=2 * numComponents,
        reg_covar=0,  # no covariance regularization; may fail on degenerate data
        init_params='random',
        max_iter=1500,
        mean_precision_prior=.8,
        random_state=2)
    clf.fit(allPoints)
    return np.exp(clf.score_samples(points))
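A hedged usage sketch for bayesianGMM, with synthetic 2-D points standing in for the real inputs:

import numpy as np

rng = np.random.RandomState(0)
allPoints = rng.normal(size=(200, 2))  # reference set the mixture is fit to
points = rng.normal(size=(5, 2))       # query points to score
densities = bayesianGMM(points, allPoints, numComponents=3)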
Example #6
    def fit(self, X, y):
        """
        y must be composed of 0 and 1
        """
        self.gmms_ = {0: [], 1: []}
        ll_list = []
        for n in self.n_clusters_list:
            gmm0 = BayesianGaussianMixture(n_components=n,
                                           covariance_type='full',
                                           random_state=self.random_state).fit(
                                               X[y == 0])
            gmm1 = BayesianGaussianMixture(n_components=n,
                                           covariance_type='full',
                                           random_state=self.random_state).fit(
                                               X[y == 1])
            self.gmms_[0].append(gmm0)
            self.gmms_[1].append(gmm1)
            ll = gmm1.score_samples(X) - gmm0.score_samples(X)
            ll_list.append(ll)

        ll_arr = np.stack(ll_list, axis=1)
        self.comb_ = LogisticRegression(solver='lbfgs').fit(ll_arr, y)
        return self
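The fit() above stacks per-class log-likelihood ratios from several mixture sizes and combines them with logistic regression. A self-contained sketch of that idea, with illustrative data and n_clusters_list=(1, 2):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.mixture import BayesianGaussianMixture

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 1, (50, 2)), rng.normal(3, 1, (50, 2))])
y = np.array([0] * 50 + [1] * 50)

ll_list = []
for n in (1, 2):
    g0 = BayesianGaussianMixture(n_components=n, random_state=0).fit(X[y == 0])
    g1 = BayesianGaussianMixture(n_components=n, random_state=0).fit(X[y == 1])
    ll_list.append(g1.score_samples(X) - g0.score_samples(X))  # log-LR feature

comb = LogisticRegression(solver='lbfgs').fit(np.stack(ll_list, axis=1), y)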
Example #7
def denoise(times, waveforms):
    threshold = np.log(0.001)

    pcaed = PCA(n_components=2).fit_transform(waveforms)
    pcaed = scipy.stats.zscore(pcaed, axis=0)
    mix = BayesianGaussianMixture(n_components=2).fit(pcaed)
    logprob = mix.score_samples(pcaed)
    times = times[logprob > threshold]
    waveforms = waveforms[logprob > threshold]

    denoised = denoising_sort(times, waveforms)
    # denoised = denoised.select([isi(n) < 0.05 for n in denoised.nodes])

    denoised = cluster_step(denoised,
                            dpoints=200,
                            n_components=10,
                            min_cluster_size=3,
                            mode="kmeans")
    # denoised = denoised.select([isi(n) < 0.05 for n in denoised.nodes])
    return denoised
Example #8
def embed_mixture_variational(xmaps_np,
                              n_components,
                              ):
    sample_by_feature = np.vstack([np_map.flatten()
                                   for dtag, np_map in xmaps_np.items()])

    # mixture = BayesianGaussianMixture()

    begin = time.time()

    pca = PCA(n_components=50)
    sample_by_feature_pca = pca.fit_transform(sample_by_feature)

    print("shape is: {}".format(sample_by_feature_pca.shape))

    mixture = BayesianGaussianMixture(n_components,
                                      covariance_type="spherical",
                                      verbose=10,
                                      verbose_interval=2,
                                      )
    mixture.fit(sample_by_feature_pca)

    finish = time.time()

    print(mixture)

    print("Finished in {}".format(finish - begin))

    print(mixture.predict(sample_by_feature_pca))

    print(mixture.weights_)

    clusters = mixture.predict(sample_by_feature_pca)

    probabilities = mixture.score_samples(sample_by_feature_pca)

    return mixture, pca, clusters, probabilities
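A hedged usage sketch for embed_mixture_variational, with small random arrays standing in for the real xmaps_np density maps (the module is assumed to import time, PCA and BayesianGaussianMixture):

import numpy as np

xmaps_np = {f"dtag_{i}": np.random.RandomState(i).rand(8, 8, 8)
            for i in range(60)}  # 60 samples of 512 flattened voxels each
mixture, pca, clusters, probabilities = embed_mixture_variational(xmaps_np, 4)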
Example #9
File: entropy.py Project: bumps/bumps
def gmm_entropy(points, n_est=None, n_components=None):
    #from sklearn.mixture import GaussianMixture as GMM
    from sklearn.mixture import BayesianGaussianMixture as GMM
    n, d = points.shape

    # Default to the full set
    if n_est is None:
        n_est = n

    # reduce size of draw to n_est
    if n_est >= n:
        x = points
    else:
        x = points[permutation(n)[:n_est]]
        n = n_est

    if n_components is None:
        n_components = int(5*sqrt(d))

    ## Standardization doesn't seem to help
    ## Note: sigma may be zero
    #x, mu, sigma = standardize(x)   # if standardized
    predictor = GMM(n_components=n_components, covariance_type='full',
                    #verbose=True,
                    max_iter=1000)
    predictor.fit(x)
    eval_x, _ = predictor.sample(n_est)
    weight_x = predictor.score_samples(eval_x)
    H = -np.mean(weight_x)
    #with np.errstate(divide='ignore'): H = H + np.sum(np.log(sigma))   # if standardized
    dH = 0.
    ## cross-check against own calcs
    #alt = GaussianMixture(predictor.weights_, mu=predictor.means_, sigma=predictor.covariances_)
    #print("alt", H, alt.entropy())
    #print(np.vstack((weight_x[:10], alt.logpdf(eval_x[:10]))).T)
    return H / LN2, dH / LN2
Example #10
class BayesianGMMOutlierDetector(OutlierMixin, BaseEstimator):
    """
    The BayesianGMMOutlierDetector trains a Bayesian Gaussian Mixture Model on a
    dataset X. Once a density is trained, we can evaluate the likelihood score of
    each sample to see how likely it is under the model. Given a threshold, the
    model labels samples whose likelihood score is too low as outliers.

    :param threshold: the cut-off below which a sample is considered an outlier;
                      must be in (0, 1) for method="quantile", and a non-negative
                      number of standard deviations for method="stddev"
    :param method: how the threshold is applied, one of ["quantile" (default), "stddev"]

    If you select method="quantile" then the threshold value represents the
    quantile value to start calling something an outlier.

    If you select method="stddev" then the threshold value represents the
    numbers of standard deviations before calling something an outlier.

    There are other settings too, these are best described in the BayesianGaussianMixture
    documentation found here:

    https://scikit-learn.org/stable/modules/generated/sklearn.mixture.BayesianGaussianMixture.html.
    """
    def __init__(
        self,
        threshold=0.99,
        method="quantile",
        n_components=1,
        covariance_type="full",
        tol=0.001,
        reg_covar=1e-06,
        max_iter=100,
        n_init=1,
        init_params="kmeans",
        weight_concentration_prior_type="dirichlet_process",
        weight_concentration_prior=None,
        mean_precision_prior=None,
        mean_prior=None,
        degrees_of_freedom_prior=None,
        covariance_prior=None,
        random_state=None,
        warm_start=False,
        verbose=0,
        verbose_interval=10,
    ):
        self.threshold = threshold
        self.method = method
        self.allowed_methods = ["quantile", "stddev"]

        self.n_components = n_components
        self.covariance_type = covariance_type
        self.tol = tol
        self.reg_covar = reg_covar
        self.max_iter = max_iter
        self.n_init = n_init
        self.init_params = init_params
        self.weight_concentration_prior_type = weight_concentration_prior_type
        self.weight_concentration_prior = weight_concentration_prior
        self.mean_precision_prior = mean_precision_prior
        self.mean_prior = mean_prior
        self.degrees_of_freedom_prior = degrees_of_freedom_prior
        self.covariance_prior = covariance_prior
        self.random_state = random_state
        self.warm_start = warm_start
        self.verbose = verbose
        self.verbose_interval = verbose_interval

    def fit(self, X: np.array, y=None) -> "BayesianGMMOutlierDetector":
        """
        Fit the model using X, y as training data.

        :param X: array-like, shape=(n_samples, n_columns) training data.
        :param y: ignored but kept in for pipeline support
        :return: Returns an instance of self.
        """

        # GMM sometimes throws an error if you don't do this
        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
        if len(X.shape) == 1:
            X = np.expand_dims(X, 1)

        if (self.method == "quantile") and ((self.threshold > 1) or
                                            (self.threshold < 0)):
            raise ValueError(
                f"Threshold {self.threshold} with method {self.method} needs to be 0 < threshold < 1"
            )
        if (self.method == "stddev") and (self.threshold < 0):
            raise ValueError(
                f"Threshold {self.threshold} with method {self.method} needs to be 0 < threshold "
            )
        if self.method not in self.allowed_methods:
            raise ValueError(
                f"Method not recognised. Method must be in {self.allowed_methods}"
            )

        self.gmm_ = BayesianGaussianMixture(
            n_components=self.n_components,
            covariance_type=self.covariance_type,
            tol=self.tol,
            reg_covar=self.reg_covar,
            max_iter=self.max_iter,
            n_init=self.n_init,
            init_params=self.init_params,
            weight_concentration_prior_type=self.weight_concentration_prior_type,
            weight_concentration_prior=self.weight_concentration_prior,
            mean_precision_prior=self.mean_precision_prior,
            mean_prior=self.mean_prior,
            degrees_of_freedom_prior=self.degrees_of_freedom_prior,
            covariance_prior=self.covariance_prior,
            random_state=self.random_state,
            warm_start=self.warm_start,
            verbose=self.verbose,
            verbose_interval=self.verbose_interval,
        )
        self.gmm_.fit(X)
        score_samples = self.gmm_.score_samples(X)

        if self.method == "quantile":
            self.likelihood_threshold_ = np.quantile(score_samples,
                                                     1 - self.threshold)

        if self.method == "stddev":
            density = gaussian_kde(score_samples)
            max_x_value = minimize_scalar(lambda x: -density(x)).x
            mean_likelihood = score_samples.mean()
            new_likelihoods = score_samples[score_samples < max_x_value]
            new_likelihoods_std = np.std(new_likelihoods - mean_likelihood)
            self.likelihood_threshold_ = mean_likelihood - (
                self.threshold * new_likelihoods_std)

        return self

    def score_samples(self, X):
        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
        check_is_fitted(self, ["gmm_", "likelihood_threshold_"])
        if len(X.shape) == 1:
            X = np.expand_dims(X, 1)

        return -self.gmm_.score_samples(X)  # flip sign: higher score = more outlying

    def decision_function(self, X):
        # Add the likelihood threshold so that 0 becomes the outlier cut-off:
        return self.score_samples(X) + self.likelihood_threshold_

    def predict(self, X):
        """
        Predict if a point is an outlier.
        :param X: array-like, shape=(n_samples, n_columns) data to predict.
        :return: array, shape=(n_samples,) the predicted labels. 1 for inliers, -1 for outliers.
        """
        predictions = (self.decision_function(X) >= 0).astype(int)
        predictions[predictions == 1] = -1
        predictions[predictions == 0] = 1
        return predictions
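A hedged usage sketch for the detector, assuming the module-level imports the class relies on (check_array, check_is_fitted, gaussian_kde, minimize_scalar, FLOAT_DTYPES) are satisfied:

import numpy as np

X = np.random.RandomState(42).normal(size=(500, 2))
det = BayesianGMMOutlierDetector(threshold=0.95, method="quantile",
                                 n_components=2, random_state=42).fit(X)
labels = det.predict(X)       # 1 = inlier, -1 = outlier
print((labels == -1).mean())  # ~5% flagged at the 0.95 quantile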
Example #11
    def execute(self, namespace):
        from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
        from PYME.IO import MetaDataHandler

        points = namespace[self.input_points]
        X = np.stack([points['x'], points['y'], points['z']], axis=1)

        if self.mode == 'n':
            gmm = GaussianMixture(n_components=self.n,
                                  covariance_type=self.covariance,
                                  max_iter=self.max_iter,
                                  init_params=self.init_params)
            predictions = gmm.fit_predict(X) + 1  # PYME labeling scheme
            log_prob = gmm.score_samples(X)
            if not gmm.converged_:
                logger.error('GMM fitting did not converge')
                predictions = np.zeros(len(points), int)
                log_prob = -np.inf * np.ones(len(points))

        elif self.mode == 'bic':
            n_components = range(1, self.n + 1)
            bic = np.zeros(len(n_components))
            for ind in range(len(n_components)):
                gmm = GaussianMixture(n_components=n_components[ind],
                                      covariance_type=self.covariance,
                                      max_iter=self.max_iter,
                                      init_params=self.init_params)
                gmm.fit(X)
                bic[ind] = gmm.bic(X)
                logger.debug('%d BIC: %f' % (n_components[ind], bic[ind]))

            best = n_components[np.argmin(bic)]
            if best == self.n or (self.n > 10 and best > 0.9 * self.n):
                logger.warning(
                    'BIC optimization selected n components near n max')

            gmm = GaussianMixture(n_components=best,
                                  covariance_type=self.covariance,
                                  max_iter=self.max_iter,
                                  init_params=self.init_params)
            predictions = gmm.fit_predict(X) + 1  # PYME labeling scheme
            log_prob = gmm.score_samples(X)
            if not gmm.converged_:
                logger.error('GMM fitting did not converge')
                predictions = np.zeros(len(points), int)
                log_prob = -np.inf * np.ones(len(points))

        elif self.mode == 'bayesian':
            bgm = BayesianGaussianMixture(n_components=self.n,
                                          covariance_type=self.covariance,
                                          max_iter=self.max_iter,
                                          init_params=self.init_params)
            predictions = bgm.fit_predict(X) + 1  # PYME labeling scheme
            log_prob = bgm.score_samples(X)
            if not bgm.converged_:
                logger.error('GMM fitting did not converge')
                predictions = np.zeros(len(points), int)
                log_prob = -np.inf * np.ones(len(points))

        out = tabular.MappingFilter(points)
        try:
            out.mdh = MetaDataHandler.DictMDHandler(points.mdh)
        except AttributeError:
            pass

        out.addColumn(self.label_key, predictions)
        out.addColumn(self.label_key + '_log_prob', log_prob)
        avg_log_prob = np.empty_like(log_prob)
        for label in np.unique(predictions):
            mask = label == predictions
            avg_log_prob[mask] = np.mean(log_prob[mask])
        out.addColumn(self.label_key + '_avg_log_prob', avg_log_prob)
        namespace[self.output_labeled] = out
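The 'bic' branch above picks the component count with the lowest Bayesian Information Criterion. A standalone sketch of that selection loop on synthetic data:

import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(m, 0.5, (100, 3)) for m in (0, 3, 6)])
bics = [GaussianMixture(n_components=k, random_state=0).fit(X).bic(X)
        for k in range(1, 7)]
best_k = int(np.argmin(bics)) + 1  # expected to recover the 3 true clusters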
Example #12
class GeolocationInference(object):
    
    """
    Geolocation Inference Model (GMM)
    """

    def __init__(self,
                 vocabulary,
                 n_time_bins=10,
                 time_norm=None,
                 time_as_percentile=True,
                 time_model_kwargs = {"Cs":10, "solver":'lbfgs', "n_jobs":-1},
                 mixture_kwargs = {"n_components":5,"covariance_type":"diag"},
                 random_state = 42):
        """
        Geolocation inference model based on gaussian mixture model density estimates

        Args:
            vocabulary (object): Learned vocabulary
            n_time_bins (int): Number of temporal bins to use for training
            time_norm (str or None): How to normalize temporal features
            time_as_percentile (bool): If True, creates bins using percentile distribution
                                       over longitude instead of linear spacing.
            time_model_kwargs (dict): Arguments for logistic regression temporal estimator
            mixture_kwargs (dict): Parameters for the gaussian mixture model estimators
            random_state (int): Random seed for fitting estimators
        """
        ## Class Attributes/Parameters (copy kwargs; avoids shared mutable defaults)
        self._vocabulary = vocabulary
        self._n_time_bins = n_time_bins
        self._time_norm = time_norm
        self._time_as_percentile = time_as_percentile
        self._time_model_kwargs = (dict(time_model_kwargs) if time_model_kwargs
                                   else {"Cs": 10, "solver": "lbfgs", "n_jobs": -1})
        self._mixture_kwargs = (dict(mixture_kwargs) if mixture_kwargs
                                else {"n_components": 5, "covariance_type": "diag"})
        self._random_state = random_state
        ## Check Arguments
        if "random_state" not in self._mixture_kwargs:
            self._mixture_kwargs["random_state"] = self._random_state
        if "random_state" not in self._time_model_kwargs:
            self._time_model_kwargs["random_state"] = self._random_state

    def __repr__(self):
        """
        Human-readable string describing the class

        Args:
            None
        
        Returns:
            desc (str): Description of class
        """
        return "GeolocationInference()"
    
    def _create_coordinate_grid(self,
                                cell_size = 1):
        """
        Create a grid of lon/lat points based on training
        boundaries

        Args:
            cell_size (float): Degrees contained within each grid cell
        
        Returns:
            coordinate_grid (array): Coordinate grid (land-mass only)
        """
        ## Boundaries
        xmin = int(self._coord_bounds[0][0] - 1)
        xmax = int(self._coord_bounds[0][1] + 1)
        ymin = int(self._coord_bounds[1][0] - 1)
        ymax = int(self._coord_bounds[1][1] + 1)
        ## Coordinates
        lon_coord = [xmin]
        lat_coord = [ymin]
        while lon_coord[-1] < xmax:
            lon_coord.append(min(lon_coord[-1] + cell_size, xmax))
        while lat_coord[-1] < ymax:
            lat_coord.append(min(lat_coord[-1] + cell_size, ymax))
        coordinate_grid = []
        for x in lon_coord:
            for y in lat_coord:
                if globe.is_land(y, x):
                    coordinate_grid.append([x, y])
        coordinate_grid = np.array(coordinate_grid)
        return coordinate_grid
    

    def _fit_mixture(self,
                     i,
                     X,
                     y):
        """
        Fit a single mixture model, update cache in place

        Args:
            i (int): Index of the feature being trained on
            X (csr matrix): Input feature matrix
            y (2d-array): Lon/Lat coordinates for training
        
        Returns:
            None
        """
        ## Construct Training Sample
        S = []
        nonzero = np.nonzero(X[:,i])[0]
        for n in nonzero:
            x_s = int(X[n, i])
            y_s = y[n]
            S.extend([y_s] * x_s)
        S = np.vstack(S)
        ## Initialize Model
        n_components = min(self._mixture_kwargs["n_components"], len(nonzero))
        args = self._mixture_kwargs.copy()
        args["n_components"] = n_components
        model = BayesianGaussianMixture(**args)
        ## Fit Model
        model = model.fit(S)
        ## Cache Model
        self._models[i] = model

    def fit(self,
            X,
            y):
        """
        Args:
            X (csr matrix): Sparse feature matrix
            y (2d-array): Training coordinates
        
        Returns:
            self
        """
        ## Coordinate Grid Boundaries
        self._coord_bounds = [[y[:,0].min(), y[:,0].max()], [y[:,1].min(), y[:,1].max()]]
        ## Initialize Mixture Model Cache
        m = len(self._vocabulary._feature_inds["text"]) + \
            len(self._vocabulary._feature_inds["subreddit"])
        self._models = [None for _ in range(m)]
        ## Feature Probabilities
        self._pu = np.array((X > 0).sum(axis=0) / X.shape[0])[0]
        ## Fit Prior
        self.prior = BayesianGaussianMixture(**self._mixture_kwargs)
        self.prior.fit(y)
        ## Fit Mixture Models
        for i in tqdm(range(m), desc="GMM Fit", total = m, file=sys.stdout):
            self._fit_mixture(i, X, y)
        ## Temporal Model
        if self._vocabulary._use_time:
            ## Isolate Time Data
            X_time = X[:, self._vocabulary._feature_inds["time"]]
            ## Normalize
            if self._time_norm is not None:
                X_time = normalize(X_time, self._time_norm, axis=1, copy=True)
            ## Create Time Bins
            y_lon = y[:,0]
            if self._time_as_percentile:
                percentiles = np.linspace(0, 100, self._n_time_bins + 1)[:-1]
                self._time_bins = np.percentile(y_lon, percentiles)
            else:
                lon_bounds = int(y_lon.min() - 1), int(y_lon.max() + 1)
                self._time_bins = np.linspace(lon_bounds[0], lon_bounds[1], self._n_time_bins + 1)
            y_lon = np.array(list(map(lambda v: assign_value_to_bin(v, self._time_bins), y_lon)))
            ## Train Model (a bare metric like f1_score is not a valid `scoring`
            ## argument for LogisticRegressionCV; use a scorer string instead)
            self._time_model_kwargs["scoring"] = "f1_micro"
            self._time_model_kwargs["max_iter"] = 1000
            self.time_classifier = LogisticRegressionCV(**self._time_model_kwargs)
            self.time_classifier.fit(X_time, y_lon)
        return self
        
    def predict_proba(self,
                      X,
                      coordinates = None):
        """
        Args:
            X (csr matrix): Sparse feature matrix
            coordinates (2d-array): Coordinates to make predictions on. If None, creates grid
                                    around the world (land-masses only)
        
        Returns:
            coordinates (2d-array): Coordinate array associated with prediction columns
            P (2d-array): Predicted probabilities per coordinate
        """
        ## Coordinate Grid
        if coordinates is None:
            coordinates = self._create_coordinate_grid()
        ## Transform X Shape
        X = X.toarray()
        ## Compute Prior for Elements With Missing
        prior = np.exp(self.prior.score_samples(coordinates))
        ## Initialize Probability Array
        P = np.zeros((X.shape[0], len(coordinates)))
        ## Cycle Through Models
        for ind, model in tqdm(enumerate(self._models),
                               total = len(self._models),
                               file=sys.stdout,
                               desc="GMM Posterior"):
            if model is None:
                continue
            u = X[:,[ind]]
            nonzero = np.nonzero(u)[0]
            if len(nonzero) == 0:
                continue
            p_c_u = np.exp(model.score_samples(coordinates)).reshape(1,-1)
            P[nonzero] += np.matmul(u[nonzero], p_c_u * self._pu[ind])
        ## Default to Prior for Users Without Features
        P[np.all(P == 0, axis=1)] += prior
        ## Time Adjustment
        if self._vocabulary._use_time:
            ## Isolate Time Data
            X_time = X[:, self._vocabulary._feature_inds["time"]]
            ## Normalize
            if self._time_norm is not None:
                X_time = normalize(X_time, self._time_norm, axis=1, copy=True)
            ## Make Probability Predictions
            y_lon_pred_prob = self.time_classifier.predict_proba(X_time)
            ## Get Time Bins
            time_bins_classifier = self._time_bins[self.time_classifier.classes_]
            coordinate_lon_bins = np.array(list(map(lambda v: assign_value_to_bin(v, self._time_bins),
                                                    coordinates[:,0])))
            ## Update Probabilities
            P_time = y_lon_pred_prob[:, coordinate_lon_bins]
            P_time = normalize(P_time, axis=1, norm="l1")
            P = np.multiply(P, P_time)
        ## Normalize Posterior
        P = np.divide(P,
                      np.nansum(P, axis=1).reshape(-1,1),
                      out=np.ones_like(P) * np.nan,
                      where = np.nansum(P, axis=1).reshape(-1,1) > 0)
        return coordinates, P

    def predict(self,
                X,
                coordinates=None):
        """
        Args:
            X (csr matrix): Feature Matrix
            coordinates (2d-array or None): Coordinates to make predictions over
        
        Returns:
            y_pred (2d-array): Coordinate Predictions (argmax)
        """
        ## Get Probability Prediction
        coordinates, P = self.predict_proba(X, coordinates)
        ## Get Argmax over Coordinates
        y_pred = coordinates[P.argmax(axis=1)]
        return y_pred
    
    def plot_model_posterior(self,
                             feature,
                             coordinates=None):
        """
        Plot the probability distribution over coordinates for a given
        feature name

        Args:
            feature (str): Name of the feature to plot distribution for
            coordinates (2d-array or None): If desired, a subset of coordinates
                                            to use for plotting a distribution.
                                            By default, creates a coordinate grid
        
        Returns:
            fig, ax (matplotlib objects): Figure object
        """
        ## Check for Feature/Model
        if feature not in self._vocabulary.feature_to_idx:
            raise KeyError(f"Feature=`{feature}` not found")
        if self._models[self._vocabulary.feature_to_idx[feature]] is None:
            raise ValueError(f"Model for Feature={feature} is null")
        ## Coordinate Grid
        if coordinates is None:
            coordinates = self._create_coordinate_grid(cell_size=2)
        ## Make Posterior Predictions
        m = self._models[self._vocabulary.feature_to_idx[feature]]
        posterior = np.exp(m.score_samples(coordinates))
        ## Plot 
        fig, ax = plt.subplots(figsize=(10,5.8))
        s = ax.scatter(coordinates[:,0],
                       coordinates[:,1],
                       c = posterior,
                       cmap = plt.cm.coolwarm,
                       alpha = .8,
                       s = 5)
        cbar = fig.colorbar(s)
        ax.set_xlabel("Longitude")
        ax.set_ylabel("Latitude")
        ax.set_title(feature, loc="left")
        fig.tight_layout()
        return fig, ax
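A hedged mini-sketch of the spatial prior used in fit()/predict_proba(): a mixture over training coordinates, scored on a lon/lat grid (illustrative values only):

import numpy as np
from sklearn.mixture import BayesianGaussianMixture

coords = np.random.RandomState(0).normal([0, 45], [5, 3], (300, 2))
prior = BayesianGaussianMixture(n_components=5,
                                covariance_type="diag").fit(coords)
lon, lat = np.meshgrid(np.linspace(-20, 20, 30), np.linspace(35, 55, 30))
grid = np.c_[lon.ravel(), lat.ravel()]
p_grid = np.exp(prior.score_samples(grid))  # prior density per grid cell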
Example #13
def colour(img, scale=1.0, samples=10000):
    '''Model the distribution of colours in an image.

    The method models the distribution of the values in the chroma channels of
    an image after being converted from RGB to HSV.  This decouples the
    intensity values from the chroma (colour) information, making it
    easier to visualize how the colours themselves appear.  The resulting
    visualization is the same size as the original image.

    Parameters
    ----------
    img : numpy.ndarray
        input image
    scale : float
        image scaling factor
    samples : int
        number of samples to draw when generating the density estimate

    Returns
    -------
    numpy.ndarray
        a new image, same dimensions as the input, visualizing the colour
        distribution

    Raises
    ------
    ValueError
        if the input image is not an RGB image
    '''
    if img.ndim != 3:
        raise ValueError('Require RGB image to model the colour distribution.')

    img = skimage.transform.rescale(img,
                                    1.0 / scale,
                                    anti_aliasing=True,
                                    mode='constant',
                                    multichannel=True)
    img = skimage.color.rgb2hsv(img)
    height, width = img.shape[0:2]

    # Extract the colour vectors and sample from them.
    ind = generate_samples(width, height, samples)
    X = np.squeeze(img[ind[1, :], ind[0, :], 0:2])

    # Convert from polar to cartesian coordinates (this will make the
    # visualization easier).
    mag = X[:, 1]
    ang = 2 * np.pi * X[:, 0]

    X[:, 0] = mag * np.cos(ang)
    X[:, 1] = mag * np.sin(ang)

    # Perform a density estimation using a GMM.
    gmm = BayesianGaussianMixture(
        n_components=25,
        weight_concentration_prior_type='dirichlet_distribution',
        weight_concentration_prior=1e-3)
    gmm.fit(X)

    # Generate the output array.
    x, y = np.meshgrid(np.linspace(-1, 1, width), np.linspace(-1, 1, height))
    X = np.c_[x.flatten(), y.flatten()]
    scores = np.exp(gmm.score_samples(X))
    max_score = np.max(scores)

    # Apply a gamma correction to make the image look a bit nicer.
    val = np.reshape(scores, (height, width)) / max_score
    val = skimage.exposure.adjust_gamma(val, gamma=0.3)

    # Convert back from HSV to RGB.  The saturation needs to be clamped so that
    # it doesn't produce invalid values during the HSV->RGB conversion.
    mag = x**2 + y**2
    sat = np.sqrt(mag)
    sat[sat > 1] = 1

    # The hue also needs to be adjusted since atan2() returns a value between
    # -pi and pi, but the hue needs to be between 0 and 1.
    hue = np.arctan2(y, x)
    hue[hue < 0] = hue[hue < 0] + 2 * np.pi
    hue /= 2 * np.pi

    output = np.dstack((hue, sat, val))
    output = skimage.color.hsv2rgb(output)
    output = skimage.transform.rescale(output,
                                       scale,
                                       anti_aliasing=True,
                                       mode='constant',
                                       multichannel=True)

    return output
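A hedged usage sketch for colour(), assuming a scikit-image version that still accepts the multichannel= argument and that generate_samples() from the surrounding module is in scope:

import numpy as np

img = np.random.RandomState(0).rand(120, 160, 3)  # synthetic RGB image
vis = colour(img, scale=1.0, samples=2000)        # same shape as img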
Example #14
File: entropy.py Project: llimeht/bumps
def gmm_entropy(points, n_est=None, n_components=None):
    r"""
    Use sklearn.mixture.BayesianGaussianMixture to estimate entropy.

    *points* are the data points in the sample.

    *n_est* are the number of points to use in the estimation; default is
    10,000 points, or 0 for all the points.

    *n_components* are the number of Gaussians in the mixture. Default is
    $5 \sqrt{d}$ where $d$ is the number of dimensions.

    Returns estimated entropy and uncertainty in the estimate.

    This method uses BayesianGaussianMixture from scikit-learn to build a
    model of the point distribution, then uses Monte Carlo sampling to
    determine the entropy of that distribution. The entropy uncertainty is
    computed from the variance in the MC sample scaled by the number of
    samples. This does not incorporate any uncertainty in the sampling that
    generated the point distribution or the uncertainty in the GMM used to
    model that distribution.
    """
    #from sklearn.mixture import GaussianMixture as GMM
    from sklearn.mixture import BayesianGaussianMixture as GMM
    n, d = points.shape

    # Default to the full set
    if n_est is None:
        n_est = 10000
    elif n_est == 0:
        n_est = n

    # reduce size of draw to n_est
    if n_est >= n:
        x = points
        n_est = n
    else:
        x = points[permutation(n)[:n_est]]
        n = n_est

    if n_components is None:
        n_components = int(5 * sqrt(d))

    ## Standardization doesn't seem to help
    ## Note: sigma may be zero
    #x, mu, sigma = standardize(x)   # if standardized
    predictor = GMM(
        n_components=n_components,
        covariance_type='full',
        #verbose=True,
        max_iter=1000)
    predictor.fit(x)
    eval_x, _ = predictor.sample(n_est)
    weight_x = predictor.score_samples(eval_x)
    H = -np.mean(weight_x)
    #with np.errstate(divide='ignore'): H = H + np.sum(np.log(sigma))   # if standardized
    dH = np.std(weight_x, ddof=1) / sqrt(n)
    ## cross-check against own calcs
    #alt = GaussianMixture(predictor.weights_, mu=predictor.means_, sigma=predictor.covariances_)
    #print("alt", H, alt.entropy())
    #print(np.vstack((weight_x[:10], alt.logpdf(eval_x[:10]))).T)
    return H / LN2, dH / LN2
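A quick sanity check for gmm_entropy, assuming the module-level names it uses (permutation, sqrt, LN2 = log(2)) are in scope: the differential entropy of a d-dimensional standard normal is (d/2)*log2(2*pi*e) bits, about 4.09 bits for d=2.

import numpy as np

points = np.random.RandomState(0).normal(size=(2000, 2))
H, dH = gmm_entropy(points, n_est=1000)
print(H, dH)  # H should land near 4.09 bits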
Example #15
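This snippet begins mid-script; a minimal hedged setup for the names it assumes (mu, sigma, weights, x_axis, y_axis, samples) might look like:

import numpy as np
import scipy.stats as ss
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture

# hypothetical ground-truth 1-D mixture; the original values are not shown
mu, sigma, weights = [0.0, 4.0], [1.0, 0.5], [0.7, 0.3]
x_axis = np.linspace(-2, 10, 500)
y_axis = np.zeros_like(x_axis)

rng = np.random.RandomState(0)
comp = rng.choice(len(weights), size=1000, p=weights)
samples = rng.normal(np.take(mu, comp), np.take(sigma, comp))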
for l, s, w in zip(mu, sigma, weights):
    y_axis += ss.norm.pdf(x_axis, loc=l, scale=s) * w

K = 11

y_axis_gmm = np.zeros_like(x_axis)
gmm = GaussianMixture(K)
gmm.fit(samples.reshape(-1, 1))
for l, s, w in zip(gmm.means_.flatten(), gmm.covariances_.flatten(),
                   gmm.weights_):
    y_axis_gmm += ss.norm.pdf(x_axis, loc=l, scale=np.sqrt(s)) * w

bgmm = BayesianGaussianMixture(K)
bgmm.fit(samples.reshape(-1, 1))
y_axis_bgmm = np.exp(bgmm.score_samples(x_axis.reshape(-1, 1)))

# plt.plot(x_axis, y_axis, label='True density')
# plt.hist(samples, density=True, bins="fd", alpha=1, label='Samples')
# plt.plot(samples, np.zeros(len(samples)), marker='.', label='Samples')
# plt.plot(x_axis, y_axis_gmm, label='ML estimate')
# plt.plot(x_axis, y_axis_bgmm, label='Bayesian estimate')
# plt.xlabel("x")
# plt.ylabel("f(x)")
# plt.xlim([-2, 10])
# plt.ylim([0, 0.32])
# plt.legend()
# plt.savefig('figures/bayes_soluciones')
# plt.savefig('figures/ml_problemas')
# plt.show()
Example #16
model3.fit(X_std)

# Inspect the fitted parameters
pprint(vars(model3))

# 3. Visualization per cluster count -------------------------------------------------------------------------

# Set the plot size
plt.figure(figsize=(8, 4))

# VBGMM clustering of the colour vs. proline scatter plot
x = np.linspace(X_std[:, 0].min(), X_std[:, 0].max(), 100)
y = np.linspace(X_std[:, 1].min(), X_std[:, 1].max(), 100)
X, Y = np.meshgrid(x, y)
XX = np.array([X.ravel(), Y.ravel()]).T
Z = -model3.score_samples(XX)
Z = Z.reshape(X.shape)

plt.contour(X, Y, Z, levels=[0.5, 1, 2, 3, 4, 5])  # contour plot
plt.scatter(X_std[:, 0], X_std[:, 1], c=model3.predict(X_std))
plt.title('VBGMM(covariance_type=full)')

plt.show()

# 4. Cluster prediction -------------------------------------------------------------------------

# Predict
model3.predict(X_std)

# Mixture weights
# --- these are weights, so they sum to 1