def fit_mixtures(X, mag, mbins, binwidth=0.2, seed=None,
                 keepscore=False, keepbic=False, **kwargs):
    kwargs.setdefault('n_components', 25)
    kwargs.setdefault('covariance_type', 'full')
    fits = []
    if keepscore:
        scores = []
    if keepbic:
        bics = []
    if seed is not None:  # 'if seed:' would silently skip seed=0
        np.random.seed(seed)
    for bincenter in mbins:
        # this is not an efficient way to assign bins, but the time
        # is negligible compared to the GMM fitting anyway
        ii = np.where(np.abs(mag - bincenter) < binwidth)[0]
        if False:
            print('{:.2f}: {} qsos'.format(bincenter, len(ii)))
        gmm = GaussianMixture(**kwargs)
        gmm.fit(X[ii])
        fits.append(gmm)
        if keepscore:
            scores.append(gmm.score(X[ii]))
        if keepbic:
            bics.append(gmm.bic(X[ii]))
    rv = (fits,)
    if keepscore:
        rv += (scores,)
    if keepbic:
        rv += (bics,)
    return rv
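# A minimal usage sketch for fit_mixtures (hypothetical data; assumes the
# numpy/GaussianMixture imports used above are in scope):
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.RandomState(0)
X = rng.randn(5000, 4)                  # e.g. colors
mag = rng.uniform(17, 22, 5000)         # magnitudes
mbins = np.arange(17.5, 22, 0.5)        # bin centers
fits, scores = fit_mixtures(X, mag, mbins, seed=1,
                            keepscore=True, n_components=5)
print(len(fits), 'fitted mixtures; first score:', scores[0])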
def loggausfit(self):
    self.fitDf['IRM_norm'] = self.fitDf['remanance'] / self.fitDf['remanance'].max()
    xstd, distance, means, covras, weights, yfits = [], [], [], [], [], []
    for i in range(10):
        data = self.rand_data()
        for j in range(20):
            gmm = GMM(self.fitNumber, covariance_type='full')
            model = gmm.fit(data)
            xstd.append(np.std(model.means_))
            means.append(model.means_)
            covras.append(model.covariances_)
            weights.append(model.weights_)
            sample = self.fitDf['field'].values.reshape((-1, 1))
            logprob = model.score_samples(sample)  # M_best.eval(x)
            responsibilities = model.predict_proba(sample)
            pdf = np.exp(logprob)
            pdf_individual = responsibilities * pdf[:, np.newaxis]
            pdf_norm = np.sum(pdf_individual, axis=1) / np.max(np.sum(pdf_individual, axis=1))
            #distance.append(np.max([abs(i-j) for i,j in zip(np.sum(pdf_individual,axis=1),p)]))
            distance.append(1 - spatial.distance.cosine(pdf_norm, self.fitDf['IRM_norm']))
            yfits.append(pdf_individual)
        del data
    df = pd.DataFrame({'xstd': xstd, 'distance': distance, 'means': means,
                       'covras': covras, 'yfits': yfits, 'weights': weights})
    df['cov_max'] = [np.min(i) for i in df['covras']]
    df = df.sort_values(by=['distance', 'cov_max', 'xstd'],
                        ascending=[False, True, False])
    pdf_best = df['yfits'].iloc[0]
    self.means = df['means'].iloc[0]
    self.covra = df['covras'].iloc[0]  # sigma**2
    self.weights = df['weights'].iloc[0]
    self.pdf_best = pdf_best / np.max(np.sum(pdf_best, axis=1))
def fit(self, data, ngauss, n_iter=5000, min_covar=1.0e-6,
        doplot=False, **keys):
    """
    data is shape [npoints, ndim]
    """
    from sklearn.mixture import GaussianMixture

    if len(data.shape) == 1:
        data = data[:, numpy.newaxis]

    print("ngauss:   ", ngauss)
    print("n_iter:   ", n_iter)
    print("min_covar:", min_covar)

    gmm = GaussianMixture(
        n_components=ngauss,
        max_iter=n_iter,
        reg_covar=min_covar,
        covariance_type='full',
    )
    gmm.fit(data)

    if not gmm.converged_:
        print("DID NOT CONVERGE")

    self._gmm = gmm
    self.set_mixture(gmm.weights_, gmm.means_, gmm.covariances_)

    if doplot:
        plt = self.plot_components(data=data, **keys)
        return plt
def learn_subset(self, search_space):
    # Mask undesired features
    current_array = self.vectors[:, search_space]

    GM = GaussianMixture(n_components=2,
                         covariance_type="full",
                         tol=0.001,
                         reg_covar=1e-06,
                         max_iter=1000,
                         n_init=25,
                         init_params="kmeans",
                         weights_init=None,
                         means_init=None,
                         precisions_init=None,
                         random_state=None,
                         warm_start=False,
                         verbose=0,
                         verbose_interval=10)

    GM.fit(current_array)
    labels = GM.predict(current_array)
    unique, counts = np.unique(labels, return_counts=True)
    count_dict = dict(zip(unique, counts))

    return count_dict, labels
def gmm(nclusters, coords, n_init=50, n_iter=500):
    if USE_GAUSSIAN_MIXTURE:
        est = GaussianMixture(n_components=nclusters, n_init=n_init,
                              max_iter=n_iter)
    else:
        est = GMM(n_components=nclusters, n_init=n_init, n_iter=n_iter)
    est.fit(coords)
    return Partition(est.predict(coords))
def GaussianMixture(V, **kwargs):
    """Performs clustering on *V* by using Gaussian mixture models. The function
    uses :func:`sklearn.mixture.GaussianMixture`. See sklearn documents
    for details.

    :arg V: row-normalized eigenvectors for the purpose of clustering.
    :type V: :class:`numpy.ndarray`

    :arg n_clusters: specifies the number of clusters.
    :type n_clusters: int
    """

    try:
        from sklearn.mixture import GaussianMixture
    except ImportError:
        raise ImportError('Use of this function (GaussianMixture) requires the '
                          'installation of sklearn.')

    n_components = kwargs.pop('n_components', None)
    if n_components is None:
        n_components = kwargs.pop('n_clusters', None)
    if n_components is None:
        n_components = 1

    n_init = kwargs.pop('n_init', 1)

    mixture = GaussianMixture(n_init=n_init,
                              n_components=n_components,
                              **kwargs).fit(V)

    return mixture.fit_predict(V)
def create_random_gmm(n_mix, n_features, covariance_type, prng=0):
    prng = check_random_state(prng)
    g = GaussianMixture(n_mix, covariance_type=covariance_type)
    g.means_ = prng.randint(-20, 20, (n_mix, n_features))
    # sklearn's GaussianMixture stores covariances in `covariances_`
    # (`covars_` was the attribute of the deprecated GMM class)
    g.covariances_ = make_covar_matrix(covariance_type, n_mix, n_features)
    g.weights_ = normalized(prng.rand(n_mix))
    return g
def gmm(k, X, run_times=5):
    gm = GMM(k, n_init=run_times, init_params='kmeans')
    #gm = GMM(k)
    gm.fit(X)
    zh = gm.predict(X)
    mu = gm.means_
    cov = gm.covariances_
    return zh, mu, cov
def fit_gmm(samples, ncomponents=2):
    """Given a numpy array of floating point samples, fit a gaussian mixture model."""
    # assume samples is of shape (NSAMPLES,); unsqueeze to (NSAMPLES,1) and train a GMM:
    gmm = GaussianMixture(n_components=ncomponents)
    gmm.fit(samples.reshape(-1, 1))
    # return params of GMM in [(coeff, mu, var)] format
    # (note: covariances_ holds variances, not standard deviations):
    params = [(gmm.weights_[c], gmm.means_[c][0], gmm.covariances_[c][0][0])
              for c in range(ncomponents)]
    return params
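# A minimal usage sketch (hypothetical data): fit a two-component mixture
# to samples drawn from two Gaussians and print the recovered parameters.
# The third tuple entry is a variance, so take the square root for sigma.
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.RandomState(0)
samples = np.concatenate([rng.normal(-2.0, 0.5, 500),
                          rng.normal(3.0, 1.0, 500)])
for weight, mu, var in fit_gmm(samples):
    print('w=%.2f mu=%.2f sigma=%.2f' % (weight, mu, np.sqrt(var)))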
def gmm(k, X, run_times=10, init='kmeans'):
    """GMM from sklearn library.
    init = {'kmeans', 'random'}; run_times is the number of times the
    algorithm will run with different initializations.
    """
    gm = GMM(k, n_init=run_times, init_params=init)
    gm.fit(X)
    zh = gm.predict(X)
    return zh
def main():
    X, Y = get_data(10000)
    print("Number of data points:", len(Y))

    model = GaussianMixture(n_components=10)
    model.fit(X)
    M = model.means_
    R = model.predict_proba(X)

    print("Purity:", purity(Y, R))  # max is 1, higher is better
    print("DBI:", DBI(X, M, R))     # lower is better
def fit_conditional_parameters(self, j):
    class_wise_scores = self.get_class_wise_scores(j)

    class_wise_parameters = dict()
    for label in self._labels:
        gmm = GaussianMixture(n_components=1)
        gmm.fit(class_wise_scores[label].reshape(-1, 1))

        class_wise_parameters[label] = \
            self.Gaussian(mu=gmm.means_.flatten()[0],
                          std=np.sqrt(gmm.covariances_.flatten()[0]))

    return class_wise_parameters
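# Design note: a one-component GaussianMixture converges to the sample mean
# and (biased) variance, up to the reg_covar regularizer, so the loop above
# is essentially equivalent to the sketch below; the GMM form just keeps the
# API uniform with multi-component fits.
import numpy as np

def fit_gaussian(scores):
    scores = np.asarray(scores, dtype=float)
    return scores.mean(), scores.std()  # mu, sigma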
class GaussianMixture1D(object):
    """
    Simple class to work with 1D mixtures of Gaussians

    Parameters
    ----------
    means : array_like
        means of component distributions (default = 0)
    sigmas : array_like
        standard deviations of component distributions (default = 1)
    weights : array_like
        weight of component distributions (default = 1)
    """
    def __init__(self, means=0, sigmas=1, weights=1):
        data = np.array([t for t in np.broadcast(means, sigmas, weights)])

        components = data.shape[0]
        self._gmm = GaussianMixture(components, covariance_type='spherical')
        self._gmm.means_ = data[:, :1]
        self._gmm.weights_ = data[:, 2] / data[:, 2].sum()
        self._gmm.covariances_ = data[:, 1] ** 2
        self._gmm.precisions_cholesky_ = 1 / np.sqrt(self._gmm.covariances_)
        self._gmm.fit = None  # disable fit method for safety

    def sample(self, size):
        """Random sample"""
        return self._gmm.sample(size)

    def pdf(self, x):
        """Compute probability distribution"""
        if x.ndim == 1:
            x = x[:, np.newaxis]
        logprob = self._gmm.score_samples(x)
        return np.exp(logprob)

    def pdf_individual(self, x):
        """Compute probability distribution of each component"""
        if x.ndim == 1:
            x = x[:, np.newaxis]
        logprob = self._gmm.score_samples(x)
        responsibilities = self._gmm.predict_proba(x)
        return responsibilities * np.exp(logprob[:, np.newaxis])
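# A minimal usage sketch for GaussianMixture1D (assumes a scikit-learn
# version where manually setting the fitted attributes, as above, is enough
# for score_samples/predict_proba to work): build a two-component mixture
# and evaluate its density on a small grid.
import numpy as np

gm1d = GaussianMixture1D(means=[-1.0, 2.0], sigmas=[0.5, 1.0],
                         weights=[0.3, 0.7])
x = np.linspace(-4, 6, 5)
print(gm1d.pdf(x))              # total density at each x
print(gm1d.pdf_individual(x))   # per-component densities, shape (5, 2)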
def finish(self):
    print("Calculating mean ToT for each PMT from gaussian fits...")
    gmm = GaussianMixture()
    xs, ys = [], []
    for (dom_id, channel_id), tots in self.tot_data.items():
        dom = self.db.doms.via_dom_id(dom_id)
        gmm.fit(np.array(tots)[:, np.newaxis])
        mean_tot = gmm.means_[0][0]
        xs.append(31 * (dom.floor - 1) + channel_id + 600 * (dom.du - 1))
        ys.append(mean_tot)
    fig, ax = plt.subplots()
    ax.scatter(xs, ys, marker="+")
    ax.set_xlabel(r"31$\cdot$(floor - 1) + channel_id + 600$\cdot$(DU - 1)")
    ax.set_ylabel("ToT [ns]")
    plt.title("Mean ToT per PMT")
    plt.savefig(self.plotfilename)
def fit(self, X, Y=None):
    if self.method == 'random':
        N = len(X)
        idx = np.random.randint(N, size=self.M)
        self.samples = X[idx]
    elif self.method == 'normal':
        # just sample from N(0,1)
        D = X.shape[1]
        self.samples = np.random.randn(self.M, D) / np.sqrt(D)
    elif self.method == 'kmeans':
        X, Y = self._subsample_data(X, Y)

        print("Fitting kmeans...")
        t0 = datetime.now()
        kmeans = KMeans(n_clusters=len(set(Y)))
        kmeans.fit(X)
        print("Finished fitting kmeans, duration:", datetime.now() - t0)

        # calculate the most ambiguous points
        # we will do this by finding the distance between each point
        # and all cluster centers
        # and return which points have the smallest variance
        dists = kmeans.transform(X)  # returns an N x K matrix
        variances = dists.var(axis=1)
        idx = np.argsort(variances)  # smallest to largest
        idx = idx[:self.M]
        self.samples = X[idx]
    elif self.method == 'gmm':
        X, Y = self._subsample_data(X, Y)

        print("Fitting GMM")
        t0 = datetime.now()
        gmm = GaussianMixture(
            n_components=len(set(Y)),
            covariance_type='spherical',
            reg_covar=1e-6)
        gmm.fit(X)
        print("Finished fitting GMM, duration:", datetime.now() - t0)

        # calculate the most ambiguous points
        probs = gmm.predict_proba(X)
        ent = stats.entropy(probs.T)  # N-length vector of entropies
        idx = np.argsort(-ent)  # negate since we want biggest first
        idx = idx[:self.M]
        self.samples = X[idx]
    return self
def finish(self):
    print("Calculating mean ToT for each PMT from gaussian fits...")
    gmm = GaussianMixture()
    xs, ys = [], []
    df = pd.DataFrame(self.tot_data)
    for (dom_id, channel_id), data in df.groupby(['dom_id', 'channel_id']):
        tots = data['tot']
        dom = self.db.doms.via_dom_id(dom_id)
        gmm.fit(tots.values[:, np.newaxis])  # Series -> 2D array for sklearn
        mean_tot = gmm.means_[0][0]
        xs.append(31 * (dom.floor - 1) + channel_id + 600 * (dom.du - 1))
        ys.append(mean_tot)
    fig, ax = plt.subplots()
    ax.scatter(xs, ys, marker="+")
    ax.set_xlabel(r"31$\cdot$(floor - 1) + channel_id + 600$\cdot$(DU - 1)")
    ax.set_ylabel("ToT [ns]")
    plt.title("Mean ToT per PMT")
    plt.savefig(self.plotfilename)
def Recognize(self, fn):
    im = Image.open(fn)
    im = util.CenterExtend(im, radius=20)

    vec = np.asarray(im.convert('L')).copy()
    Y = []
    for i in range(vec.shape[0]):
        for j in range(vec.shape[1]):
            if vec[i][j] <= 200:
                Y.append([i, j])

    gmm = GaussianMixture(n_components=7, covariance_type='tied',
                          reg_covar=1e2, tol=1e3, n_init=9)
    gmm.fit(Y)
    centers = gmm.means_

    points = []
    for i in range(7):
        scoring = 0.0

        for w_i in range(3):
            for w_j in range(3):
                p_x = centers[i][0] - 1 + w_i
                p_y = centers[i][1] - 1 + w_j

                cr = util.crop(im, p_x, p_y, radius=20)
                cr = cr.resize((40, 40), Image.ANTIALIAS)

                X = np.asarray(cr.convert('L'), dtype='float')
                X = (X.astype("float") - 180) / 200

                x0 = np.expand_dims(X, axis=0)
                x1 = np.expand_dims(x0, axis=3)

                global model
                if self.model.predict(x1)[0][0] < 0.5:
                    scoring += 1

        if scoring > 4:
            points.append((centers[i][0] - 20, centers[i][1] - 20))

    return points
def fit(self, X_train, y_train):
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    # from sklearn.mixture import GMM as GaussianMixture
    from sklearn.mixture import GaussianMixture
    unlabels = range(0, np.max(y_train) + 1)
    for lab in unlabels:
        if self.each_class_params is not None:
            # print 'eacl'
            # print self.each_class_params[lab]
            model = GaussianMixture(**self.each_class_params[lab])
            # print 'po gmm ', model
        elif len(self.same_params) > 0:
            model = GaussianMixture(**self.same_params)
            # print 'ewe ', model
        else:
            model = GaussianMixture()
        X_train_lab = X_train[y_train == lab]
        # logger.debug('xtr lab shape ' + str(X_train_lab))
        model.fit(X_train_lab)

        self.models.insert(lab, model)
y_pred

# # exercise DBSCAN 1. run DBSCAN with different

# ## GaussianMixture
# #https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html#examples-using-sklearn-mixture-gaussianmixture

# In[71]:

from sklearn.mixture import GaussianMixture

gaussian_mixture = GaussianMixture(n_components=10).fit(x)  #, covariance_type='full'
y_pred = gaussian_mixture.predict(x)
cluster_center = gaussian_mixture.means_
cluster_center.shape

# In[72]:

#https://jakevdp.github.io/PythonDataScienceHandbook/05.12-gaussian-mixtures.html
aic_error = list()
bic_error = list()
start_ = 2
def find_gaussian_clusters(self):
    self.gaussian = GaussianMixture(n_components=4).fit(self.X)
class TorchDataGenerator(SyntheticDataGenerator):
    def __init__(self, verbose=True):
        self._implicit_coef = None
        self.model = None
        self.verbose = verbose
        self.items_view = None
        self.avg_ratings_per_user = None
        self.item_ids = None
        self.user_vectors = None
        self.item_vectors = None
        self.gmm = None

    def _build_mf(self, ds: Dataset, task: Task, components):
        task.logger.report_text("Training MF")
        u, s, v = sparsesvd(ds.rating_matrix, components)
        self.user_vectors = u.T
        self.item_vectors = np.dot(np.diag(s), v).T

    def _build_torch_model(self, ds: Dataset, task: Task, components):
        task.logger.report_text("Building torch model")
        model = MatrixFactorization(ds.n_users, ds.n_items, components)
        model.user_factors = torch.nn.Embedding(
            ds.n_users, components, _weight=FloatTensor(self.user_vectors))
        model.item_factors = torch.nn.Embedding(
            ds.n_items, components, _weight=FloatTensor(self.item_vectors))
        self.model = model

    def _sample_unrated_items(self, ds: Dataset, batch_u, batch_i, batch_r):
        return [np.random.randint(0, ds.n_items) for _ in range(len(batch_u))]

    def _construct_loss(self, ds: Dataset, batch_r, batch_u, batch_i,
                        logloss_weight, return_loss_components):
        gt_rating = FloatTensor([batch_r])
        u_tensor = LongTensor([batch_u])
        i_tensor = LongTensor([batch_i])
        batch_unrated_items = self._sample_unrated_items(
            ds, batch_u, batch_i, batch_r)
        batch_unrated_items_tensor = LongTensor(batch_unrated_items)

        rated_prediction, rated_items_logits = self.model(u_tensor, i_tensor)
        unrated_prediction, unrated_items_logits = self.model(
            u_tensor, batch_unrated_items_tensor)

        ones_tensor = torch.ones_like(rated_items_logits)
        zeroes_tensor = torch.zeros_like(unrated_items_logits)

        proba_loss = (((rated_items_logits - ones_tensor) ** 2).mean() / 2
                      + ((unrated_items_logits - zeroes_tensor) ** 2).mean() / 2)
        rating_loss = ((gt_rating - rated_prediction)
                       * (gt_rating - rated_prediction)).mean()
        loss = rating_loss + proba_loss * logloss_weight

        loss_components = {}
        if return_loss_components:
            loss_components = {
                "proba": proba_loss.item(),
                "rating": rating_loss.item(),
                "total": loss.item(),
            }
        return loss, loss_components

    def _train_torch_model(self, ds: Dataset, task: Task, epochs,
                           logloss_weight, lr, batch_size):
        task.logger.report_text("Training torch model")
        optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)
        loss_components_list = []
        batch_u, batch_i, batch_r = [], [], []
        if self.items_view is None:
            m2 = dok_matrix(ds.rating_matrix)
            self.items_view = list(m2.items())
        for e in range(epochs):
            shuffle(self.items_view)
            for j, ((u, i), r) in enumerate(tqdm(self.items_view)):
                if (len(batch_u) > 0) and (j % batch_size == 0):
                    # or j == len(m2.items())):
                    optimizer.zero_grad()
                    loss, loss_components = self._construct_loss(
                        ds, batch_r, batch_u, batch_i, logloss_weight,
                        self.verbose)
                    loss_components_list.append(loss_components)
                    loss.backward()
                    optimizer.step()
                    batch_u = []
                    batch_i = []
                    batch_r = []
                batch_u.append(u)
                batch_r.append(r)
                batch_i.append(i)
            if loss_components_list:
                average_metrics = {
                    component: float(np.mean([x[component]
                                              for x in loss_components_list]))
                    for component in loss_components_list[0]
                }
                if self.verbose:
                    print("Epoch ", e,
                          "\t".join("%s: %f" % (k, v)
                                    for k, v in average_metrics.items()))
                for component in average_metrics:
                    task.logger.report_scalar("Loss", component,
                                              average_metrics[component], e)

    def _build_gmm(self, task: Task, gmm_clusters):
        task.logger.report_text("Building GMM")
        self.gmm = GaussianMixture(gmm_clusters, verbose=2, verbose_interval=1)
        self.gmm.fit(self.user_vectors)

    def build(self, task: Task, base_dataset: Dataset, epochs=50,
              components=200, gmm_clusters=10, logloss_weight=2.0,
              lr=5e-3, batch_size=5000):
        task.set_user_properties(
            **to_clear_ml_params(locals(), ["task", "self"]))
        self.item_ids = list(range(len(base_dataset.id_to_item)))
        self.avg_ratings_per_user = base_dataset.n_ratings / base_dataset.n_users
        if self.user_vectors is None:
            self._build_mf(base_dataset, task, components)
        self._build_torch_model(base_dataset, task, components)
        self._train_torch_model(base_dataset, task, epochs,
                                logloss_weight=logloss_weight, lr=lr,
                                batch_size=batch_size)
        self._build_gmm(task, gmm_clusters)

    def _sample_users(self, n_users):
        user_vectors, _ = self.gmm.sample(n_users)
        ratings_per_user = np.random.exponential(self.avg_ratings_per_user,
                                                 size=n_users)
        return user_vectors, ratings_per_user

    def _sample_user_items(self, dotproducts, n_items):
        # MAGIC HERE
        # We transform arbitrary scores trained with RMSE loss
        # into probabilities
        # we can do this in a bunch of ways
        # hence, sigmoid and * 10 here
        dotproducts = dotproducts * self._implicit_coef
        logits = 1.0 / (1e-7 + np.exp(-dotproducts))
        logits = logits / np.sum(logits)
        sampled_items = np.random.choice(self.item_ids, n_items, False,
                                         p=logits)
        return sampled_items

    def _get_user_rating(self, user_vector, item):
        sampled_rating = user_vector.dot(self.item_vectors[item])
        return sampled_rating

    def generate(self, task, n_users=None, use_actual_user_vectors=False,
                 use_actual_item_choice=False, implicit_coef=15.0, **kwargs):
        task.set_user_properties(
            **to_clear_ml_params(locals(), ["task", "self"]))
        self._implicit_coef = implicit_coef
        if n_users is None:
            n_users = self.user_vectors.shape[0]
        user_vectors, ratings_per_user = self._sample_users(n_users)
        n_items = self.item_vectors.shape[0]
        rating_matrix = dok_matrix((n_users, n_items))
        batches = max(1, int(n_users / 1000))
        for batch_n, user_vectors_batch in enumerate(
                tqdm(np.array_split(user_vectors, batches))):
            user_index_base = batch_n * 1000
            # Super-efficient batched matrix multiplication that exploits pytorch (=GPU)
            probas = self.model.user_choice_probas(user_vectors_batch)
            for user_index_offset in range(len(probas)):
                u = user_index_base + user_index_offset
                ratings_count = int(ratings_per_user[u])
                ratings_count = min(ratings_count, n_items)
                v = user_vectors[user_index_offset, :]
                sampled_items = self._sample_user_items(
                    probas[user_index_offset, :], ratings_count)
                for i in sampled_items:
                    rating_matrix[u, i] = self._get_user_rating(v, i)
                    rating_matrix[u, i] = np.minimum(
                        1.0, np.maximum(rating_matrix[u, i], -1.0))
        rating_matrix = csc_matrix(rating_matrix)
        ds = Dataset(rating_matrix=rating_matrix)
        return ds
'''
Gaussian mixture clustering
'''
print(__doc__)
from sklearn.mixture import GaussianMixture
from sklearn.datasets import load_iris
from plot_function.cluster_plot import plot_cluster
from sklearn.preprocessing import StandardScaler

iris = load_iris()
data = iris.data
target = iris.target
train_data = StandardScaler().fit_transform(data)
gm = GaussianMixture(n_components=4)
gm.fit(train_data)
labels = gm.predict(train_data)
plot_cluster(train_data, labels)
# labels=gm.labels_
# center=gm.cluster_centers_
# print(gm.inertia_)
# plot_cluster(data,labels)
plt.ylabel("Log likelihood") plt.legend(['lowest component likelihood']) plt.show() """ pca = FA(n_components=3) Z = pca.fit_transform(X) for k in ks: clust = KMeans(n_clusters=k).fit(Z) W = clust.predict(Z) ss[k - 1] = clust.inertia_ plt.plot(ks, ss) plt.title("Wine Quality - KM") plt.xlabel("# of clusters") plt.ylabel("Sum of Squares") plt.legend(["kmeans"]) plt.show() for k in ks: clust = GaussianMixture(n_components=k).fit(Z) W = clust.predict(Z) ll[k - 1] = clust.score(Z) plt.plot(ks, ll) plt.title("Wine Quality - EM") plt.xlabel("# of clusters") plt.ylabel("log of likelihood") plt.legend(["EM"]) plt.show()
def find_num_clusters(max_clusters):
    # SETUP #
    data = pd.read_csv('./data/scaled.csv')
    x = data.values
    range_n_clusters = range(2, max_clusters + 1)
    all_silhouette_scores = []

    # CLUSTER ITERATION #
    for n_clusters in range_n_clusters:
        # The silhouette coefficient can range from -1, 1 but in this example all
        # lie within [-0.2, 1]
        plt.xlim([-0.2, 1])
        # The (n_clusters+1)*10 is for inserting blank space between silhouette
        # plots of individual clusters, to demarcate them clearly.
        plt.ylim([0, len(x) + (n_clusters + 1) * 10])

        # Initialize the clusterer with n_clusters value
        clusterer = GaussianMixture(n_clusters)
        cluster_labels = clusterer.fit_predict(x)

        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the formed clusters
        silhouette_avg = silhouette_score(x, cluster_labels)
        all_silhouette_scores.append(silhouette_avg)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)

        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(x, cluster_labels)

        y_lower = 10
        for i in range(n_clusters):
            # Aggregate the silhouette scores for samples belonging to
            # cluster i, and sort them
            ith_cluster_silhouette_values = sample_silhouette_values[
                cluster_labels == i]
            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.get_cmap("Spectral")(float(i) / n_clusters)
            plt.fill_betweenx(np.arange(y_lower, y_upper), 0,
                              ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the middle
            plt.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Compute the new y_lower for next plot
            y_lower = y_upper + 10  # 10 for the 0 samples

        plt.title(("Silhouette analysis for GMM clustering with %d clusters"
                   % n_clusters), fontsize=10, fontweight='bold')
        plt.xlabel("Silhouette coefficient values")
        plt.ylabel("Cluster label")

        # The vertical line for average silhouette score of all the values
        plt.axvline(x=silhouette_avg, color="red", linestyle="--")
        plt.yticks([])  # Clear the yaxis labels / ticks
        plt.xticks([-0.2, 0, 0.2, 0.4, 0.6, 0.8, 1])
        plt.savefig(f'./graphs/silhouette_{n_clusters}_clusters.png')
        plt.show()

    plt.scatter(range_n_clusters, all_silhouette_scores)
    plt.plot(range_n_clusters, all_silhouette_scores)
    plt.xticks(range_n_clusters)
    plt.title("Silhouette Scores of Clusters")
    plt.xlabel("Number of Clusters")
    plt.ylabel("Silhouette Scores")
    plt.savefig('./graphs/silhouette_scores.png')
    plt.show()
    data_df = pd.DataFrame(data)
    writer = pd.ExcelWriter(path)
    data_df.to_excel(writer, 'page_1', float_format='%.5f')  # float_format controls the precision
    writer.save()


data_path = 'D:/ZLW_data/SAMM/flow_cut_eye'
frame_list = read_img(data_path, 0)
data_set = np.empty([0, 256 * 3], dtype=np.float32)
for i in range(len(frame_list)):
    count_b, count_g, count_r = calc_bgr_count(frame_list[i])
    m = bgr_count_form(count_b, count_g, count_r)
    m = np.array([m])
    data_set = np.concatenate([data_set, m])
print(data_set.shape)

gmm = GaussianMixture(n_components=2).fit(data_set)
labels = gmm.predict(data_set)
save_data_to_excel(labels, 'D:/guass/labels.xlsx')

frame_list0 = []
frame_list1 = []
for i in range(labels.shape[0]):
    if labels[i] == 0:
        print(str(i) + ' is label 1')
        frame_list0.append(i)
for i in range(labels.shape[0]):
    if labels[i] == 1:
        print(str(i) + ' is label 0')
        frame_list1.append(i)
frame_list0 = np.transpose(np.array([frame_list0]))
frame_list1 = np.transpose(np.array([frame_list1]))
save_data_to_excel(frame_list0, 'D:/guass/frame00.xlsx')
def test_gaussian_mixture_attributes():
    # test bad parameters
    rng = np.random.RandomState(0)
    X = rng.rand(10, 2)

    n_components_bad = 0
    gmm = GaussianMixture(n_components=n_components_bad)
    assert_raise_message(ValueError,
                         "Invalid value for 'n_components': %d "
                         "Estimation requires at least one component"
                         % n_components_bad, gmm.fit, X)

    # covariance_type should be in [spherical, diag, tied, full]
    covariance_type_bad = 'bad_covariance_type'
    gmm = GaussianMixture(covariance_type=covariance_type_bad)
    assert_raise_message(ValueError,
                         "Invalid value for 'covariance_type': %s "
                         "'covariance_type' should be in "
                         "['spherical', 'tied', 'diag', 'full']"
                         % covariance_type_bad, gmm.fit, X)

    tol_bad = -1
    gmm = GaussianMixture(tol=tol_bad)
    assert_raise_message(ValueError,
                         "Invalid value for 'tol': %.5f "
                         "Tolerance used by the EM must be non-negative"
                         % tol_bad, gmm.fit, X)

    reg_covar_bad = -1
    gmm = GaussianMixture(reg_covar=reg_covar_bad)
    assert_raise_message(ValueError,
                         "Invalid value for 'reg_covar': %.5f "
                         "regularization on covariance must be "
                         "non-negative" % reg_covar_bad, gmm.fit, X)

    max_iter_bad = 0
    gmm = GaussianMixture(max_iter=max_iter_bad)
    assert_raise_message(ValueError,
                         "Invalid value for 'max_iter': %d "
                         "Estimation requires at least one iteration"
                         % max_iter_bad, gmm.fit, X)

    n_init_bad = 0
    gmm = GaussianMixture(n_init=n_init_bad)
    assert_raise_message(ValueError,
                         "Invalid value for 'n_init': %d "
                         "Estimation requires at least one run"
                         % n_init_bad, gmm.fit, X)

    init_params_bad = 'bad_method'
    gmm = GaussianMixture(init_params=init_params_bad)
    assert_raise_message(ValueError,
                         "Unimplemented initialization method '%s'"
                         % init_params_bad, gmm.fit, X)

    # test good parameters
    n_components, tol, n_init, max_iter, reg_covar = 2, 1e-4, 3, 30, 1e-1
    covariance_type, init_params = 'full', 'random'
    gmm = GaussianMixture(n_components=n_components, tol=tol, n_init=n_init,
                          max_iter=max_iter, reg_covar=reg_covar,
                          covariance_type=covariance_type,
                          init_params=init_params).fit(X)

    assert gmm.n_components == n_components
    assert gmm.covariance_type == covariance_type
    assert gmm.tol == tol
    assert gmm.reg_covar == reg_covar
    assert gmm.max_iter == max_iter
    assert gmm.n_init == n_init
    assert gmm.init_params == init_params
### Step-1 ###
oof = np.zeros(len(train))
pred = np.zeros(len(test))

for i in range(MAX_MAGIC_NO):
    print('.', end='')
    oof_i_list = []
    pred_i_list = []

    train_i = train[magic_tr == i][:, infomative_cols[i]]
    target_i = target[magic_tr == i]
    test_i = test[magic_te == i][:, infomative_cols[i]]

    for n in range(1, MAX_COMPONENTS):
        oof_i_n = np.zeros(len(train_i))
        pred_i_n = np.zeros(len(test_i))

        gmm0 = GaussianMixture(n_components=n, covariance_type='full',
                               random_state=RANDOM_SEED)
        gmm1 = GaussianMixture(n_components=n, covariance_type='full',
                               random_state=RANDOM_SEED)

        for trn_idx, val_idx in kfold.split(train_i, target_i):
            trn_train = train_i[trn_idx, :]
            trn_target = target_i[trn_idx]
            val_train = train_i[val_idx, :]

            gmm0.fit(trn_train[trn_target == 0])
            gmm1.fit(trn_train[trn_target == 1])

            oof_i_n[val_idx] = gmm1.score_samples(val_train) \
                - gmm0.score_samples(val_train)
            pred_i_n += (gmm1.score_samples(test_i)
                         - gmm0.score_samples(test_i)) / kfold.n_splits
GMMs will be trained separately on each class's TFIDF samples
'''
TFIDF_class = []
for class_num in range(1, 16):
    TFIDF_class.append(samples_from_class(TFIDFsvd, class_num, labels))

'''
GMM training
We train #classes = 15 GMMs to estimate the distribution of the features
Each row of the TFIDFsummed is a feature vector on which we train a GMM
'''
GMMS = []
for class_num in range(1, 16):
    # ATTENTION: indexes of TF go from 0 - 14
    # whereas the class numbers go from 1 - 15
    GMMS.append(
        GaussianMixture(n_components=gmm_components).fit(
            TFIDF_class[class_num - 1]))

'''
Testing
'''
test_labels = []
with open('data/final.test') as test_file:
    testsamples = test_file.readlines()
    num_of_test_data = 0  # count rows
    for line in testsamples:
        num_of_test_data += 1
        testwords = line.split()
        test_labels.append(testwords[0])
test_file.closed

'''
Find the term document matrix from the test data
x2 = np.random.multivariate_normal(mean=(-1, 10), cov=cov1, size=N2)
x = np.vstack((x1, x2))
y = np.array([0] * N1 + [1] * N2)

'''
spherical: one spherical (circular) variance per component
diag: diagonal covariance
tied: all components share the same covariance
full: each component may have a different full covariance
'''
types = ('spherical', 'diag', 'tied', 'full')
err = np.empty(len(types))
bic = np.empty(len(types))
for i, type in enumerate(types):
    gmm = GaussianMixture(n_components=2, covariance_type=type,
                          random_state=0)
    gmm.fit(x)
    err[i] = 1 - accuracy_rate(gmm.predict(x), y)
    bic[i] = gmm.bic(x)
print('error rate:', err.ravel())
print('BIC:', bic.ravel())

# plotting
xpos = np.arange(4)
ax = plt.axes()
# -0.3~0 || 0.7~1 || 1.7~2 || 2.7~3
b1 = ax.bar(xpos - 0.3, err, width=0.3, color='#77E0A0')
# 0~0.3 || 1~1.3 || 2~2.3 || 3~3.3
b2 = ax.twinx().bar(xpos, bic, width=0.3, color='#FF8080')
plt.grid(True)
bic_min, bic_max = expand(bic.min(), bic.max())
X = mat_data['X']
y = mat_data['y'].squeeze()
attributeNames = [name[0] for name in mat_data['attributeNames'].squeeze()]
classNames = [name[0][0] for name in mat_data['classNames']]

#X_old = X
#X = np.hstack([X,X])

N, M = X.shape
C = len(classNames)

# Number of clusters
K = 10
cov_type = 'full'  # type of covariance, you can try out 'diag' as well
reps = 1  # number of fits with different initializations, best result will be kept

# Fit Gaussian mixture model
gmm = GaussianMixture(n_components=K, covariance_type=cov_type,
                      n_init=reps).fit(X)
cls = gmm.predict(X)     # extract cluster labels
cds = gmm.means_         # extract cluster centroids (means of gaussians)
covs = gmm.covariances_  # extract cluster shapes (covariances of gaussians)

if cov_type.lower() == 'diag':
    new_covs = np.zeros([K, M, M])
    count = 0
    for elem in covs:
        temp_m = np.zeros([M, M])
        new_covs[count] = np.diag(elem)
        count += 1
#### rh
print('getting intensity values for the mask ....')
maskObj = nib.load(
    '{subjects_dir}/{subj}/mri/rh_choroid+ventricle_mask.nii.gz'.format(
        subjects_dir=subjects_dir, subj=subj))
mask = maskObj.get_data()
mask_indices = np.where(mask)
mask_indices_array = np.array(mask_indices)
mask_T1_vals = T1[mask_indices]

## GMM
X = np.reshape(mask_T1_vals, (-1, 1))
gmm = GaussianMixture(n_components=2, covariance_type='full').fit(X)
gmmb = BayesianGaussianMixture(n_components=2, covariance_type='full').fit(X)
save_segmentation(gmmb, 'rh_choroid_gmmb_mask.nii.gz')

## susan
input_img = '{subjects_dir}/{subj}/mri/rh_choroid_gmmb_mask.nii.gz'.format(
    subjects_dir=subjects_dir, subj=subj)
susan(input_img)

## read choroid_gmmb_mask_susan.nii.gz
choroid_gmmb_mask = nib.load(
    '{subjects_dir}/{subj}/mri/rh_choroid_gmmb_mask.nii.gz'.format(
        subjects_dir=subjects_dir, subj=subj))
choroid_gmmb_mask_ = choroid_gmmb_mask.get_data()
choroid_gmmb_susan = nib.load(
# PCA
LABEL_DIM = 10
x = X_gen.reshape(N, 28 * 28).detach().numpy()

from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.mixture import GaussianMixture

x_pca = PCA(n_components=2).fit_transform(x)
df_pca = pd.DataFrame(
    x_pca, columns=["principal component 1", "principal component 2"])
df_pca_labels = pd.concat(
    [df_pca, pd.DataFrame(np.array(Y_gen), columns=["labels"])], axis=1)

gmm_pred_labels = GaussianMixture(n_components=LABEL_DIM,
                                  reg_covar=1e-5).fit_predict(x)
df_pca_gmm_labels = pd.concat(
    [df_pca_labels, pd.DataFrame(gmm_pred_labels, columns=["gmm_labels"])],
    axis=1)

from collections import Counter

asgnd_gmm_labels = np.unique(np.array(
    df_pca_gmm_labels["gmm_labels"])).astype(int)
corr_gmm_labels = []
for i in asgnd_gmm_labels:
    most_common = Counter(
        df_pca_gmm_labels[df_pca_gmm_labels["gmm_labels"] == i]
        ["labels"]).most_common()[0][0]
        no = DF[DF['xAttack'] == C].shape[0]
        #print(no,Clus[i].shape[0])
        if (Maxi < no):
            Class = C
            Maxi = no
    impurity[i] = Maxi / DF.shape[0]
    purity[i].append(impurity[i])
print(impurity)

# In[ ]:

from sklearn.mixture import GaussianMixture

# In[ ]:

GMM = GaussianMixture(n_components=5).fit_predict(df1)
clustering_method.append('GMM')

# In[ ]:

#print(GMM)

# In[ ]:

Inp['predict'] = np.array(GMM) + 1

# In[ ]:

impurity = {}
for i in range(1, 6):
    Maxi = 0
def visualize_clusters(n_clusters, dim):
    data = pd.read_csv('./data/scaled.csv')
    x = data.values
    clusterer = GaussianMixture(n_clusters)
    cluster_labels = clusterer.fit_predict(x)
    scaler = StandardScaler().fit(pd.read_csv('./data/cleaned.csv'))

    cluster_label_means = []
    cluster_label_stds = []
    for n in range(n_clusters):
        print(
            f'Number of Plays in Cluster {n + 1}: {len(x[cluster_labels == n])}'
        )
        means = np.average(scaler.inverse_transform(x)[cluster_labels == n],
                           axis=0).round(4)
        stds = np.std(scaler.inverse_transform(x)[cluster_labels == n],
                      axis=0).round(4)
        cluster_label_means.append(means)
        cluster_label_stds.append(stds)

    DataFrame(cluster_label_means, columns=data.columns)\
        .to_csv("./data/Cluster_Means.csv")
    DataFrame(cluster_label_stds, columns=data.columns)\
        .to_csv("./data/Cluster_STDs.csv")

    # One dimension
    if dim == 1:
        tsne_1d = TSNE(n_components=1)
        pca_1d = PCA(n_components=1)
        tcs_1d = pd.DataFrame(tsne_1d.fit_transform(x))
        pcs_1d = pd.DataFrame(pca_1d.fit_transform(x))
        tcs_1d.columns = ["TC1_1d"]
        pcs_1d.columns = ["PC1_1d"]
        plot_x_tsne = pd.concat([data, tcs_1d], axis=1, join='inner')
        plot_x_pca = pd.concat([data, pcs_1d], axis=1, join='inner')
        plot_x_tsne["zero"] = 0
        plot_x_pca["zero"] = 0
    # Two dimensions
    elif dim == 2:
        tsne_2d = TSNE(n_components=2)
        pca_2d = PCA(n_components=2)
        tcs_2d = pd.DataFrame(tsne_2d.fit_transform(x))
        pcs_2d = pd.DataFrame(pca_2d.fit_transform(x))
        tcs_2d.columns = ["TC1_2d", "TC2_2d"]
        pcs_2d.columns = ["PC1_2d", "PC2_2d"]
        plot_x_tsne = pd.concat([data, tcs_2d], axis=1, join='inner')
        plot_x_pca = pd.concat([data, pcs_2d], axis=1, join='inner')
    # Three dimensions
    elif dim == 3:
        tsne_3d = TSNE(n_components=3)
        pca_3d = PCA(n_components=3)
        tcs_3d = pd.DataFrame(tsne_3d.fit_transform(x))
        pcs_3d = pd.DataFrame(pca_3d.fit_transform(x))
        tcs_3d.columns = ["TC1_3d", "TC2_3d", "TC3_3d"]
        pcs_3d.columns = ["PC1_3d", "PC2_3d", "PC3_3d"]
        plot_x_tsne = pd.concat([data, tcs_3d], axis=1, join='inner')
        plot_x_pca = pd.concat([data, pcs_3d], axis=1, join='inner')
    else:
        print("Invalid Dimension...")
        return

    graph_colors = [
        'red', 'blue', 'green', 'yellow', 'pink', 'purple', 'black',
        'lightskyblue', 'orange', 'darkred', 'salmon', 'cyan', 'lime',
        'slategray', 'teal', 'peru', 'orchid', 'crimson', 'thistle',
        'lavender'
    ]

    make_visualization('T', dim, n_clusters, plot_x_tsne, cluster_labels,
                       graph_colors)
    make_visualization('P', dim, n_clusters, plot_x_pca, cluster_labels,
                       graph_colors)
            yticklabels=digits.target_names)
plt.xlabel('true label')
plt.ylabel('predicted label')
plt.show()

# check for accuracy of the classification
accuracy_score(y_test, labels)

##########################################################################################
########################### GMM model ####################################################
##########################################################################################

data = X_train.data
# np.random.seed(1)

# Your code here
gmm_model = GMM(n_components=10, covariance_type='full', random_state=1)
gmm_model.fit(data)
print(gmm_model.converged_)

# Extract the means as well as the covariances
# Your code here
mns = gmm_model.means_
covs = gmm_model.covariances_

# Reshape the images
im = mns.reshape(10, 8, 8)

# Don't change this code
# Figure size in inches
fig = plt.figure(figsize=(8, 3))
of operators is for model *GaussianMixture*.
"""
import os
from timeit import timeit
import numpy as np
import matplotlib.pyplot as plt
from onnx.tools.net_drawer import GetPydotGraph, GetOpNodeProducer
from onnxruntime import InferenceSession
from sklearn.mixture import GaussianMixture
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from skl2onnx import to_onnx

data = load_iris()
X_train, X_test = train_test_split(data.data)
model = GaussianMixture()
model.fit(X_train)

###################################
# Default conversion
# ++++++++++++++++++

model_onnx = to_onnx(model, X_train[:1].astype(np.float32),
                     options={id(model): {'score_samples': True}},
                     target_opset=12)
sess = InferenceSession(model_onnx.SerializeToString())

xt = X_test[:5].astype(np.float32)
def test_warm_start(seed):
    random_state = seed
    rng = np.random.RandomState(random_state)
    n_samples, n_features, n_components = 500, 2, 2
    X = rng.rand(n_samples, n_features)

    # Assert the warm_start give the same result for the same number of iter
    g = GaussianMixture(n_components=n_components, n_init=1, max_iter=2,
                        reg_covar=0, random_state=random_state,
                        warm_start=False)
    h = GaussianMixture(n_components=n_components, n_init=1, max_iter=1,
                        reg_covar=0, random_state=random_state,
                        warm_start=True)

    g.fit(X)
    score1 = h.fit(X).score(X)
    score2 = h.fit(X).score(X)

    assert_almost_equal(g.weights_, h.weights_)
    assert_almost_equal(g.means_, h.means_)
    assert_almost_equal(g.precisions_, h.precisions_)
    assert score2 > score1

    # Assert that by using warm_start we can converge to a good solution
    g = GaussianMixture(n_components=n_components, n_init=1,
                        max_iter=5, reg_covar=0, random_state=random_state,
                        warm_start=False, tol=1e-6)
    h = GaussianMixture(n_components=n_components, n_init=1,
                        max_iter=5, reg_covar=0, random_state=random_state,
                        warm_start=True, tol=1e-6)

    g.fit(X)
    assert not g.converged_

    h.fit(X)
    # depending on the data there is large variability in the number of
    # refits necessary to converge due to the complete randomness of the
    # data
    for _ in range(1000):
        h.fit(X)
        if h.converged_:
            break
    assert h.converged_
def fit_GMM(n):
    model = GaussianMixture(n, covariance_type='full',
                            random_state=0).fit(training_data)
    pickle.dump(model, open(outdirr + savename + str(n) + '.gmm', 'wb'))
    return
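# A minimal sketch of loading a model saved by fit_GMM above; the path
# pieces `outdirr` and `savename` are the same module-level names used there:
import pickle

def load_GMM(n):
    with open(outdirr + savename + str(n) + '.gmm', 'rb') as f:
        return pickle.load(f)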
print("Auto-Encoder with GMM Clustering") k = 10 # Number of clusters print("Loading dataset...") ((x_train, y_train), (x_test, y_test)) = keras.datasets.fashion_mnist.load_data() x_train = np.reshape(x_train, (x_train.shape[0], 784)) x_train = x_train / 255.0 x_test = np.reshape(x_test, (x_test.shape[0], 784)) x_test = x_test / 255.0 # Use auto encoder to reduce dimensionality, returns compressed rep of x_train, x_test cx_train, cx_test = autoencode(x_train, x_test) # Perform GMM clustering print("Training GMM...") gmm = GaussianMixture(n_components=k) gmm.fit(cx_train) print("Clustering training data...") clusterAssmentTrain = gmm.predict(cx_train) print("Clustering test data...") clusterAssmentTest = gmm.predict(cx_test) print("Done!") # Compute Metrics print("Training Metrics:") evaluate_clusters(10, clusterAssmentTrain, y_train) print("Testing Metrics:") evaluate_clusters(10, clusterAssmentTest, y_test) plt.show()
from sklearn.mixture import GaussianMixture as GMM
import matplotlib.pyplot as plt
import numpy as np

if __name__ == "__main__":
    X = getAdultX()
    y = getAdultY()

    tester = emtc.ExpectationMaximizationTestCluster(X, y,
                                                     clusters=range(1, 11),
                                                     plot=True,
                                                     targetcluster=2,
                                                     stats=True)
    tester.run()

    # plot clustering
    gmm = GMM(covariance_type='diag', n_components=3)
    model = gmm.fit(X)
    labels = model.predict(X)

    # View the results
    # Set the size of the plot
    plt.figure(figsize=(14, 7))

    # Create a colormap
    colormap = np.array(['red', 'lime', 'black', 'blue', 'yellow'])

    x1 = X.iloc[:, 0]
    x2 = X.iloc[:, 1]
    plt.scatter(x=x1, y=x2, c=colormap[labels], s=40)
    plt.title('adult EM Classification')
class OWCK(GaussianProcess_extra):
    """The Optimal Weighted Cluster Kriging/Gaussian Process class

    This class inherited from GaussianProcess class in sklearn library
    Most of the parameters are contained in sklearn.gaussian_process.

    Please check the docstring of Gaussian Process parameters in sklearn.
    Only newly introduced parameters are documented below.

    Parameters
    ----------
    n_cluster : int, optional
        The number of clusters, determines the number of the Gaussian Process
        model to build. It is the speed-up factor in OWCK.
    min_leaf_size : int, optional
        if min_leaf_size > 0, min_leaf_size is used to determine the number
        of clusters for the model tree clustering method.
    cluster_method : string, optional
        The clustering algorithm used to partition the data set.
        Built-in clustering algorithms are:
            'k-mean', 'GMM', 'fuzzy-c-mean', 'random', 'tree'
        Note that GMM and fuzzy-c-mean are fuzzy clustering algorithms;
        with these algorithms you can set the overlap you desire.
        Tree is a non-fuzzy algorithm using local models per leaf in a
        regression tree. The tree algorithm is also able to update the model
        with new records.
    overlap : float, optional
        The percentage of overlap when using a fuzzy cluster method.
        Each cluster will be of the same size.
    is_parallel : boolean, optional
        A boolean switching parallel model fitting on. If it is True, then
        all the underlying Gaussian Process models will be fitted in
        parallel, supported by MPI. Otherwise, all the models will be fitted
        sequentially.

    Attributes
    ----------
    cluster_label : the cluster label of the training set after clustering
    clusterer : the clustering algorithm used.
    models : a list of (fitted) Gaussian Process models built on each cluster.

    References
    ----------
    .. [SWKBE15] `Bas van Stein, Hao Wang, Wojtek Kowalczyk, Thomas Baeck and
        Michael Emmerich. Optimally Weighted Cluster Kriging for Big Data
        Regression. In 14th International Symposium, IDA 2015, pages 310-321,
        2015`
        http://link.springer.com/chapter/10.1007%2F978-3-319-24465-5_27#
    """

    def __init__(self, regr='constant', corr='squared_exponential',
                 n_cluster=8, min_leaf_size=0, cluster_method='k-mean',
                 overlap=0.0, beta0=None, storage_mode='full', verbose=False,
                 theta0=0.1, thetaL=None, thetaU=None, sigma2=None,
                 optimizer='BFGS', random_start=1, normalize=False,
                 nugget=10. * MACHINE_EPSILON, random_state=None,
                 nugget_estim=True, is_parallel=False):

        super(OWCK, self).__init__(regr=regr, corr=corr, beta0=beta0,
                                   verbose=verbose, theta0=theta0,
                                   thetaL=thetaL, thetaU=thetaU,
                                   sigma2=sigma2, optimizer=optimizer,
                                   random_start=random_start,
                                   normalize=normalize, nugget=nugget,
                                   nugget_estim=nugget_estim,
                                   random_state=random_state)

        self.empty_model = GaussianProcess_extra(
            regr=regr, corr=corr, beta0=beta0, verbose=verbose,
            theta0=theta0, thetaL=thetaL, thetaU=thetaU, sigma2=sigma2,
            optimizer=optimizer, random_start=random_start,
            normalize=normalize, nugget=nugget, nugget_estim=nugget_estim,
            random_state=random_state)

        self.n_cluster = n_cluster
        self.is_parallel = is_parallel
        self.verbose = verbose
        self.overlap = overlap  # overlap for fuzzy clusters
        self.min_leaf_size = min_leaf_size
        self.regr_label = regr
        self.fitted = False

        if cluster_method not in ['k-mean', 'GMM', 'fuzzy-c-mean', 'random',
                                  'tree']:
            raise Exception(
                '{} clustering is not supported!'.format(cluster_method))
        else:
            self.cluster_method = cluster_method

    def __clustering(self, X, y=None):
        """
        The clustering procedure of the Optimal Weighted Clustering
        Gaussian Process. This function should not be called externally
        """
        self.sizeX = len(X)
        if self.cluster_method == 'k-mean':
            clusterer = KMeans(n_clusters=self.n_cluster)
            clusterer.fit(X)
            self.cluster_label = clusterer.labels_
            self.clusterer = clusterer
        elif self.cluster_method == 'tree':
            if self.min_leaf_size > 0:
                self.minsamples = self.min_leaf_size
                tree = IncrementalRegressionTree(
                    min_samples_leaf=self.min_leaf_size)
            else:
                self.minsamples = int(len(X) / (self.n_cluster))
                tree = IncrementalRegressionTree(
                    min_samples_leaf=self.minsamples)

            tree.fit(X, y)
            labels = tree.apply(X)
            clusters = np.unique(labels)
            k = len(clusters)
            if self.verbose:
                print("leafs:", k)
            self.n_cluster = k
            self.leaf_labels = np.unique(labels)
            self.cluster_label = labels
            self.clusterer = tree
        elif self.cluster_method == 'random':
            r = self.n_sample % self.n_cluster
            m = (self.n_sample - r) // self.n_cluster  # integer division (Python 3)
            self.cluster_label = array(list(range(self.n_cluster)) * m
                                       + list(range(r)))
            self.clusterer = None
            shuffle(self.cluster_label)
        elif self.cluster_method == 'GMM':  # GMM from sklearn
            self.clusterer = GaussianMixture(n_components=self.n_cluster,
                                             n_init=10)
            self.clusterer.fit(X)
            self.cluster_labels_proba = self.clusterer.predict_proba(X)
            self.cluster_label = self.clusterer.predict(X)
        elif self.cluster_method == 'fuzzy-c-mean':
            # Fuzzy C-means from skfuzzy
            cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(
                X.T, self.n_cluster, 2, error=0.000005, maxiter=10000,
                init=None)
            self.clusterer = cntr  # save the centers for cmeans_predict
            self.cluster_labels_proba = u.T
            self.cluster_labels_proba = np.array(self.cluster_labels_proba)
            self.cluster_label = np.argmax(u, axis=0)
            self.cluster_label = np.array(self.cluster_label)

    def __fit(self, X, y):
        """
        The Optimal Weighted Cluster Gaussian Process model fitting method.

        Parameters
        ----------
        X : double array_like
            An array with shape (n_samples, n_features) with the input at
            which observations were made.
        y : double array_like
            An array with shape (n_samples, ) or shape (n_samples, n_targets)
            with the observations of the output to be predicted.

        Returns
        -------
        ocwk : self
            A fitted Cluster Gaussian Process model object awaiting data to
            perform predictions.
        """
        self.n_sample, self.n_feature = X.shape

        if y.shape[0] != self.n_sample:
            raise Exception('Training input and target do not match!')

        # clustering
        self.__clustering(X, y)

        # model creation
        self.models = None
        if self.cluster_method == 'tree':
            self.models = [deepcopy(self) for i in self.leaf_labels]
        else:
            self.models = [deepcopy(self) for i in range(self.n_cluster)]

        for m in self.models:
            m.__class__ = GaussianProcess_extra

        self.X = X
        self.y = y

        # model fitting
        if self.is_parallel:  # parallel model fitting
            # now using parallel threading
            # prepare the training set for each GP model
            if (self.cluster_method == 'k-mean'
                    or self.cluster_method == 'random'):
                idx = [self.cluster_label == i for i in range(self.n_cluster)]
            elif self.cluster_method == 'tree':
                idx = [self.cluster_label == self.leaf_labels[i]
                       for i in range(self.n_cluster)]
                if self.verbose:
                    print("len cluster", len(idx))
            else:
                targetMemberSize = (len(self.X) / self.n_cluster) \
                    * (1.0 + self.overlap)
                idx = []
                minindex = np.argmin(self.y)
                maxindex = np.argmax(self.y)
                for i in range(self.n_cluster):
                    idx_temp = np.argsort(
                        self.cluster_labels_proba[:, i])[-int(targetMemberSize):]
                    if minindex not in idx_temp:
                        idx_temp = np.hstack((idx_temp, [minindex]))
                    if maxindex not in idx_temp:
                        idx_temp = np.hstack((idx_temp, [maxindex]))
                    idx.append(idx_temp)

            training = [(X[index, :], y[index]) for index in idx]
            training_set = zip(range(self.n_cluster),
                               deepcopy(self.models), training)

            pool = Pool(self.n_cluster)
            models = pool.map(train_modelstar, training_set)
            pool.close()
            pool.join()
            self.models = models
            #print models
            #
            '''
            raise Exception('Parallel mode has been disabled for now.')

            # spawning processes...
            #os.chdir('/home/wangronin')
            comm = MPI.COMM_SELF.Spawn(sys.executable,
                                       args=['-m', 'owck.OWCK_slave'],
                                       maxprocs=self.n_cluster)

            # prepare the training set for each GP model
            if (self.cluster_method=='k-mean' or self.cluster_method=='random'):
                idx = [self.cluster_label == i for i in range(self.n_cluster)]
            elif (self.cluster_method=='tree'):
                idx = [self.cluster_label == self.leaf_labels[i] for i in range(self.n_cluster)]
                if (verbose):
                    print "len cluster",len(idx)
            else:
                targetMemberSize = (len(self.X) / self.n_cluster)*(1.0+self.overlap)
                idx = []
                minindex = np.argmin(self.y)
                maxindex = np.argmax(self.y)
                for i in range(self.n_cluster):
                    idx_temp = np.argsort(self.cluster_labels_proba[:,i])[-targetMemberSize:]
                    if (minindex not in idx_temp):
                        idx_temp = np.hstack((idx_temp,[minindex]))
                    if (maxindex not in idx_temp):
                        idx_temp = np.hstack((idx_temp,[maxindex]))
                    idx.append(idx_temp)

            training_set = [(X[index, :], y[index]) for index in idx]

            # scatter the models and data
            comm.scatter([(k, training_set[k]) \
                for k in range(self.n_cluster)], root=MPI.ROOT)
            comm.scatter(self.models, root=MPI.ROOT)

            # Synchronization while the slave process are performing
            # heavy computations...
            comm.Barrier()

            # Gather the fitted model from the children processes
            # Note that 'None' is only valid in master-slave working mode
            results = comm.gather(None, root=MPI.ROOT)

            # keep the fitted model aligned with their cluster
            fitted = DataFrame([[d['index'], d['model']] \
                for d in results], columns=['index', 'model'])
            fitted.sort('index', ascending=[True], inplace=True)

            self.models[:] = fitted['model']

            # free all slave processes
            comm.Disconnect()
            '''
        else:  # sequential model fitting
            # get min and max value indexes such that no cluster gets
            # only one value instances.
            # minindex = np.argmin(self.training_y)
            # maxindex = np.argmax(self.training_y)

            for i in range(self.n_cluster):
                if (self.cluster_method == 'k-mean'
                        or self.cluster_method == 'random'):
                    idx = self.cluster_label == i
                elif self.cluster_method == 'tree':
                    idx = self.cluster_label == self.leaf_labels[i]
                else:
                    targetMemberSize = (len(self.X) / self.n_cluster) \
                        * (1.0 + self.overlap)
                    idx = []
                    minindex = np.argmin(self.y)
                    maxindex = np.argmax(self.y)
                    # TODO: fix line here
                    idx = np.argsort(
                        self.cluster_labels_proba[:, i])[-int(targetMemberSize):]
                    if minindex not in idx:
                        idx = np.hstack((idx, [minindex]))
                    if maxindex not in idx:
                        idx = np.hstack((idx, [maxindex]))

                model = self.models[i]
                # TODO: discuss this will introduce overlapping samples
                # idx[minindex] = True
                # idx[maxindex] = True

                # dirty fix so that low nugget errors will increase the
                # nugget till the model fits
                while True:
                    try:
                        # super is needed here to call the 'fit' function in
                        # the parent class (GaussianProcess_extra)
                        if self.cluster_method == 'tree' and self.verbose:
                            print('leaf: ', self.leaf_labels[i])
                        length_lb = 1e-10
                        length_ub = 1e2
                        X = self.X[idx, :]
                        x_lb, x_ub = X.min(0), X.max(0)
                        model.thetaL = length_ub ** -2. / (x_ub - x_lb) ** 2. \
                            * np.ones(self.n_feature)
                        model.thetaU = length_lb ** -2. / (x_ub - x_lb) ** 2. \
                            * np.ones(self.n_feature)
                        model.fit(self.X[idx, :], self.y[idx])
                        break
                    except Exception as e:
                        print(e)
                        if self.verbose:
                            print('Current nugget setting is too small!' +
                                  ' It will be tuned up automatically')
                        #pdb.set_trace()
                        model.noise_var *= 10

    def gradient(self, x):
        """
        Calculate the gradient of the posterior mean and variance
        """
        check_is_fitted(self, 'X')
        x = np.atleast_2d(x)

        if self.cluster_method == 'tree':
            idx = self.clusterer.apply(x.reshape(1, -1))[0]
            active_GP_idx = np.nonzero(self.leaf_labels == idx)[0][0]
            active_GP = self.models[active_GP_idx]

            y_dx, mse_dx = active_GP.gradient(x)

        elif self.cluster_method == 'GMM':
            # TODO: implement this
            pass

        elif self.cluster_method in ['random', 'k-mean']:
            par = {}
            _ = self.predict(x, eval_MSE=False, par_out=par)

            weights = par['weights'].reshape(-1, 1)
            y = par['y'].reshape(-1, 1)
            mse = par['mse'].reshape(-1, 1)
            normalized_mse = par['mse_normalized'].reshape(-1, 1)
            U = par['U'].reshape(-1, 1)

            y_jac, mse_jac = zip(*[model.gradient(x) for model in self.models])
            y_jac, mse_jac = np.c_[y_jac], np.c_[mse_jac]

            M = (1. / normalized_mse).sum()
            tmp = np.dot(mse_jac, (1. / normalized_mse ** 2.) / U)

            weights_jacobian = -mse_jac * normalized_mse.T ** -2. / U.T / M \
                + (np.repeat(tmp, len(weights), axis=1)
                   / normalized_mse.T) / M ** 2.

            y_dx = np.dot(y_jac, weights) + np.dot(weights_jacobian, y)
            mse_dx = np.dot(mse_jac, weights ** 2.) \
                + np.dot(weights.T * weights_jacobian, mse)

        return y_dx, mse_dx

    def __mse_upper_bound(self, model):
        """
        This function computes the tight upper bound of the Mean Square Error
        (Kriging variance) for the underlying Posterior Gaussian Process
        model, whose usage should be subject to Simple or Ordinary Kriging
        (constant trend)

        Parameters
        ----------
        model : a fitted Gaussian Process/Kriging model, in which
                'self.regr' should be 'constant'

        Returns
        ----------
        upper_bound : the upper bound of the Mean Squared Error
        """

        if self.regr_label != 'constant':
            raise Exception('MSE upper bound only exists for constant trend')

        C = model.C
        if C is None:
            # Light storage mode (need to recompute C, F, Ft and G)
            if model.verbose:
                print("This GaussianProcess used 'light' storage mode "
                      "at instantiation. Need to recompute "
                      "autocorrelation matrix...")
            _, par = model.reduced_likelihood_function()
            model.C = par['C']
            model.Ft = par['Ft']
            model.G = par['G']

        n_samples, n_features = model.X.shape
        tmp = 1 / model.G ** 2

        upper_bound = np.sum(model.sigma2 * (1 + tmp))
        return upper_bound

    def __check_duplicate(self, X, y):
        # TODO: show a warning here
        X = np.atleast_2d(X)
        new_X = []
        new_Y = []
        for i, x in enumerate(X):
            idx = np.nonzero(np.all(np.isclose(self.X, x), axis=1))[0]
            if len(idx) == 0:
                new_X.append(x)
                new_Y.append(y[i])

            if y[i] != self.y[idx]:
                raise Exception(
                    'The same input can not have different responses!')

        return np.array(new_X), new_Y

    def updateModel(self, newX, newY):
        """
        Deprecated function, just call fit with new database.
        """
        newY = newY.reshape(-1, 1)
        #print newY.shape, self.y.shape
        X = np.r_[self.X, newX]
        y = np.r_[self.y, newY]
        self.fit(X, y)

    def update_data(self, X, y):
        self.X = X
        self.y = y

        # note that the clusters are not rebuilt
        if self.cluster_method == 'tree':
            self.cluster_label = self.clusterer.apply(self.X)

            for i, model in enumerate(self.models):
                idx = self.cluster_label == self.leaf_labels[i]
                if not np.any(idx):
                    raise Exception(
                        'No data point in cluster {}!'.format(i + 1))

                model.update_data(self.X[idx, :], self.y[idx])
        else:
            # TODO: to implement for the rest options
            pass

        return self

    def fit(self, newX, newY, re_estimate_all=False):
        """
        Add several instances to the data and rebuild models
        newX is a 2d array of (instances, features) and newY a vector
        """
        if not hasattr(self, 'X'):
            self.__fit(newX, newY)
            return

        newX, newY = self.__check_duplicate(newX, newY)

        if self.cluster_method == 'tree':
            # first update our data
            if len(newY) != 0:
                self.X = np.r_[self.X, newX]
                self.y = np.r_[self.y, newY]
                #self.X = np.append(self.X, newX, axis=0)
                #self.y = np.append(self.y, newY)

            # check the size of the new data
            if re_estimate_all:
                # in this case rebuild all models
                if self.verbose:
                    print("refitting all models")
                self.__fit(self.X, self.y)
            elif len(self.X) > (self.sizeX + self.minsamples * 2.0):
                # in this case build additional models if needed
                if self.verbose:
                    print("refitting new models")
                    #print("Current tree")
                    #print(self.clusterer)
                rebuildmodels = np.unique(self.clusterer.apply(newX))
                rebuildmodelstemp = []
                rebuild_index = 0
                self.cluster_label = self.clusterer.apply(self.X)
                new_leaf_labels = []
                for i in rebuildmodels:
                    leafindex = np.where(self.leaf_labels == i)[0][0]

                    idx = self.cluster_label == i
                    # check size of model
                    if len(idx) > self.minsamples * 2.0:
                        if self.verbose:
                            print("Trying to split leaf node", i)
                        # split the leaf and fit 2 additional models
                        new_labels = []
                        if self.clusterer.split_terminal(i, self.X[idx, :],
                                                         self.y[idx]):
                            self.cluster_label = self.clusterer.apply(self.X)
                            new_labels = np.unique(self.cluster_label)
                            self.n_cluster = len(new_labels)
                            delete_old = False
                            for n in new_labels:
                                if n not in self.leaf_labels:
                                    delete_old = True
                                    new_leafindex = \
                                        np.where(new_labels == n)[0][0]
                                    if self.verbose:
                                        print("New model with id",
                                              new_leafindex)
                                    #print self.leaf_labels
                                    new_model = deepcopy(self.empty_model)
                                    self.models.append(new_model)
                                    self.leaf_labels = np.append(
                                        self.leaf_labels, n)
                                    #rebuildmodelstemp.append(new_leafindex)
                                    new_leaf_labels.append(n)
                            if delete_old:
                                self.leaf_labels = np.delete(
                                    self.leaf_labels, leafindex)
                                del (self.models[leafindex])
                            #if self.verbose:
                                #print("New tree")
                                #print(self.clusterer)
                                #print self.leaf_labels
                    else:
                        # just refit this model
                        #rebuildmodelstemp.append(leafindex)
                        new_leaf_labels.append(i)

                for n in new_leaf_labels:
                    rebuildmodelstemp.append(
                        np.where(self.leaf_labels == n)[0][0])

                rebuildmodels = np.unique(
                    np.array(rebuildmodelstemp, dtype=int))
                labels = self.clusterer.apply(self.X)
                self.cluster_label = labels
                self.leaf_labels = np.unique(labels)
                for i in rebuildmodels:
                    idx = self.cluster_label == self.leaf_labels[i]
                    if self.verbose:
                        print("updating model on position " + str(i)
                              + " attached to leaf id "
                              + str(self.leaf_labels[i]) + " and "
                              + str(sum(idx)) + " data points")
                    model = self.models[i]
                    while True:
                        try:
                            # super is needed here to call the 'fit' function
                            # in the parent class (GaussianProcess)
                            model.fit(self.X[idx, :], self.y[idx])
                            break
                        except ValueError:
                            if self.verbose:
                                print('Current nugget setting is too small!' +
                                      ' It will be tuned up automatically')
                            model.nugget *= 10
            else:
                rebuildmodels = np.unique(self.clusterer.apply(newX))
                rebuildmodelstemp = []
                for i in rebuildmodels:
                    rebuildmodelstemp.append(
                        np.where(self.leaf_labels == i)[0][0])
                rebuildmodels = np.array(rebuildmodelstemp, dtype=int)
                labels = self.clusterer.apply(self.X)
                self.cluster_label = labels
                if self.is_parallel:  # parallel model fitting
                    idx = [self.cluster_label == self.leaf_labels[i]
                           for i in rebuildmodels]
                    modelstosend = [deepcopy(self.models[i])
                                    for i in rebuildmodels]
                    training = [(self.X[index, :], self.y[index])
                                for index in idx]
                    training_set = zip(rebuildmodels, modelstosend, training)

                    pool = Pool(self.n_cluster)
                    models = pool.map(train_modelstar, training_set)
                    pool.close()
                    pool.join()
                    for i in range(len(rebuildmodels)):
                        self.models[rebuildmodels[i]] = models[i]
                else:  # is_parallel = false
                    for i in rebuildmodels:
                        if self.verbose:
                            print("updating model " + str(i))
                        idx = self.cluster_label == self.leaf_labels[i]
                        model = self.models[i]
                        while True:
                            try:
                                # super is needed here to call the 'fit'
                                # function in the parent class
                                # (GaussianProcess)
                                model.fit(self.X[idx, :], self.y[idx])
                                break
                            except ValueError:
                                if self.verbose:
                                    print('Current nugget setting is too '
                                          'small! It will be tuned up '
                                          'automatically')
                                model.nugget *= 10
        else:
            # rebuild all models
            self.X = np.r_[self.X, newX]
            self.y = np.r_[self.y, newY]
            self.__fit(self.X, self.y)

    # TODO: implementing batch_size option to reduce the memory usage
    def predict(self, X, eval_MSE=False, par_out=None):
        """
        This function evaluates the Optimal Weighted Gaussian Process model
        at x.

        Parameters
        ----------
        X : array_like
            An array with shape (n_eval, n_features) giving the point(s) at
            which the prediction(s) should be made.
        eval_MSE : boolean, optional
            A boolean specifying whether the Mean Squared Error should be
            evaluated or not. Default assumes evalMSE = False and evaluates
            only the BLUP (mean prediction).
        batch_size : integer, Not available yet
            An integer giving the maximum number of points that can be
            evaluated simultaneously (depending on the available memory).
            Default is None so that all given points are evaluated at the
            same time.

        Returns
        -------
        y : array_like, shape (n_samples, ) or (n_samples, n_targets)
            An array with shape (n_eval, ) if the Gaussian Process was
            trained on an array of shape (n_samples, ) or an array with
            shape (n_eval, n_targets) if the Gaussian Process was trained on
            an array of shape (n_samples, n_targets) with the Best Linear
            Unbiased Prediction at x.
        MSE : array_like, optional (if eval_MSE == True)
            An array with shape (n_eval, ) or (n_eval, n_targets) as with y,
            with the Mean Squared Error at x.
        """
        X = np.atleast_2d(X)
        X = X.T if size(X, 1) != self.n_feature else X

        n_eval, n_feature = X.shape

        if n_feature != self.n_feature:
            raise Exception('Dimensionality does not match!')

        if self.cluster_method == 'tree':
            pred = np.zeros(n_eval)
            if eval_MSE:
                mse = np.zeros(n_eval)
            for i, x in enumerate(X):
                # modelindex = self.clusterer
                ix = self.clusterer.apply(x.reshape(1, -1))
                model = self.models[np.where(self.leaf_labels == ix)[0][0]]
                _ = model.predict(x.reshape(1, -1), eval_MSE)
                if eval_MSE:
                    pred[i], mse[i] = _
                else:
                    pred[i] = _

            if eval_MSE:
                return pred, mse
            else:
                return pred

        elif self.cluster_method in ['random', 'k-mean']:
            # compute predictions and MSE from all underlying GP models
            # super is needed here to call the 'predict' function in the
            # parent class
            res = array([model.predict(X, eval_MSE=True)
                         for model in self.models])

            # compute the upper bound of MSE from all underlying GP models
            mse_upper_bound = array([self.__mse_upper_bound(model)
                                     for model in self.models])

            if np.any(mse_upper_bound == 0):
                raise Exception('Something weird happened!')

            pred, mse = res[:, 0, :], res[:, 1, :]
            normalized_mse = mse / mse_upper_bound.reshape(-1, 1)

            # inverse of the MSE matrices
            Q_inv = [diag(1.0 / normalized_mse[:, i]) for i in range(n_eval)]

            _ones = ones(self.n_cluster)
            weight = lambda Q_inv: dot(_ones, Q_inv)
            normalizer = lambda Q_inv: dot(dot(_ones, Q_inv),
                                           _ones.reshape(-1, 1))

            # compute the weights of convex combination
            weights = array([weight(q_inv) / normalizer(q_inv)
                             for q_inv in Q_inv])

            # make sure the weights sum to 1...
            if np.any(abs(np.sum(weights, axis=1) - 1.0) > 1e-8):
                raise Exception('Computed weights do not sum to 1!')

            # convex combination of predictions from the underlying GP models
            pred_combined = array([inner(pred[:, i], weights[i, :])
                                   for i in range(n_eval)])

            if par_out is not None:
                par_out['weights'] = weights
                par_out['y'] = pred
                par_out['mse'] = mse
                par_out['mse_normalized'] = normalized_mse
                par_out['U'] = mse / normalized_mse

            # if overall MSE is needed
            if eval_MSE:
                mse_combined = array([inner(mse[:, i], weights[i, :] ** 2)
                                      for i in range(n_eval)])
                return pred_combined, mse_combined
            else:
                return pred_combined

        elif self.cluster_method == 'GMM':
            # TODO: implement the MSE calculation for 'GMM' approach:
            # mixture of Gaussian processes
            pass
from collections import OrderedDict

import numpy as np
from scipy import sparse
from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin
from sklearn.mixture import GaussianMixture

# IdfStoredCountVectorizer and logger are assumed to be defined elsewhere in
# the module.


class Scdv(BaseEstimator, ClusterMixin, TransformerMixin):
    """Implementation of SCDV, https://dheeraj7596.github.io/SDV/"""

    # TODO: accept args for the idf vectorizer
    def __init__(self, word_emb_func=None, stop_words=None,
                 sparse_threshold_p=0.04, n_components=50,
                 covariance_type="full", tol=0.001, reg_covar=1e-06,
                 max_iter=100, n_init=1, init_params="kmeans",
                 weights_init=None, means_init=None, precisions_init=None,
                 random_state=None, warm_start=False, verbose=0,
                 verbose_interval=10):
        # pass the GMM parameters by keyword; recent scikit-learn versions
        # no longer accept them positionally
        self.semantic_soft_clustering = GaussianMixture(
            n_components=n_components, covariance_type=covariance_type,
            tol=tol, reg_covar=reg_covar, max_iter=max_iter, n_init=n_init,
            init_params=init_params, weights_init=weights_init,
            means_init=means_init, precisions_init=precisions_init,
            random_state=random_state, warm_start=warm_start,
            verbose=verbose, verbose_interval=verbose_interval)
        super(Scdv, self).__init__()
        self._tfidf_vectorizer = IdfStoredCountVectorizer(
            stop_words=stop_words, use_idf=True, smooth_idf=True)
        self.sparse_threshold_p = sparse_threshold_p
        self._sparse_threshold_ratio = None
        self.sparse_threshold = None
        self._word_emb_func = word_emb_func if word_emb_func else lambda x: x
        self._word_embs = {}
        self._word_topic_vecs = None

    def __getstate__(self):
        state = self.__dict__.copy()
        # Remove the unpicklable entries.
        del state['_word_emb_func']
        return state

    def to_idf(self, index):
        word_index = index
        if isinstance(index, str):
            word_index = self._tfidf_vectorizer.vocabulary_[index]
        return self._tfidf_vectorizer.idf_[word_index]

    def fit(self, docs, y=None):
        logger.info("creating dictionary and computing idf...")
        self._tfidf_vectorizer.fit(docs)
        self._word_embs = OrderedDict({
            index: self._word_emb_func(word)
            for word, index in sorted(
                self._tfidf_vectorizer.vocabulary_.items(),
                key=lambda key_value: key_value[1])
        })
        word_vecs = np.vstack(list(self._word_embs.values()))
        logger.info("clustering in-vocabulary words (size: %d) ...",
                    len(word_vecs))
        self.semantic_soft_clustering.fit(word_vecs)
        logger.info("getting word-topic vectors...")
        self._word_topic_vecs = self._to_word_topic_vectors(word_vecs)
        logger.info("computing threshold to make sparse...")
        self._compute_sparse_threshold_ratio(self.transform_into_ncdv(docs))
        logger.info("fitting has finished!")
        return self

    def transform(self, raw_documents, should_compress=True):
        return self._make_sparse(self.transform_into_ncdv(raw_documents),
                                 should_compress)

    def fit_transform(self, raw_documents, y=None, **kwargs):
        return self.fit(raw_documents).transform(raw_documents)

    def _to_word_topic_vectors(self, word_vecs):
        semantic_cluster_probs = self.semantic_soft_clustering.predict_proba(
            word_vecs)
        # TODO: replace with a faster way. Can I use a mode product?
        topic_vector_dim = (word_vecs.shape[0],
                            semantic_cluster_probs.shape[1] * word_vecs.shape[1])
        word_topic_vectors = np.zeros(topic_vector_dim)
        for i, word_index in zip(range(word_vecs.shape[0]),
                                 self._word_embs.keys()):
            word_vec = word_vecs[i].reshape((-1, 1))
            word_cluster_prob = semantic_cluster_probs[i].reshape((1, -1))
            # outer product of the embedding and the cluster probabilities,
            # weighted by the word's idf
            word_topic_vectors[i, :] = (np.dot(word_vec, word_cluster_prob) *
                                        self.to_idf(word_index)).reshape(
                                            word_topic_vectors.shape[1:])
        return word_topic_vectors

    def _compute_sparse_threshold_ratio(self, non_sparse_doc_vectors):
        feature_min_value = np.mean(np.min(non_sparse_doc_vectors, axis=0))
        feature_max_value = np.mean(np.max(non_sparse_doc_vectors, axis=0))
        self._sparse_threshold_ratio = (np.abs(feature_max_value) +
                                        np.abs(feature_min_value)) / 2
        self.sparse_threshold = (self.sparse_threshold_p *
                                 self._sparse_threshold_ratio)

    def transform_into_ncdv(self, raw_documents):
        bag_of_words_list = self._tfidf_vectorizer.transform(raw_documents)
        doc_vectors = bag_of_words_list.dot(self._word_topic_vecs)
        return doc_vectors

    def _make_sparse(self, matrix, should_compress=True):
        # zero out small entries; the original thresholded
        # np.abs(should_compress), which was a bug
        matrix[np.where(np.abs(matrix) < self.sparse_threshold)] = 0
        if should_compress:
            return sparse.csr_matrix(matrix)
        return matrix
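# Illustrative usage sketch (not part of the original source). It assumes a
# word-embedding lookup is available; the hash-based toy_emb below is only a
# stand-in for a real embedding function.
import numpy as np


def toy_emb(word):
    # pseudo-embedding derived from the word's hash, for illustration only
    rng = np.random.RandomState(abs(hash(word)) % (2 ** 32))
    return rng.randn(20)


docs = ["the cat sat on the mat",
        "dogs and cats play outside",
        "stock markets fell sharply today"]
scdv = Scdv(word_emb_func=toy_emb, n_components=3)
doc_vectors = scdv.fit_transform(docs)  # sparse CSR matrix of SCDV features
print(doc_vectors.shape)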
# (25%) sets.
skf = StratifiedKFold(n_splits=4)  # n_folds was renamed to n_splits in scikit-learn 0.18
# Only take the first fold.
train_index, test_index = next(iter(skf.split(iris.data, iris.target)))

X_train = iris.data[train_index]
y_train = iris.target[train_index]
X_test = iris.data[test_index]
y_test = iris.target[test_index]

n_classes = len(np.unique(y_train))

# Try GMMs using different types of covariances.
estimators = dict((cov_type, GaussianMixture(n_components=n_classes,
                                             covariance_type=cov_type,
                                             max_iter=20, random_state=0))
                  for cov_type in ['spherical', 'diag', 'tied', 'full'])

n_estimators = len(estimators)

plt.figure(figsize=(3 * n_estimators // 2, 6))
plt.subplots_adjust(bottom=.01, top=0.95, hspace=.15, wspace=.05,
                    left=.01, right=.99)

for index, (name, estimator) in enumerate(estimators.items()):
    # Since we have class labels for the training data, we can
def gmm_information_criteria_report(
        X_mat, k=np.arange(1, 20),
        covar_type=['full', 'tied', 'diag', 'spherical'],
        random_seed=11238, out="Graph"):
    # handle_df (defined elsewhere) turns the per-covariance-type lists into
    # DataFrames indexed by n_components
    tmp_global_aic, tmp_global_bic = [], []
    for i in covar_type:
        tmp_iter_aic, tmp_iter_bic = [], []
        for j in k:
            tmp_model = GaussianMixture(n_components=j, covariance_type=i,
                                        random_state=random_seed).fit(X_mat)
            tmp_iter_aic.append(tmp_model.aic(X_mat))
            tmp_iter_bic.append(tmp_model.bic(X_mat))
        tmp_global_aic.append(tmp_iter_aic)
        tmp_global_bic.append(tmp_iter_bic)

    tmp_get_aic = handle_df(tmp_global_aic, covar_type)
    tmp_get_bic = handle_df(tmp_global_bic, covar_type)
    tmp_get_aic_max = pd.melt(tmp_get_aic, id_vars=['n_components'],
                              value_vars=covar_type).sort_values(by='value')
    tmp_get_bic_max = pd.melt(tmp_get_bic, id_vars=['n_components'],
                              value_vars=covar_type).sort_values(by='value')
    tmp_top_aic = tmp_get_aic_max.head(3)
    tmp_top_bic = tmp_get_bic_max.head(3)

    # compare strings with ==; `is` checks identity, not equality
    if out == "Graph":
        plt.subplot(2, 1, 1)
        for colname, series in tmp_get_aic.drop(columns='n_components').items():
            plt.plot(series, label=colname)
        plt.scatter(tmp_top_aic['n_components'], tmp_top_aic['value'],
                    edgecolors='slategrey', facecolor='none', lw=2,
                    label="Best hyperparams")
        plt.title('Akaike Information Criterion')
        plt.xticks(k - 1, k)
        plt.xlabel('Number of clusters estimated')
        plt.ylabel('AIC')
        plt.legend()

        plt.subplot(2, 1, 2)
        for colname, series in tmp_get_bic.drop(columns='n_components').items():
            plt.plot(series, label=colname)
        plt.scatter(tmp_top_bic['n_components'], tmp_top_bic['value'],
                    edgecolors='slategrey', facecolor='none', lw=2,
                    label="Best hyperparams")
        plt.title('Bayesian Information Criterion')
        plt.xticks(k - 1, k)
        plt.xlabel('Number of clusters estimated')
        plt.ylabel('BIC')
        plt.legend()
    else:
        return tmp_get_aic_max, tmp_get_bic_max
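# Illustrative usage sketch (not part of the original source). It assumes the
# handle_df helper used above is importable; synthetic blobs stand in for
# real data.
import numpy as np
from sklearn.datasets import make_blobs

X_demo, _ = make_blobs(n_samples=300, centers=3, random_state=11238)
# any `out` other than "Graph" returns the melted, sorted AIC/BIC tables
aic_table, bic_table = gmm_information_criteria_report(X_demo,
                                                       k=np.arange(1, 8),
                                                       out="Table")
print(bic_table.head())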
print(X[0])
print(y)  # y comes out as 20 distinct classes

##
# GMM (Gaussian mixture model): clusters the data with ellipses, using
# probability distributions; k-means only finds circular clusters.
import numpy as np
import matplotlib.pyplot as plt
# sklearn.datasets.samples_generator was removed; import from sklearn.datasets
from sklearn.datasets import make_blobs

X, y_true = make_blobs(n_samples=400, centers=4, cluster_std=0.60,
                       random_state=0)
X = X[:, ::-1]

from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=4).fit(X)
labels = gmm.predict(X)
plt.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis')

probs = gmm.predict_proba(X)
print(probs[:5].round(3))
plt.show()

## The data is a mixture of Gaussians; visualize it with ellipses.
from matplotlib.patches import Ellipse


def draw_ellipse(position, covariance, ax=None, **kwargs):
    ax = ax or plt.gca()
    if covariance.shape == (2, 2):
        # full covariance: axis lengths and orientation from the SVD
        U, s, Vt = np.linalg.svd(covariance)
        angle = np.degrees(np.arctan2(U[1, 0], U[0, 0]))
        width, height = 2 * np.sqrt(s)
    # the original snippet breaks off above; the rest is an assumed
    # completion following the usual GMM-ellipse recipe
    else:
        angle = 0
        width, height = 2 * np.sqrt(covariance)
    for nsig in range(1, 4):
        ax.add_patch(Ellipse(position, nsig * width, nsig * height,
                             angle=angle, **kwargs))
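# Illustrative follow-up (not in the original snippet): overlay each fitted
# component's 1-3 sigma ellipses on the scatter plot, scaled by its weight.
ax = plt.gca()
ax.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis')
w_factor = 0.2 / gmm.weights_.max()
for pos, covar, w in zip(gmm.means_, gmm.covariances_, gmm.weights_):
    draw_ellipse(pos, covar, ax=ax, alpha=w * w_factor)
plt.show()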
def fit(self, X, y=None):
    """
    Fit a Gaussian mixture model to the data, estimating the model
    parameters with the EM algorithm.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        List of n_features-dimensional data points. Each row corresponds
        to a single data point.
    y : array-like, shape (n_samples,), optional (default=None)
        List of labels for X, if available. Used to compute ARI scores.

    Returns
    -------
    self
    """
    # Deal with the number of clusters
    if self.max_components is None:
        lower_ncomponents = 1
        upper_ncomponents = self.min_components
    else:
        lower_ncomponents = self.min_components
        upper_ncomponents = self.max_components
    n_mixture_components = upper_ncomponents - lower_ncomponents + 1

    if upper_ncomponents > X.shape[0]:
        if self.max_components is None:
            msg = "if max_components is None then min_components must be <= "
            msg += "n_samples, but min_components = {}, n_samples = {}".format(
                upper_ncomponents, X.shape[0])
        else:
            msg = "max_components must be <= n_samples, but max_components = "
            msg += "{}, n_samples = {}".format(upper_ncomponents, X.shape[0])
        raise ValueError(msg)
    elif lower_ncomponents > X.shape[0]:
        msg = "min_components must be <= n_samples, but min_components = "
        msg += "{}, n_samples = {}".format(lower_ncomponents, X.shape[0])
        raise ValueError(msg)

    # Get parameters
    random_state = self.random_state

    param_grid = dict(
        covariance_type=self.covariance_type,
        n_components=range(lower_ncomponents, upper_ncomponents + 1),
        random_state=[random_state],
    )
    param_grid = list(ParameterGrid(param_grid))

    models = [[] for _ in range(n_mixture_components)]
    bics = [[] for _ in range(n_mixture_components)]
    aris = [[] for _ in range(n_mixture_components)]

    for i, params in enumerate(param_grid):
        model = GaussianMixture(**params)
        model.fit(X)
        models[i % n_mixture_components].append(model)
        bics[i % n_mixture_components].append(model.bic(X))
        if y is not None:
            predictions = model.predict(X)
            aris[i % n_mixture_components].append(
                adjusted_rand_score(y, predictions))

    self.bic_ = pd.DataFrame(
        bics,
        index=np.arange(lower_ncomponents, upper_ncomponents + 1),
        columns=self.covariance_type,
    )

    if y is not None:
        self.ari_ = pd.DataFrame(
            aris,
            index=np.arange(lower_ncomponents, upper_ncomponents + 1),
            columns=self.covariance_type,
        )
    else:
        self.ari_ = None

    # Get the best covariance type and its index within the dataframe
    best_covariance = self.bic_.min(axis=0).idxmin()
    best_covariance_idx = self.covariance_type.index(best_covariance)

    # Get the best number of components for best_covariance
    best_component = self.bic_.idxmin()[best_covariance]

    self.n_components_ = best_component
    self.covariance_type_ = best_covariance
    self.model_ = models[best_component -
                         self.min_components][best_covariance_idx]

    return self
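# Illustrative usage sketch (not part of the original source). The fit method
# above is assumed to live on a BIC-based model-selection wrapper class; the
# name GaussianCluster and its constructor below are hypothetical stand-ins.
from sklearn.datasets import make_blobs

X_demo, y_demo = make_blobs(n_samples=150, centers=3, random_state=0)
gc = GaussianCluster(min_components=1, max_components=6,
                     covariance_type=['full', 'diag'])  # hypothetical
gc.fit(X_demo, y_demo)
print(gc.n_components_, gc.covariance_type_)
print(gc.bic_)  # BIC table: rows = n_components, columns = covariance types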
import random

import numpy as np
from sklearn.mixture import GaussianMixture

# load_from_pickle is assumed to be defined elsewhere in the project.


def n_random_integers(n, low=0, high=10):
    """Generate n random integers with random.randint."""
    ii = []
    for i in range(n):
        ii.append(random.randint(low, high))
    return np.array(ii)


if __name__ == '__main__':
    data = load_from_pickle('data', 'daily_array_all.pkl')  # shape: (1440, 48)

    # cluster as a Gaussian mixture
    X = data
    n = 10
    gmm = GaussianMixture(n_components=n)
    gmm.fit(X)
    y = gmm.predict(X)
    probs = gmm.predict_proba(X)

    # sort results into clusters based on labels
    def clusters_from_labels(X, y):
        labels = np.unique(y)
        clusters = []
        for label in labels:
            clusters.append(X[np.where(y == label)])
        return clusters

    clusters = clusters_from_labels(X, y)
    cluster_sizes = [x.shape[0] for x in clusters]
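    # Illustrative follow-up (not in the original snippet): visualize the
    # clusters by plotting the mean daily profile of each one.
    import matplotlib.pyplot as plt

    for k, cluster in enumerate(clusters):
        plt.plot(cluster.mean(axis=0),
                 label='cluster {} (n={})'.format(k, len(cluster)))
    plt.legend()
    plt.show()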
# find cities that are prime numbers
prime_cities = eratosthenes(max(cities.CityId))
cities['prime'] = prime_cities

b = len(cities)
num = random.sample(range(1, b), 156)
num.append(0)
df = cities.iloc[num]
df = df.reset_index(drop=True)
print(len(np.unique(df)))

plt.scatter(df.X, df.Y, c='blue', alpha=0.5, s=1)

# ------------------ clustering --------------------------
from sklearn.mixture import GaussianMixture

n_cluster = 4
mclusterer = GaussianMixture(n_components=n_cluster, tol=0.01,
                             random_state=66,
                             verbose=1).fit(df[['X', 'Y']].values)
df['mclust'] = mclusterer.predict(df[['X', 'Y']].values)
# select columns with a list; the tuple form ['X', 'Y'] after groupby
# no longer works in recent pandas
centers = df.groupby('mclust')[['X', 'Y']].agg('mean').reset_index()

clust_c = ['#630C3A', '#39C8C6', '#D3500C', '#FFB139']
colors = np.where(df['mclust'] % 4 == 0, '#630C3A', '-')
colors[df['mclust'] % 4 == 1] = '#39C8C6'
colors[df['mclust'] % 4 == 2] = '#D3500C'
colors[df['mclust'] % 4 == 3] = '#FFB139'

plt.figure(figsize=(8, 5))
plt.scatter(df.X, df.Y, color=colors, alpha=0.5, s=5)
for i in range(n_cluster):
    plt.scatter(centers.iloc[i].X, centers.iloc[i].Y, c='black', s=50)
plt.show()
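# Illustrative follow-up (not in the original snippet): instead of fixing
# n_cluster at 4, pick it by BIC; assumes df from above.
bics = []
candidates = range(2, 9)
for kk in candidates:
    gm = GaussianMixture(n_components=kk,
                         random_state=66).fit(df[['X', 'Y']].values)
    bics.append(gm.bic(df[['X', 'Y']].values))
best_k = list(candidates)[int(np.argmin(bics))]
print('BIC-preferred number of clusters:', best_k)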