import numpy as np
from sklearn.mixture import GaussianMixture


def fit_mixtures(X, mag, mbins, binwidth=0.2, seed=None,
                 keepscore=False, keepbic=False, **kwargs):
    kwargs.setdefault('n_components', 25)
    kwargs.setdefault('covariance_type', 'full')
    fits = []
    if keepscore:
        scores = []
    if keepbic:
        bics = []
    if seed:
        np.random.seed(seed)
    for bincenter in mbins:
        # this is not an efficient way to assign bins, but the time
        # is negligible compared to the GMM fitting anyway
        ii = np.where(np.abs(mag - bincenter) < binwidth)[0]
        # print('{:.2f}: {} qsos'.format(bincenter, len(ii)))  # debug
        gmm = GaussianMixture(**kwargs)
        gmm.fit(X[ii])
        fits.append(gmm)
        if keepscore:
            scores.append(gmm.score(X[ii]))
        if keepbic:
            bics.append(gmm.bic(X[ii]))
    rv = (fits,)
    if keepscore:
        rv += (scores,)
    if keepbic:
        rv += (bics,)
    return rv
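# Usage sketch (not from the original source): hypothetical inputs, where
# X holds per-object features, mag the matching magnitudes, and mbins the
# bin centers. With 0.5-mag spacing and binwidth=0.2, each bin gets ~80
# of the 1000 uniformly drawn objects.
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.normal(size=(1000, 3))
mag_demo = rng.uniform(17, 22, size=1000)
mbins_demo = np.arange(17.25, 22.0, 0.5)

fits, bics = fit_mixtures(X_demo, mag_demo, mbins_demo,
                          n_components=5, keepbic=True)
print(len(fits), 'bin fits; first BIC:', bics[0])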
def fit(self, data, ngauss, n_iter=5000, min_covar=1.0e-6,
        doplot=False, **keys):
    """
    data is shape [npoints, ndim]
    """
    from sklearn.mixture import GaussianMixture

    if len(data.shape) == 1:
        data = data[:, numpy.newaxis]

    print("ngauss:   ", ngauss)
    print("n_iter:   ", n_iter)
    print("min_covar:", min_covar)

    gmm = GaussianMixture(
        n_components=ngauss,
        max_iter=n_iter,
        reg_covar=min_covar,
        covariance_type='full',
    )

    gmm.fit(data)

    if not gmm.converged_:
        print("DID NOT CONVERGE")

    self._gmm = gmm
    self.set_mixture(gmm.weights_, gmm.means_, gmm.covariances_)

    if doplot:
        plt = self.plot_components(data=data, **keys)
        return plt
def learn_subset(self, search_space):
    # Mask undesired features
    current_array = self.vectors[:, search_space]

    GM = GaussianMixture(n_components=2,
                         covariance_type="full",
                         tol=0.001,
                         reg_covar=1e-06,
                         max_iter=1000,
                         n_init=25,
                         init_params="kmeans",
                         weights_init=None,
                         means_init=None,
                         precisions_init=None,
                         random_state=None,
                         warm_start=False,
                         verbose=0,
                         verbose_interval=10,
                         )
    GM.fit(current_array)

    labels = GM.predict(current_array)
    unique, counts = np.unique(labels, return_counts=True)
    count_dict = dict(zip(unique, counts))

    return count_dict, labels
def gmm(nclusters, coords, n_init=50, n_iter=500):
    if USE_GAUSSIAN_MIXTURE:
        est = GaussianMixture(n_components=nclusters, n_init=n_init,
                              max_iter=n_iter)
    else:
        est = GMM(n_components=nclusters, n_init=n_init, n_iter=n_iter)
    est.fit(coords)
    return Partition(est.predict(coords))
def fit_gmm(samples, ncomponents=2):
    """Given a numpy array of floating point samples, fit a gaussian mixture model."""
    # assume samples is of shape (NSAMPLES,); unsqueeze to (NSAMPLES, 1) and train a GMM:
    gmm = GaussianMixture(n_components=ncomponents)
    gmm.fit(samples.reshape(-1, 1))
    # return params of GMM in [(weight, mean, variance)] format
    # (note: covariances_ holds variances here, not standard deviations):
    params = [(gmm.weights_[c], gmm.means_[c][0], gmm.covariances_[c][0][0])
              for c in range(ncomponents)]
    return params
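# A quick check of fit_gmm (illustrative only): two well-separated synthetic
# modes should be recovered with roughly the generating parameters; the third
# element of each tuple is a variance, so take sqrt for sigma.
import numpy as np

rng = np.random.RandomState(0)
samples_demo = np.concatenate([rng.normal(-2.0, 0.5, 500),
                               rng.normal(3.0, 1.0, 500)])
for weight, mean, var in fit_gmm(samples_demo):
    print('w=%.2f mu=%.2f sigma=%.2f' % (weight, mean, np.sqrt(var)))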
def gmm(k, X, run_times=5):
    gm = GMM(k, n_init=run_times, init_params='kmeans')
    # gm = GMM(k)
    gm.fit(X)
    zh = gm.predict(X)
    mu = gm.means_
    cov = gm.covariances_
    return zh, mu, cov
def gmm(k, X, run_times=10, init='kmeans'):
    """GMM from sklearn library.

    init = {'kmeans', 'random'}; run_times is the number of times the
    algorithm is run with different initializations.
    """
    gm = GMM(k, n_init=run_times, init_params=init)
    gm.fit(X)
    zh = gm.predict(X)
    return zh
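# Usage sketch (assumes, as elsewhere in this collection, that GMM is an
# alias for sklearn.mixture.GaussianMixture):
import numpy as np

rng = np.random.RandomState(0)
X_demo = np.vstack([rng.normal(0, 1, (100, 2)),
                    rng.normal(4, 1, (100, 2))])
labels = gmm(2, X_demo, run_times=5)  # hard assignments, one label per row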
def main():
    X, Y = get_data(10000)
    print("Number of data points:", len(Y))
    model = GaussianMixture(n_components=10)
    model.fit(X)
    M = model.means_
    R = model.predict_proba(X)
    print("Purity:", purity(Y, R))  # max is 1, higher is better
    print("DBI:", DBI(X, M, R))    # lower is better
def fit_conditional_parameters(self, j):
    class_wise_scores = self.get_class_wise_scores(j)

    class_wise_parameters = dict()
    for label in self._labels:
        # a one-component GMM just estimates the mean and variance
        # of this class's scores
        gmm = GaussianMixture(n_components=1)
        gmm.fit(class_wise_scores[label].reshape(-1, 1))

        class_wise_parameters[label] = \
            self.Gaussian(mu=gmm.means_.flatten()[0],
                          std=np.sqrt(gmm.covariances_.flatten()[0]))
    return class_wise_parameters
def loggausfit(self):
    self.fitDf['IRM_norm'] = self.fitDf['remanance'] / self.fitDf['remanance'].max()
    xstd, distance, means, covras, weights, yfits = [], [], [], [], [], []
    for i in range(10):
        data = self.rand_data()
        for j in range(20):
            gmm = GMM(self.fitNumber, covariance_type='full')
            model = gmm.fit(data)
            xstd.append(np.std(model.means_))
            means.append(model.means_)
            covras.append(model.covariances_)
            weights.append(model.weights_)
            sample = self.fitDf['field'].values.reshape((-1, 1))
            logprob = model.score_samples(sample)
            responsibilities = model.predict_proba(sample)
            pdf = np.exp(logprob)
            pdf_individual = responsibilities * pdf[:, np.newaxis]
            pdf_norm = np.sum(pdf_individual, axis=1) / np.max(
                np.sum(pdf_individual, axis=1))
            distance.append(1 - spatial.distance.cosine(pdf_norm,
                                                        self.fitDf['IRM_norm']))
            yfits.append(pdf_individual)
        del data
    df = pd.DataFrame({'xstd': xstd, 'distance': distance, 'means': means,
                       'covras': covras, 'yfits': yfits, 'weights': weights})
    df['cov_max'] = [np.min(i) for i in df['covras']]
    df = df.sort_values(by=['distance', 'cov_max', 'xstd'],
                        ascending=[False, True, False])
    pdf_best = df['yfits'].iloc[0]
    self.means = df['means'].iloc[0]
    self.covra = df['covras'].iloc[0]  # sigma**2
    self.weights = df['weights'].iloc[0]
    self.pdf_best = pdf_best / np.max(np.sum(pdf_best, axis=1))
def finish(self):
    print("Calculating mean ToT for each PMT from gaussian fits...")
    gmm = GaussianMixture()
    xs, ys = [], []
    for (dom_id, channel_id), tots in self.tot_data.items():
        dom = self.db.doms.via_dom_id(dom_id)
        gmm.fit(np.array(tots)[:, np.newaxis])
        mean_tot = gmm.means_[0][0]
        xs.append(31 * (dom.floor - 1) + channel_id + 600 * (dom.du - 1))
        ys.append(mean_tot)
    fig, ax = plt.subplots()
    ax.scatter(xs, ys, marker="+")
    ax.set_xlabel(r"31$\cdot$(floor - 1) + channel_id + 600$\cdot$(DU - 1)")
    ax.set_ylabel("ToT [ns]")
    plt.title("Mean ToT per PMT")
    plt.savefig(self.plotfilename)
def main():
    Xtrain, Ytrain, Xtest, Ytest = getKaggleMNIST()
    dae = DeepAutoEncoder([500, 300, 2])
    dae.fit(Xtrain)
    mapping = dae.map2center(Xtrain)
    plt.scatter(mapping[:, 0], mapping[:, 1], c=Ytrain, s=100, alpha=0.5)
    plt.show()

    # purity measure from unsupervised machine learning pt 1
    gmm = GaussianMixture(n_components=10)
    gmm.fit(Xtrain)
    responsibilities_full = gmm.predict_proba(Xtrain)
    print("full purity:", purity(Ytrain, responsibilities_full))

    gmm.fit(mapping)
    responsibilities_reduced = gmm.predict_proba(mapping)
    print("reduced purity:", purity(Ytrain, responsibilities_reduced))
def fit(self, X, Y=None):
    if self.method == 'random':
        N = len(X)
        idx = np.random.randint(N, size=self.M)
        self.samples = X[idx]
    elif self.method == 'normal':
        # just sample from N(0,1)
        D = X.shape[1]
        self.samples = np.random.randn(self.M, D) / np.sqrt(D)
    elif self.method == 'kmeans':
        X, Y = self._subsample_data(X, Y)

        print("Fitting kmeans...")
        t0 = datetime.now()
        kmeans = KMeans(n_clusters=len(set(Y)))
        kmeans.fit(X)
        print("Finished fitting kmeans, duration:", datetime.now() - t0)

        # calculate the most ambiguous points
        # we will do this by finding the distance between each point
        # and all cluster centers
        # and return which points have the smallest variance
        dists = kmeans.transform(X)  # returns an N x K matrix
        variances = dists.var(axis=1)
        idx = np.argsort(variances)  # smallest to largest
        idx = idx[:self.M]
        self.samples = X[idx]
    elif self.method == 'gmm':
        X, Y = self._subsample_data(X, Y)

        print("Fitting GMM")
        t0 = datetime.now()
        gmm = GaussianMixture(
            n_components=len(set(Y)),
            covariance_type='spherical',
            reg_covar=1e-6)
        gmm.fit(X)
        print("Finished fitting GMM, duration:", datetime.now() - t0)

        # calculate the most ambiguous points
        probs = gmm.predict_proba(X)
        ent = stats.entropy(probs.T)  # N-length vector of entropies
        idx = np.argsort(-ent)  # negate since we want biggest first
        idx = idx[:self.M]
        self.samples = X[idx]
    return self
def finish(self):
    print("Calculating mean ToT for each PMT from gaussian fits...")
    gmm = GaussianMixture()
    xs, ys = [], []
    df = pd.DataFrame(self.tot_data)
    for (dom_id, channel_id), data in df.groupby(['dom_id', 'channel_id']):
        tots = data['tot']
        dom = self.db.doms.via_dom_id(dom_id)
        gmm.fit(tots.values[:, np.newaxis])
        mean_tot = gmm.means_[0][0]
        xs.append(31 * (dom.floor - 1) + channel_id + 600 * (dom.du - 1))
        ys.append(mean_tot)
    fig, ax = plt.subplots()
    ax.scatter(xs, ys, marker="+")
    ax.set_xlabel(r"31$\cdot$(floor - 1) + channel_id + 600$\cdot$(DU - 1)")
    ax.set_ylabel("ToT [ns]")
    plt.title("Mean ToT per PMT")
    plt.savefig(self.plotfilename)
def main():
    Xtrain, Ytrain, Xtest, Ytest = getKaggleMNIST()
    dae = DeepAutoEncoder([500, 300, 2])
    dae.fit(Xtrain)
    mapping = dae.map2center(Xtrain)
    plt.scatter(mapping[:, 0], mapping[:, 1], c=Ytrain, s=100, alpha=0.5)
    plt.show()

    # purity measure from unsupervised machine learning pt 1
    # NOTE: this will take a long time (i.e. just leave it overnight)
    gmm = GaussianMixture(n_components=10)
    gmm.fit(Xtrain)
    print("Finished GMM training")
    responsibilities_full = gmm.predict_proba(Xtrain)
    print("full purity:", purity(Ytrain, responsibilities_full))

    gmm.fit(mapping)
    responsibilities_reduced = gmm.predict_proba(mapping)
    print("reduced purity:", purity(Ytrain, responsibilities_reduced))
def Recognize(self, fn):
    im = Image.open(fn)
    im = util.CenterExtend(im, radius=20)

    vec = np.asarray(im.convert('L')).copy()
    Y = []
    for i in range(vec.shape[0]):
        for j in range(vec.shape[1]):
            if vec[i][j] <= 200:
                Y.append([i, j])

    gmm = GaussianMixture(n_components=7, covariance_type='tied',
                          reg_covar=1e2, tol=1e3, n_init=9)
    gmm.fit(Y)
    centers = gmm.means_

    points = []
    for i in range(7):
        scoring = 0.0
        for w_i in range(3):
            for w_j in range(3):
                p_x = centers[i][0] - 1 + w_i
                p_y = centers[i][1] - 1 + w_j

                cr = util.crop(im, p_x, p_y, radius=20)
                cr = cr.resize((40, 40), Image.ANTIALIAS)

                X = np.asarray(cr.convert('L'), dtype='float')
                X = (X.astype("float") - 180) / 200

                x0 = np.expand_dims(X, axis=0)
                x1 = np.expand_dims(x0, axis=3)

                if self.model.predict(x1)[0][0] < 0.5:
                    scoring += 1

        if scoring > 4:
            points.append((centers[i][0] - 20, centers[i][1] - 20))

    return points
def fit(self, X_train, y_train):
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    # from sklearn.mixture import GMM as GaussianMixture
    from sklearn.mixture import GaussianMixture

    unlabels = range(0, np.max(y_train) + 1)
    for lab in unlabels:
        if self.each_class_params is not None:
            model = GaussianMixture(**self.each_class_params[lab])
        elif len(self.same_params) > 0:
            model = GaussianMixture(**self.same_params)
        else:
            model = GaussianMixture()
        X_train_lab = X_train[y_train == lab]
        model.fit(X_train_lab)
        self.models.insert(lab, model)
conditions = behavioral['labels']
fmri_masked = masker.fit_transform(fmri_filename)

fmri_train, fmri_test, conditions_train, conditions_test = train_test_split(
    fmri_masked, conditions, test_size=0.2, random_state=0)

svc = SVC(kernel='linear')
svc.fit(fmri_train, conditions_train)
svm_prediction = svc.predict(fmri_test)
svm_accuracy = accuracy_score(conditions_test, svm_prediction)
print(svm_accuracy)

gnb = GaussianNB()
gnb.fit(fmri_train, conditions_train)
gnb_prediction = gnb.predict(fmri_test)
gnb_accuracy = accuracy_score(conditions_test, gnb_prediction)
print(gnb_accuracy)

kneigh = KNeighborsClassifier(n_neighbors=3)
kneigh.fit(fmri_train, conditions_train)
kneigh_prediction = kneigh.predict(fmri_test)
kneigh_accuracy = accuracy_score(conditions_test, kneigh_prediction)
print(kneigh_accuracy)

# note: GaussianMixture is unsupervised; fit() ignores the labels, and its
# component indices are not aligned with the true classes, so this accuracy
# is only meaningful if the components happen to match the label encoding
gmm = GaussianMixture(n_components=3, covariance_type='spherical', max_iter=10)
gmm.fit(fmri_train, conditions_train)
gmm_prediction = gmm.predict(fmri_test)
gmm_accuracy = accuracy_score(conditions_test, gmm_prediction)
print(gmm_accuracy)
# plotting setup
colors = '#A0FFA0', '#2090E0', '#FF8080'
cm = mpl.colors.ListedColormap(colors)
x1_min, x1_max = x[:, 0].min(), x[:, 0].max()
x2_min, x2_max = x[:, 1].min(), x[:, 1].max()
x1_min, x1_max = expand(x1_min, x1_max)
x2_min, x2_max = expand(x2_min, x2_max)
x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j]
grid_test = np.stack((x1.flat, x2.flat), axis=1)

plt.figure(figsize=(6, 6), facecolor='w')
plt.suptitle('GMM/DPGMM comparison', fontsize=15)

ax = plt.subplot(211)
gmm = GaussianMixture(n_components=n_components, covariance_type='full',
                      random_state=0)
gmm.fit(x)
centers = gmm.means_
covs = gmm.covariances_
print('GMM means =\n', centers)
print('GMM covariances =\n', covs)
y_hat = gmm.predict(x)
grid_hat = gmm.predict(grid_test)
grid_hat = grid_hat.reshape(x1.shape)
plt.pcolormesh(x1, x2, grid_hat, cmap=cm)
plt.scatter(x[:, 0], x[:, 1], s=20, c=y, cmap=cm, marker='o',
            edgecolors='#202020')

clrs = list('rgbmy')
for i, (center, cov) in enumerate(zip(centers, covs)):
    value, vector = sp.linalg.eigh(cov)
    width, height = value[0], value[1]
def main():
    target_dir = "gmm"
    train_data_file = "data/ext/train_data.npy"
    train_labels_file = "data/ext/train_labels.npy"
    test_data_file = "data/ext/test_data.npy"
    test_labels_file = "data/ext/test_labels.npy"

    # Not used atm but could train several different GMMs
    estimators = dict((cov_type,
                       GaussianMixture(n_components=30,
                                       covariance_type=cov_type,
                                       max_iter=200,
                                       random_state=0))
                      for cov_type in ['spherical', 'diag', 'tied', 'full'])

    train_data = np.load(train_data_file)
    train_labels = np.load(train_labels_file)
    test_data = np.load(test_data_file)
    test_labels = np.load(test_labels_file)

    """ Fitting of the GMMs """
    # number of different speakers
    n_classes = len(np.unique(test_labels))

    gmm = GaussianMixture(n_components=1, tol=1e-3, max_iter=200,
                          n_init=1, verbose=1)
    gmms = []
    for i in range(0, n_classes):
        speaker_train_data = train_data[train_labels == i]
        gmm.fit(speaker_train_data)
        joblib.dump(gmm, f'{target_dir}/gmm_{i}.pkl')

    for i in range(0, n_classes):
        gmm = joblib.load(f'{target_dir}/gmm_{i}.pkl')
        gmms.append(gmm)

    """ Predict using the GMMs """
    metadata_filepath = "data/ext/metadata.json"
    test_file_dir = "data/test"
    test_file_names = os.listdir(test_file_dir)

    # load metadata json
    with open(metadata_filepath, 'r') as f:
        metadata = json.load(f)

    labels = []
    preds = []
    # Make prediction per file in test_file_dir
    for file_name in test_file_names:
        parts = file_name.split('_')  # Get speaker from filename
        if len(parts) != 2:  # data without deltas has 2 parts
            continue
        data = np.load(f'{test_file_dir}/{file_name}')
        testscores = np.zeros((len(data), n_classes))
        # Score each sample in a file with all GMMs
        for i in range(0, n_classes):
            testscores[:, i] = gmms[i].score_samples(data)
        # Sum the per-sample log-likelihoods for the file
        predictions = np.sum(testscores, axis=0)
        # Predict the label: index of the highest-scoring GMM
        prediction = predictions.argmax()
        # Gather predictions and correct labels for accuracy score
        preds.append(prediction)
        label = metadata['LABELS'][parts[0]]  # Get label matching speaker
        labels.append(label)
        print(f'pred:{prediction}, label:{label}')

    # Print accuracy score
    print(accuracy_score(labels, preds))
def fit(self, X, y=None):
    """
    Fits a gaussian mixture model to the data.
    Estimates model parameters with the EM algorithm.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        List of n_features-dimensional data points. Each row
        corresponds to a single data point.

    y : array-like, shape (n_samples,), optional (default=None)
        List of labels for X if available. Used to compute ARI scores.

    Returns
    -------
    self
    """
    # Deal with number of clusters
    if self.max_components is None:
        lower_ncomponents = 1
        upper_ncomponents = self.min_components
    else:
        lower_ncomponents = self.min_components
        upper_ncomponents = self.max_components

    n_mixture_components = upper_ncomponents - lower_ncomponents + 1

    if upper_ncomponents > X.shape[0]:
        if self.max_components is None:
            msg = "if max_components is None then min_components must be <= "
            msg += "n_samples, but min_components = {}, n_samples = {}".format(
                upper_ncomponents, X.shape[0])
        else:
            msg = "max_components must be <= n_samples, but max_components = "
            msg += "{}, n_samples = {}".format(upper_ncomponents, X.shape[0])
        raise ValueError(msg)
    elif lower_ncomponents > X.shape[0]:
        msg = "min_components must be <= n_samples, but min_components = "
        msg += "{}, n_samples = {}".format(lower_ncomponents, X.shape[0])
        raise ValueError(msg)

    # Get parameters
    random_state = self.random_state

    param_grid = dict(
        covariance_type=self.covariance_type,
        n_components=range(lower_ncomponents, upper_ncomponents + 1),
        tol=[self.tol],
        reg_covar=[self.reg_covar],
        max_iter=[self.max_iter],
        n_init=[self.n_init],
        init_params=[self.init_params],
        random_state=[random_state],
    )
    param_grid = list(ParameterGrid(param_grid))

    models = [[] for _ in range(n_mixture_components)]
    bics = [[] for _ in range(n_mixture_components)]
    aris = [[] for _ in range(n_mixture_components)]

    for i, params in enumerate(param_grid):
        model = GaussianMixture(**params)
        model.fit(X)
        models[i % n_mixture_components].append(model)
        bics[i % n_mixture_components].append(model.bic(X))
        if y is not None:
            predictions = model.predict(X)
            aris[i % n_mixture_components].append(
                adjusted_rand_score(y, predictions))

    self.bic_ = pd.DataFrame(
        bics,
        index=np.arange(lower_ncomponents, upper_ncomponents + 1),
        columns=self.covariance_type,
    )

    if y is not None:
        self.ari_ = pd.DataFrame(
            aris,
            index=np.arange(lower_ncomponents, upper_ncomponents + 1),
            columns=self.covariance_type,
        )
    else:
        self.ari_ = None

    # Get the best cov type and its index within the dataframe
    best_covariance = self.bic_.min(axis=0).idxmin()
    best_covariance_idx = self.covariance_type.index(best_covariance)

    # Get the best component count for best_covariance
    best_component = self.bic_.idxmin()[best_covariance]

    self.n_components_ = best_component
    self.covariance_type_ = best_covariance
    self.model_ = models[best_component -
                         self.min_components][best_covariance_idx]

    return self
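# The grid-search-plus-BIC selection above can be sketched standalone
# (a minimal version on synthetic data, not the class's full logic):
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.RandomState(0)
X_demo = np.vstack([rng.normal(0, 1, (200, 2)), rng.normal(5, 1, (200, 2))])

best = min(
    (GaussianMixture(n_components=k, covariance_type=cov,
                     random_state=0).fit(X_demo)
     for k in range(1, 6)
     for cov in ('spherical', 'diag', 'tied', 'full')),
    key=lambda m: m.bic(X_demo),
)
print(best.n_components, best.covariance_type)  # expect 2 components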
def plot_cluster_means(
    data_loader,
    transformer_path,
    dataset,
    output_dir,
    file_prefix,
    kmeans_clusters=2,
    em_clusters=2,
):
    if dataset == "intention":
        X_untransformed = load_intention()
    else:
        X_untransformed = load_pulsar()
    Xtransformed, y = data_loader()
    with open(transformer_path, "rb") as f:
        transformer = pickle.load(f)
    X = get_inverse_transform(transformer, Xtransformed)
    X = pd.DataFrame(X, columns=X_untransformed.columns)
    if data_loader is load_intention:
        X_plot = X[
            [
                "Administrative",
                "Administrative_Duration",
                "Informational",
                "Informational_Duration",
                "ProductRelated",
                "ProductRelated_Duration",
                "BounceRates",
                "ExitRates",
                "PageValues",
            ]
        ]
    else:
        X_plot = X
    fig, (ax1, ax2) = plt.subplots(1, 2)
    kmeans = KMeans(kmeans_clusters, random_state=1)
    em = GaussianMixture(n_components=em_clusters, random_state=1)
    kmeans.fit(X)
    em.fit(X)
    kmeans_df = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
    kmeans_df = kmeans_df[X_plot.columns]
    em_df = pd.DataFrame(em.means_, columns=X.columns)
    em_df = em_df[X_plot.columns]
    kmeans_df.plot(kind="bar", ax=ax1)
    em_df.plot(kind="bar", ax=ax2)
    ax1.set_ylabel("Mean Value")
    ax2.set_ylabel("Mean Value")
    ax1.set_xlabel("Cluster")
    ax2.set_xlabel("Cluster")
    ax1.set_title("K-Means Cluster Centers")
    ax2.set_title("EM Cluster Centers")
    ax1.get_legend().remove()
    plt.savefig(os.path.join(output_dir, f"{file_prefix}_clusterprojections.png"))
    plt.close()
def gaussian_overlap(w1, w2):
    '''
    estimate cluster overlap from a 2-mean Gaussian mixture model

    Description
    -----------
    Estimates the overlap between 2 spike clusters by fitting with two
    multivariate Gaussians. Implementation makes use of scikit-learn's
    'GMM'. The percent of false positive and false negative errors are
    estimated for both classes and stored as a confusion matrix. Error
    rates are calculated by integrating the posterior probability of a
    misclassification. The integral is then normalized by the number of
    events in the cluster of interest. See description of the confusion
    matrix below.

    NOTE: The dimensionality of the data set is reduced to the principal
    components explaining the top 98% of the variance to increase the time
    efficiency of the fitting algorithm.

    Parameters
    ----------
    w1 : array-like [Event x Sample]
        waveforms of 1st cluster
    w2 : array-like [Event x Sample]
        waveforms of 2nd cluster

    Returns
    -------
    C : confusion matrix
        C[0,0] - False positive fraction in cluster 1 (waveforms of
                 neuron 2 that were assigned to neuron 1)
        C[0,1] - False negative fraction in cluster 1 (waveforms of
                 neuron 1 that were assigned to neuron 2)
        C[1,0] - False negative fraction in cluster 2
        C[1,1] - False positive fraction in cluster 2
    '''
    # reduce dimensionality to the top 98% of Principal Components
    N1 = w1.shape[0]
    N2 = w2.shape[0]
    X = np.concatenate((w1, w2))
    pca = PCA()
    pca.fit(X)
    Xn = pca.transform(X)
    cutoff = 0.98
    num_dims = (np.cumsum(pca.explained_variance_ratio_) < cutoff).sum()
    w1 = Xn[:N1, :num_dims]
    w2 = Xn[N1:, :num_dims]

    # fit 2 multivariate gaussians
    gmm = GMM(n_components=2)
    gmm.fit(np.vstack((w1, w2)))

    # get posteriors
    pr1 = gmm.predict_proba(w1)
    pr2 = gmm.predict_proba(w2)

    # in the unlikely case that the cluster identities were flipped during
    # the fitting procedure, flip them back
    if pr1[:, 0].mean() + pr2[:, 1].mean() < 1:
        pr1 = pr1[:, [1, 0]]
        pr2 = pr2[:, [1, 0]]

    # create confusion matrix
    confusion = np.zeros((2, 2))
    # probability that a member of 1 is false
    confusion[0, 0] = pr1[:, 1].mean()
    # relative proportion of spikes that were placed in cluster 2 by mistake
    confusion[0, 1] = pr2[:, 0].sum() / N1
    # probability that a member of 2 was really from 1
    confusion[1, 1] = pr2[:, 0].mean()
    # relative proportion of spikes that were placed in cluster 1 by mistake
    confusion[1, 0] = pr1[:, 1].sum() / N2

    return confusion
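# Usage sketch with synthetic "waveforms" standing in for two spike clusters
# (illustrative only; real inputs would be [Event x Sample] waveform arrays,
# and the PCA/GMM names used inside gaussian_overlap must be imported):
import numpy as np

rng = np.random.RandomState(0)
w1_demo = rng.normal(0.0, 1.0, size=(400, 32))
w2_demo = rng.normal(1.5, 1.0, size=(300, 32))
C = gaussian_overlap(w1_demo, w2_demo)
print(C)  # 2x2 matrix of estimated misclassification fractions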
plot_digits(digits.data)

# PCA
from sklearn.decomposition import PCA
pca = PCA(0.99, whiten=True)
data = pca.fit_transform(digits.data)
data.shape

# use AIC to choose the number of components:
n_components = np.arange(50, 310, 10)
models = [GaussianMixture(n, covariance_type='full', random_state=0)
          for n in n_components]
aics = [model.fit(data).aic(data) for model in models]
plt.plot(n_components, aics);

gmm = GaussianMixture(140, covariance_type='full', random_state=0)
gmm.fit(data)
print(gmm.converged_)

# draw new data
data_new = gmm.sample(100)
data_new[0].shape

# inverse transform from the PCA
digits_new = pca.inverse_transform(data_new[0])
plot_digits(digits_new)

# Such a generative model of digits can prove very useful as a component of
# a Bayesian generative classifier.
# Kernel density estimation (KDE) is in some senses an algorithm that takes
# the mixture-of-Gaussians idea to its logical extreme: it uses a mixture
# consisting of one Gaussian component per point, resulting in an essentially
# nonparametric estimator of density.
# For one-dimensional data, you are probably already familiar with one simple
# density estimator: the histogram.
# loading data-set for EM algorithm
iris = datasets.load_iris()
X = pd.DataFrame(iris.data)
Y = pd.DataFrame(iris.target)

# Defining EM Model
from sklearn.mixture import GaussianMixture
model2 = GaussianMixture(n_components=3, random_state=3425)

# Training of the model
model2.fit(X)

# Predicting classes for our data
uu = model2.predict(X)

# Accuracy of EM Model
from sklearn.metrics import confusion_matrix
cmem = confusion_matrix(Y, uu)
print('The confusion matrix of EM-algo:\n', cmem)
print('\n')
from sklearn.metrics import accuracy_score
                        n_neighbors=4, eigen_solver='arpack', n_jobs=1)
result_sc = sc.fit_predict(tfidf.toarray())

# DBSCAN
db = DBSCAN(eps=0.7, min_samples=1)
result_db = db.fit_predict(tfidf.toarray())

# AgglomerativeClustering
ac = AgglomerativeClustering(n_clusters=89, affinity='euclidean',
                             linkage='ward')
result_ac = ac.fit_predict(tfidf.toarray())

# GaussianMixture
gm = GaussianMixture(n_components=89, covariance_type='diag',
                     max_iter=20, random_state=0)
# for cov_type in ['spherical', 'diag', 'tied', 'full']
gm.fit(tfidf.toarray())
result_gm = gm.predict(tfidf.toarray())

print('K-means NMI:', normalized_mutual_info_score(result_kmeans, label_list))
print('AffinityPropagation NMI:', normalized_mutual_info_score(result_ap, label_list))
print('MeanShift NMI:', normalized_mutual_info_score(result_ms, label_list))
print('SpectralClustering NMI:', normalized_mutual_info_score(result_sc, label_list))
print('DBSCAN NMI:', normalized_mutual_info_score(result_db, label_list))
print('AgglomerativeClustering NMI:', normalized_mutual_info_score(result_ac, label_list))
print('GaussianMixture NMI:', normalized_mutual_info_score(result_gm, label_list))
# In[7]:

# clustering neighbourhood
lat_long = train_test[train_test.longitude > -74.05][
    train_test.longitude < -73.875][train_test.latitude > 40.63][
    train_test.latitude < 40.87]
cluster = lat_long[['latitude', 'longitude']]
model_gm = GaussianMixture(n_components=40,
                           covariance_type='full',
                           tol=0.01,
                           max_iter=5000,
                           random_state=7,
                           verbose=0)
pred_gm = pd.DataFrame(model_gm.fit(cluster).predict(cluster)).set_index(
    cluster.index)
pred_gm.columns = ['pred_gm']
train_test = pd.merge(train_test, pred_gm, how='left',
                      left_index=True, right_index=True)
train_test.pred_gm[train_test.pred_gm.isnull()] = -1
dummy_neighbourhood = pd.get_dummies(train_test.pred_gm, prefix='dummy_nb_')
train_test = train_test.merge(dummy_neighbourhood, how='left',
                              left_index=True,
    CDBS_data.append(count)

Al_data = ndimage.rotate(np.array(CDBS_data, dtype=float), -45, reshape=False)
chn_num, photons, max_point = find_sudo_peak(Al_data, width=100)
print("CDBS data imported")
print(time.time() - start, "sec")

#######################################################################
"""Setting ROI"""
#######################################################################
# hist, bin_edges = np.histogram(Al_data, bins=60)
# bin_centers = 0.5*(bin_edges[:-1] + bin_edges[1:])
classif = GaussianMixture(n_components=7)
classif.fit(Al_data.reshape((Al_data.size, 1)))
print("Gaussian Mixture finished")
print(time.time() - start, "sec")

means_ = np.sort(np.squeeze(classif.means_))
threshold = means_[2] + 2000
binary_img = Al_data > threshold
masked_Al = np.ma.masked_less_equal(Al_data, threshold)
mask_x = np.any(binary_img, axis=0)
mask_y = np.any(binary_img, axis=1)
x1 = np.argmax(mask_x)
y1 = np.argmax(mask_y)
x2 = len(mask_x) - np.argmax(mask_x[::-1])
y2 = len(mask_y) - np.argmax(mask_y[::-1])
class TwoStageClustering:
    """
    Class to make a two-stage clustering model, where the first stage is a SOM
    network and the second stage is a clustering method such as k-means or GMM.
    """

    def __init__(self, X, W=None, map_shape=(8, 8), n_clusters=10, init_lr=0.1,
                 init_response=1, max_iter_SOM=10000, max_iter_clus=5000,
                 clus_method="kmeans", normalize_data=False, seed=0):
        # data and SOM map shape
        self.X = X
        if normalize_data:
            self.X = minmax_scale(self.X, axis=0)  # column-wise
        (self.N, self.d) = np.shape(X)
        self.map_shape = map_shape
        self.M = map_shape[0] * map_shape[1]  # number of nodes in the network
        self.W = W  # the weights of the output map

        # hyperparameters
        self.max_iter_SOM = max_iter_SOM
        self.max_iter_clus = max_iter_clus
        self.seed = seed
        self.n_clusters = n_clusters
        self.init_lr = init_lr
        self.init_response = init_response

        # first stage model
        self.model_SOM = SOM(X=self.X, map_shape=self.map_shape,
                             init_lr=self.init_lr,
                             init_response=self.init_response,
                             max_iter=self.max_iter_SOM, seed=self.seed)

        # second stage model
        self.clus_method = clus_method
        if self.clus_method == "kmeans":
            self.model_clus = KMeans(n_clusters=self.n_clusters,
                                     random_state=self.seed,
                                     algorithm="full",
                                     max_iter=self.max_iter_clus,
                                     n_init=10)
        else:
            self.model_clus = GaussianMixture(n_components=self.n_clusters,
                                              max_iter=self.max_iter_clus,
                                              n_init=10,
                                              init_params="random")

    def train(self, print_progress=True):
        """
        First trains the SOM network, then the second stage model with the
        prototypes from the SOM network.
        """
        # training first stage SOM network
        t0 = time()  # starting time training SOM
        if self.W is not None:
            self.model_SOM.map = self.W
            print("The SOM is already trained! Continuing with the "
                  "clustering method...")
        else:
            print("Start training the two stage clustering procedure with "
                  "%s..." % self.clus_method)
            self.model_SOM.train(print_progress=print_progress)
            self.W = self.model_SOM.map  # 3D array containing the M prototypes

        # fitting second stage clustering method
        t1 = time()  # starting time second stage clustering method
        print("Training %s clustering method..." % self.clus_method)
        # reshape to a (M, d) matrix
        self.model_clus.fit(self.W.reshape((self.M, self.d)))
        print("%s clustering method with %d iterations finished in %.3f "
              "seconds" % (self.clus_method, self.max_iter_clus, time() - t1))
        print("The two stage clustering procedure with %s took %.3f seconds"
              % (self.clus_method, time() - t0))

    def predict(self, X):
        """
        Predicts the labels of X with the two stage clustering procedure.
        First, get the corresponding prototype of each sample of X, then
        predict the label of the prototype with the clustering method.

        :param X: the data sample to be predicted
        :return: the predicted labels
        """
        W, indices, _ = self.model_SOM.predict(X)
        labels = self.model_clus.predict(W)
        return labels.astype(int)

    def save(self, file_name=None):
        """
        Method to save the model as a pickle file.
        """
        if file_name is None:
            print("No file name is given!")
            return
        dir_name = "Models/TwoStageClustering/"
        make_dir(dir_name)
        with open(dir_name + file_name + ".pkl", "wb") as filehandler:
            pkl.dump(self, filehandler)
clus_KMeans = cluster.KMeans(n_clusters=7, random_state=161227)
clus_KMeans.fit(X)
predicted_label_KMeans = clus_KMeans.fit_predict(X)

clus_AgglomerativeClustering = cluster.AgglomerativeClustering(n_clusters=6)
clus_AgglomerativeClustering.fit(X)
predicted_label_AgglomerativeClustering = \
    clus_AgglomerativeClustering.fit_predict(X)

clus_MiniBatchKMeans = cluster.MiniBatchKMeans(n_clusters=5,
                                               random_state=161227)
clus_MiniBatchKMeans.fit(X)
predicted_label_MiniBatchKMeans = clus_MiniBatchKMeans.fit_predict(X)

clus_GM = GaussianMixture(n_components=5, random_state=161227)
clus_GM.fit(X)
predicted_label_GM = clus_GM.fit_predict(X)

cluster_class = pd.DataFrame({
    'Station': station_name,
    'x': station_x,
    'y': station_y,
    'KMeans': predicted_label_KMeans,
    'AgglomerativeClustering': predicted_label_AgglomerativeClustering,
    'MiniBatchKMeans': predicted_label_MiniBatchKMeans,
    'GM': predicted_label_GM
})
cluster_class.to_csv('Cluster_Results.csv', index=False)
# plot_data(settle, predicted_label_KMeans)
ax.set_yticks(np.arange(0, 1., 0.1))
plt.grid()
plt.scatter(X[:, 0], X[:, 1], color='yellow')
plt.xlim((0, 1))
plt.ylim((0, 1))
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Fortnite Loot Box Locations')
plt.savefig("rawdata.png")

# Use a gaussian mixture model to identify clusters of boxes
from sklearn.mixture import GaussianMixture

RANDOM_STATE = 66
N_CLUSTERS = 28

gmm = GaussianMixture(n_components=N_CLUSTERS, random_state=RANDOM_STATE)
gmm.fit(X)
predict = gmm.predict(X)
means = gmm.means_

# Plot the cluster centers that were found and overlay them
plt.scatter(means[:, 0], means[:, 1], color='blue')
plt.savefig("clusters.png")

# Remove any clusters that are not within a 30 second run of a cluster center
from scipy.spatial.distance import cdist

TIME_SECONDS = 30
UNITS_PER_SECOND = 4.34  # Calculated empirically
units_per_second_scaled = UNITS_PER_SECOND / MAX_DIMENSION
radius = units_per_second_scaled * TIME_SECONDS
center_points = np.zeros((N_CLUSTERS, 2))
gmm = GMM(random_state=42)
Score = defaultdict(list)
adjMI = defaultdict(list)
S_homog = defaultdict(list)
S_adjMI = defaultdict(list)
S_vm = defaultdict(list)

for i in dim:
    reduced_X = FastICA(n_components=i, random_state=42).fit_transform(X_scaled)
    k = 10
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    km.fit(reduced_X)
    gmm.fit(reduced_X)
    Score['km'].append(km.score(reduced_X))
    Score['gmm'].append(gmm.score(reduced_X))
    S_homog['km'].append(
        metrics.homogeneity_score(labels, km.predict(reduced_X)))
    S_homog['gmm'].append(
        metrics.homogeneity_score(labels, gmm.predict(reduced_X)))
    S_adjMI['km'].append(
        metrics.adjusted_mutual_info_score(labels, km.predict(reduced_X)))
    S_adjMI['gmm'].append(
        metrics.adjusted_mutual_info_score(labels, gmm.predict(reduced_X)))
    S_vm['km'].append(metrics.v_measure_score(labels, km.predict(reduced_X)))
    S_vm['gmm'].append(metrics.v_measure_score(labels, gmm.predict(reduced_X)))

# plt.legend(['Train', 'Test'], loc='lower right')
ax.set_ylabel('Model Depth')
ax.grid(True)
fig.suptitle('RMS difference between reduced and original dataset',
             fontsize=12)
plt.show()

# ## GMM Classification
# We classify the reduced dataset with a GMM
#
# Doc: http://scikit-learn.org/stable/modules/mixture.html

# In[14]:

# Set-up and train the classifier:
gmm = GaussianMixture(n_components=K, covariance_type='full',
                      init_params='kmeans', max_iter=1000, tol=1e-6)
gmm.fit(Xr)  # Training on reduced data

# Extract GMM parameters:
priors = gmm.weights_      # [K,1]
centers = gmm.means_       # [K,Nc]
covars = gmm.covariances_  # [K,Nc,Nc] if 'full'

# Classify the dataset:
LABELS = gmm.predict(Xr)      # [Np,1]
POST = gmm.predict_proba(Xr)  # [Np,Nc]

# ## Time for a lot of figures

# In[15]:
# Get the data
obs_wave, obs_flux = data[:, 0], data[:, 1]

# Center the x data at zero and normalize the y data to the area of the curve
n_wave = obs_wave - obs_wave[np.argmax(obs_flux)]
n_flux = obs_flux / sum(obs_flux)

# Generate a distribution of points matching the curve
line_distribution = np.random.choice(a=n_wave, size=100000, p=n_flux)
number_points = len(line_distribution)

# Run the fit
gmm = GaussianMixture(n_components=4)
gmm.fit(np.reshape(line_distribution, (number_points, 1)))

gauss_mixt = np.array([p * norm.pdf(n_wave, mu, sd)
                       for mu, sd, p in zip(gmm.means_.flatten(),
                                            np.sqrt(gmm.covariances_.flatten()),
                                            gmm.weights_)])
gauss_mixt_t = np.sum(gauss_mixt, axis=0)

# Plot the data
fig, axis = plt.subplots(1, 1, figsize=(10, 12))
axis.plot(n_wave, n_flux, label='Normalized observed flux')
axis.plot(n_wave, gauss_mixt_t, label='4 components fit')

for i in range(len(gauss_mixt)):
    axis.plot(n_wave, gauss_mixt[i], label='Gaussian ' + str(i),
              linestyle='--')

axis.set_xlabel('normalized wavelength', fontsize=15)
axis.set_ylabel('normalized flux', fontsize=15)
axis.set_title('Sklearn GM fit', fontsize=15)
    data2 = np.random.multivariate_normal(np.array([2, 2]), cov2, 50)
    data = np.concatenate((data1, data2), axis=0)
    classes = np.array([0 for i in range(50)] + [1 for i in range(50)])
    return data, classes


x, y = generate_dataset()
plt.scatter(x[:, 0], x[:, 1], c=y)
plt.show()

n_components = 2
gauss = GaussianMixture(
    n_components=n_components, covariance_type="diag"
)  # covariance type "diag" only if the covariance matrix has values on the diagonal
gauss.fit(x)
# The model stops iterating once the means and covariances no longer improve
# appreciably (within a tolerance); converged_ is True or False.
print("Model converged: ", gauss.converged_)

covs = gauss.covariances_
means = gauss.means_
# print("Cov:\n", covs, "\n")  # covariances in their raw form
# covariance matrix of the first normal distribution (data cloud 1)
print(np.diag(covs[0]), "\n")
# covariance matrix of the second normal distribution (data cloud 2)
print(np.diag(covs[1]), "\n")
def main():
    print("Generating Part3 Plots")
    # Intention PCA
    intention_pca, y_intention = load_intention_PCA_reduced()
    em = GaussianMixture(2, random_state=1)
    pulsar_lle, y_pulsar = load_pulsar_LLE_reduced()
    kmeans = KMeans(2, random_state=1)
    em.fit(intention_pca)
    kmeans.fit(pulsar_lle)
    kmeans_clusters = kmeans.predict(pulsar_lle)
    em_probs = em.predict_proba(intention_pca)[:, 0]
    fig = plt.figure(figsize=(10, 8))
    ax1 = fig.add_subplot(221, projection="3d")
    ax2 = fig.add_subplot(222, projection="3d")
    ax3 = fig.add_subplot(223, projection="3d")
    ax4 = fig.add_subplot(224, projection="3d")
    ax1.scatter(
        intention_pca[:, 0],
        intention_pca[:, 1],
        zs=intention_pca[:, 2],
        c=1 - em_probs,
        alpha=0.3,
    )
    ax2.scatter(
        pulsar_lle[:, 0],
        pulsar_lle[:, 1],
        zs=pulsar_lle[:, 2],
        c=1 - kmeans_clusters,
        alpha=0.3,
    )
    ax3.scatter(
        intention_pca[:, 0],
        intention_pca[:, 1],
        zs=intention_pca[:, 2],
        c=y_intention,
        alpha=0.3,
    )
    ax4.scatter(
        pulsar_lle[:, 0], pulsar_lle[:, 1], zs=pulsar_lle[:, 2],
        c=y_pulsar, alpha=0.3
    )
    ax1.set_xlabel("PCA Dimension 1")
    ax1.set_ylabel("PCA Dimension 2")
    ax1.set_zlabel("PCA Dimension 3")
    ax2.set_xlabel("LLE Dimension 1")
    ax2.set_ylabel("LLE Dimension 2")
    ax2.set_zlabel("LLE Dimension 3")
    ax3.set_xlabel("PCA Dimension 1")
    ax3.set_ylabel("PCA Dimension 2")
    ax3.set_zlabel("PCA Dimension 3")
    ax4.set_xlabel("LLE Dimension 1")
    ax4.set_ylabel("LLE Dimension 2")
    ax4.set_zlabel("LLE Dimension 3")
    ax1.set_title("EM-Predicted Clusters on PCA")
    ax2.set_title("K-Means Predicted Clusters on LLE")
    ax3.set_title("True Labels in PCA Embedding")
    ax4.set_title("True Labels in LLE Embedding")
    plot_dir = os.path.join("plots", "part3")
    plt.savefig(os.path.join(plot_dir, "BestClustering.png"))
    plt.close()

    intention_pca_datafile = get_datafile_path("intention", "pca")
    intention_ica_datafile = get_datafile_path("intention", "ica")
    intention_rp_datafile = get_datafile_path("intention", "rp")
    intention_lle_datafile = get_datafile_path("intention", "lle")
    pulsar_pca_datafile = get_datafile_path("pulsar", "pca")
    pulsar_ica_datafile = get_datafile_path("pulsar", "ica")
    pulsar_rp_datafile = get_datafile_path("pulsar", "rp")
    pulsar_lle_datafile = get_datafile_path("pulsar", "lle")

    save_clustering_plots(intention_pca_datafile, "intention_pca", plot_dir)
    save_clustering_plots(intention_ica_datafile, "intention_ica", plot_dir)
    save_clustering_plots(intention_rp_datafile, "intention_rp", plot_dir)
    save_clustering_plots(intention_lle_datafile, "intention_lle", plot_dir)
    save_clustering_plots(pulsar_pca_datafile, "pulsar_pca", plot_dir)
    save_clustering_plots(pulsar_ica_datafile, "pulsar_ica", plot_dir)
    save_clustering_plots(pulsar_rp_datafile, "pulsar_rp", plot_dir)
    save_clustering_plots(pulsar_lle_datafile, "pulsar_lle", plot_dir)

    print("Intention pca results")
    print_evaluation_stats(intention_pca_datafile)
    print()
    print("Intention ica results")
    print_evaluation_stats(intention_ica_datafile, kmeans_clusters=3)
    print()
    print("Intention rp results")
    print_evaluation_stats(intention_rp_datafile)
    print()
    print("Intention lle results")
    print_evaluation_stats(intention_lle_datafile)
    print()
    print("pulsar pca results")
    print_evaluation_stats(pulsar_pca_datafile)
    print()
    print("pulsar ica results")
    print_evaluation_stats(pulsar_ica_datafile)
    print()
    print("pulsar rp results")
    print_evaluation_stats(pulsar_rp_datafile)
    print()
    print("pulsar lle results")
    print_evaluation_stats(pulsar_lle_datafile)
    print()

    pulsar_PCA_X, pulsar_PCA_y = load_pulsar_PCA_reduced()
    intention_PCA_X, intention_PCA_y = load_intention_PCA_reduced()
    pulsar_ICA_X, pulsar_ICA_y = load_pulsar_ICA_reduced()
    intention_ICA_X, intention_ICA_y = load_intention_ICA_reduced()
    pulsar_RP_X, pulsar_RP_y = load_pulsar_RP_reduced()
    intention_RP_X, intention_RP_y = load_intention_RP_reduced()
    pulsar_LLE_X, pulsar_LLE_y = load_pulsar_LLE_reduced()
    intention_LLE_X, intention_LLE_y = load_intention_LLE_reduced()

    datafile = os.path.join(data_folder, "intention_pca_clustering.json")
    print_clustering_stats(intention_LLE_X, intention_LLE_y)
def fit_new(self, x, label):
    self.y.append(label)
    gmm = GMM(self.gmm_order)
    gmm.fit(x)
    self.gmms.append(gmm)
data1 = np.random.multivariate_normal(mu1_fact, cov_fact, 400)
mu2_fact = (2, 2, 1)
cov_fact = np.identity(3)
data2 = np.random.multivariate_normal(mu2_fact, cov_fact, 100)
data = np.vstack((data1, data2))
y = np.array([True] * 400 + [False] * 100)

if style == 'sklearn':
    g = GaussianMixture(n_components=2, covariance_type='full',
                        tol=1e-6, max_iter=1000)
    """
    'full'      (each component has its own general covariance matrix),
    'tied'      (all components share the same general covariance matrix),
    'diag'      (each component has its own diagonal covariance matrix),
    'spherical' (each component has its own single variance).
    """
    g.fit(data)
    print('component weight:\t', g.weights_[0])
    print('means:\n', g.means_, '\n')
    print('covariances:\n', g.covariances_, '\n')
    mu1, mu2 = g.means_
    sigma1, sigma2 = g.covariances_
else:
    num_iter = 100
    n, d = data.shape
    # random initialization:
    # mu1 = np.random.standard_normal(d)
    # mu2 = np.random.standard_normal(d)
    mu1 = data.min(axis=0)
    mu2 = data.max(axis=0)
                  delimiter=',', skiprows=1)
print(data.shape)
y, x = np.split(data, [1, ], axis=1)
x, x_test, y, y_test = train_test_split(x, y, train_size=0.6, random_state=0)
gmm = GaussianMixture(n_components=2, covariance_type='full', random_state=0)
x_min = np.min(x, axis=0)
x_max = np.max(x, axis=0)
gmm.fit(x)
print('means =\n', gmm.means_)
print('covariances =\n', gmm.covariances_)
y_hat = gmm.predict(x)
y_test_hat = gmm.predict(x_test)
change = (gmm.means_[0][0] > gmm.means_[1][0])
if change:
    z = y_hat == 0
    y_hat[z] = 1
    y_hat[~z] = 0
    z = y_test_hat == 0
    y_test_hat[z] = 1
    y_test_hat[~z] = 0
acc = np.mean(y_hat.ravel() == y.ravel())
acc_test = np.mean(y_test_hat.ravel() == y_test.ravel())
acc_str = 'training set accuracy: %.2f%%' % (acc * 100)
kmeans = KMeans(n_clusters=4, init='k-means++', max_iter=500, n_init=20,
                random_state=0)
y_pred = kmeans.fit_predict(X)

plt.scatter(X['Age'], X['Spending Score (1-100)'])
plt.ylabel("Spending Score")
plt.xlabel("Age")
plt.title("Clusters found by KMeans")
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            s=300, c='black')
plt.show()

from sklearn.mixture import GaussianMixture

n_clusters = 5
gmm_model = GaussianMixture(n_components=n_clusters, random_state=5)
gmm_model.fit(X)
cluster_labels = gmm_model.predict(X)
X = pd.DataFrame(X)
X['cluster'] = cluster_labels

color = ['blue', 'green', 'red', 'black', 'yellow']
for k in range(0, n_clusters):
    data = X[X["cluster"] == k].copy()
    plt.scatter(data["Age"], data["Spending Score (1-100)"], c=color[k])
plt.title("Clusters Identified by Gaussian Mixture Model")
plt.ylabel("Spending Score (1-100)")
plt.xlabel("Age")
l = 256
im = np.zeros((l, l))
points = l * np.random.random((2, n ** 2))
im[(points[0]).astype(int), (points[1]).astype(int)] = 1
im = ndimage.gaussian_filter(im, sigma=l / (4. * n))

mask = (im > im.mean()).astype(float)
img = mask + 0.3 * np.random.randn(*mask.shape)

hist, bin_edges = np.histogram(img, bins=60)
bin_centers = 0.5 * (bin_edges[:-1] + bin_edges[1:])

classif = GaussianMixture(n_components=2)
classif.fit(img.reshape((img.size, 1)))
threshold = np.mean(classif.means_)  # midpoint between the two component means
binary_img = img > threshold

plt.figure(figsize=(11, 4))

plt.subplot(131)
plt.imshow(img)
plt.axis('off')

plt.subplot(132)
plt.plot(bin_centers, hist, lw=2)
plt.axvline(0.5, color='r', ls='--', lw=2)
plt.text(0.57, 0.8, 'histogram', fontsize=20, transform=plt.gca().transAxes)
plt.yticks([])
visualizer.fit(results)  # Fit the data to the visualizer
# Finalize and render the figure
visualizer.show(
    outpath="charts/creditcards.k-means.Randomized.SilhouetteVisualizer.png")

lowest_bic = np.infty
bic = []
n_components_range = range(1, 4)
cv_types = ['spherical', 'tied', 'diag', 'full']
for cv_type in cv_types:
    for n_components in n_components_range:
        # Fit a Gaussian mixture with EM
        gmm = GaussianMixture(n_components=n_components,
                              covariance_type=cv_type)
        gmm.fit(results)
        bic.append(gmm.bic(results))
        if bic[-1] < lowest_bic:
            lowest_bic = bic[-1]
            best_gmm = gmm

bic = np.array(bic)
color_iter = itertools.cycle(
    ['navy', 'turquoise', 'cornflowerblue', 'darkorange'])
clf = best_gmm
bars = []

# Plot the BIC scores
plt.figure(figsize=(8, 6))
spl = plt.subplot(2, 1, 1)
for i, (cv_type, color) in enumerate(zip(cv_types, color_iter)):
def main():
    parser = argparse.ArgumentParser(
        description='Train VaDE with MNIST dataset',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--epochs', '-e',
                        help='Number of epochs.',
                        type=int, default=20)
    parser.add_argument('--gpu', '-g',
                        help='GPU id. (Negative number indicates CPU)',
                        type=int, default=-1)
    parser.add_argument('--learning-rate', '-l',
                        help='Learning Rate.',
                        type=float, default=0.001)
    parser.add_argument('--batch-size', '-b',
                        help='Batch size.',
                        type=int, default=128)
    parser.add_argument('--out', '-o',
                        help='Output path.',
                        type=str, default='./vade_parameter.pth')
    args = parser.parse_args()

    if_use_cuda = torch.cuda.is_available() and args.gpu >= 0
    device = torch.device('cuda:{}'.format(args.gpu) if if_use_cuda else 'cpu')

    dataset = datasets.MNIST('./data', train=True, download=True,
                             transform=transforms.ToTensor())
    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              num_workers=2,
                                              pin_memory=if_use_cuda)

    pretrain_model = AutoEncoderForPretrain(784, 10).to(device)

    optimizer = torch.optim.Adam(pretrain_model.parameters(),
                                 lr=args.learning_rate)

    for epoch in range(1, args.epochs + 1):
        train(pretrain_model, data_loader, optimizer, device, epoch)

    with torch.no_grad():
        x = torch.cat([data[0] for data in dataset]).view(-1, 784).to(device)
        z = pretrain_model.encode(x).cpu()

    pretrain_model = pretrain_model.cpu()
    state_dict = pretrain_model.state_dict()

    gmm = GaussianMixture(n_components=10, covariance_type='diag')
    gmm.fit(z)

    model = VaDE(N_CLASSES, 784, 10)
    model.load_state_dict(state_dict, strict=False)
    model._pi.data = torch.log(torch.from_numpy(gmm.weights_)).float()
    model.mu.data = torch.from_numpy(gmm.means_).float()
    model.logvar.data = torch.log(torch.from_numpy(gmm.covariances_)).float()
    torch.save(model.state_dict(), args.out)
t = PrettyTable(['Method', 'Accuracy'])

km = KMeans(k, n_init=5)
km.fit(Y)
zh_kmeans = km.labels_
x1_kmeans = X[np.where(zh_kmeans == 0)][:, np.newaxis]
x2_kmeans = X[np.where(zh_kmeans == 1)][:, np.newaxis]
x1_mu_kmeans, x2_mu_kmeans = km.cluster_centers_
x1_mu_kmeans, x2_mu_kmeans = x1_mu_kmeans[0], x2_mu_kmeans[0]
x1_var_kmeans, x2_var_kmeans = np.var(x1_kmeans), np.var(x2_kmeans)
acc_kmeans = metric.accuracy(z, zh_kmeans)
t.add_row(['k-means', acc_kmeans])

gm = GMM(k, n_init=5, init_params="kmeans")
gm.fit(Y)
zh_gmm = gm.predict(Y)
# x1_gmm = X[np.where(zh_gmm==0)][:, np.newaxis]
# x2_gmm = X[np.where(zh_gmm==1)][:, np.newaxis]
x1_mu_gmm, x2_mu_gmm = gm.means_
x1_mu_gmm, x2_mu_gmm = x1_mu_gmm[0], x2_mu_gmm[0]
x1_var_gmm, x2_var_gmm = gm.covariances_
x1_var_gmm, x2_var_gmm = x1_var_gmm[0][0], x2_var_gmm[0][0]
acc_gmm = metric.accuracy(z, zh_gmm)
t.add_row(['gmm', acc_gmm])

G = eclust.kernel_matrix(Y, lambda x, y: np.linalg.norm(x - y))
zh_kgroups = wrapper.kernel_kgroups(k, Y, G)
x1_kgroups = X[np.where(zh_kgroups == 0)][:, np.newaxis]
x2_kgroups = X[np.where(zh_kgroups == 1)][:, np.newaxis]
acc_kgroups = metric.accuracy(z, zh_kgroups)
def fit_gmm(
    max_components,
    n_distances,
    atoms,
    distances,
    regularization_type="bic",
    covariance_type="diag",
):
    """
    Fit a GMM to a set of distances.

    This routine will fit a Gaussian mixture model from a set of input
    distances using sklearn_. The resulting set of parameters can be used
    to initialize a `GMMDistanceRestraint` in a MELD simulation.

    .. _sklearn: http://scikit-learn.org/stable/modules/mixture.html

    Parameters
    ----------
    max_components: int
        Maximum number of components to use in fitting GMM.
    n_distances: int
        Number of distances involved in GMM.
    atoms: list of (int, str, int, str) tuples
        The atoms that are involved in each distance are specified as a
        list of `n_distances` tuples, each of the form (r1, n1, r2, n2),
        where r1, r2 are the integer residue indices starting from one,
        and n1, n2 are the atom names.
    distances: array_like(n_dim=2)
        An (n_samples, n_distances) array of distances (in nm) to fit.
    regularization_type: str
        The type of regularization to use, options are "bic" and
        "dirichlet".
    covariance_type: str
        The form of the covariance matrix, options are "diag" and "full".

    Returns
    -------
    GMMParams
        The fit parameters, which can be used to initialize a
        `meld.system.restraints.GMMDistanceRestraint` using
        ``GMMDistanceRestraint.from_params``.

    Notes
    -----
    There are two ways to regularize in order to prevent over fitting.

    ``regularization_type="bic"`` will use the Bayesian information
    criterion to penalize models that have more parameters. When using
    ``bic``, the final number of components in the model will be less
    than or equal to `max_components`.

    ``regularization_type="dirichlet"`` will use a Dirichlet process prior
    on the weight distributions. The final number of components in the
    model will always be equal to `max_components`, but most of the
    weights will be small.

    There are two forms for the covariance matrix, which differ in the
    number of parameters and expressiveness.

    ``covariance_type="diag"`` will fit using a diagonal covariance
    matrix. This has few parameters, but does not capture correlations
    between input distances. Typically, choosing ``"diag"`` will result
    in a model with more components.

    ``covariance_type="full"`` will fit using a full representation of
    the covariance matrix. This captures correlations between input
    distances, but has far more parameters and is potentially prone to
    over fitting.
    """
    #
    # Constants
    #
    N_INIT = 25
    MAX_ITER = 1000
    KFOLD_SPLITS = 5
    REG_COVAR = 1e-4
    RANDOMSEARCH_TRIALS = 32

    #
    # Check the inputs
    #
    if distances.shape[1] != n_distances:
        raise ValueError("distances must have shape (n_samples, n_distances)")
    if len(atoms) != n_distances:
        raise ValueError(
            "atoms must be a list of (ind1, name1, ind2, name2) of "
            "length n_components"
        )
    if regularization_type not in ["bic", "dirichlet"]:
        raise ValueError('regularization_type must be one of ["bic", "dirichlet"]')
    if covariance_type not in ["diag", "full"]:
        raise ValueError('covariance_type must be one of ["diag", "full"]')
    if max_components < 1:
        raise ValueError("max_components must be >= 1")
    if max_components > 32:
        raise ValueError("MELD supports a maximum of 32 GMM components")

    #
    # Create and fit the model
    #
    if regularization_type == "bic":
        # BIC fit
        # Search different values of n_components to find the minimal BIC.
        models = []
        for i in range(1, max_components + 1):
            g = GaussianMixture(
                n_components=i,
                n_init=N_INIT,
                max_iter=MAX_ITER,
                covariance_type=covariance_type,
                reg_covar=REG_COVAR,
            )
            g.fit(distances)
            models.append((g.bic(distances), g))
        gmm = sorted(models, key=lambda x: x[0])[0][1]
    else:
        # Dirichlet process fit
        # use RandomizedSearchCV to optimize hyperparameters
        params = {
            "weight_concentration_prior": LogUniformSampler(1e-6, 10),
            "mean_precision_prior": LogUniformSampler(1, 10),
        }
        model = BayesianGaussianMixture(
            max_components,
            n_init=N_INIT,
            max_iter=MAX_ITER,
            covariance_type=covariance_type,
            reg_covar=REG_COVAR,
        )
        rs = RandomizedSearchCV(
            model,
            param_distributions=params,
            n_iter=RANDOMSEARCH_TRIALS,
            cv=KFold(n_splits=KFOLD_SPLITS, shuffle=True),
        )
        rs.fit(distances)
        gmm = rs.best_estimator_

    # turn the vector representation of the diagonal into a full
    # precision matrix
    if covariance_type == "diag":
        precisions = gmm.precisions_
        assert len(precisions.shape) == 2
        new_precisions = []
        for i in range(precisions.shape[0]):
            new_precisions.append(np.diag(precisions[i, :]))
        precisions = np.array(new_precisions)
    else:
        precisions = gmm.precisions_

    # convert the list of atoms into the correct form
    new_atoms = []
    for r1, n1, r2, n2 in atoms:
        new_atoms.append((r1, n1))
        new_atoms.append((r2, n2))

    # Return the parameters for a GMM
    return GMMParams(
        n_components=gmm.weights_.shape[0],
        n_distances=n_distances,
        atoms=new_atoms,
        weights=gmm.weights_,
        means=gmm.means_,
        precisions=precisions,
    )
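# Usage sketch (hypothetical inputs, not from the original source; atom
# tuples follow the (r1, n1, r2, n2) convention described in the docstring,
# and the distances are two overlapping synthetic modes):
import numpy as np

rng = np.random.RandomState(0)
distances_demo = np.concatenate([
    rng.normal(loc=[0.5, 0.8], scale=0.05, size=(500, 2)),
    rng.normal(loc=[0.7, 0.6], scale=0.05, size=(500, 2)),
])
atoms_demo = [(1, "CA", 10, "CA"), (2, "CA", 12, "CA")]
params = fit_gmm(max_components=4, n_distances=2, atoms=atoms_demo,
                 distances=distances_demo, regularization_type="bic")
print(params.n_components, params.weights)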
X_train_tsne = tsne.fit_transform(X_train)
X_score_tsne = tsne.fit_transform(X_score)
# ====== lda ====== #
lda = LinearDiscriminantAnalysis(n_components=NUM_DIM)
lda.fit(X_train, y_train)
X_train_lda = lda.transform(X_train)
X_score_lda = lda.transform(X_score)
# ====== plda ====== #
plda = PLDA(n_phi=NUM_DIM, random_state=SEED)
plda.fit(X_train, y_train)
X_train_plda = plda.predict_log_proba(X_train)
X_score_plda = plda.predict_log_proba(X_score)
# ====== gmm ====== #
gmm = GaussianMixture(n_components=NUM_DIM, max_iter=100,
                      covariance_type='full', random_state=SEED)
gmm.fit(X_train)
# note: _estimate_weighted_log_prob is a private sklearn method
X_train_gmm = gmm._estimate_weighted_log_prob(X_train)
X_score_gmm = gmm._estimate_weighted_log_prob(X_score)
# ====== rbm ====== #
rbm = BernoulliRBM(n_components=NUM_DIM, batch_size=8, learning_rate=0.0008,
                   n_iter=8, verbose=2, random_state=SEED)
rbm.fit(X_train)
X_train_rbm = rbm.transform(X_train)
X_score_rbm = rbm.transform(X_score)
# ===========================================================================
# Deep Learning
# ===========================================================================
# ===========================================================================
# Visualize
# ===========================================================================
plt.close()

n_components = range(2, 31)
Cancer_EM_aic = []
Cancer_EM_bic = []
Cancer_EM_score = []
Cancer_EM_homogeneity_score = []
Cancer_EM_complete_score = []
Cancer_EM_log = []
Cancer_EM_train_acc = []
Cancer_EM_cv_acc = []

for i in n_components:
    print(i)
    EM.set_params(random_state=7641, n_components=i)
    EM.fit(Cancer_X)
    Cancer_EM_score.append(EM.score(Cancer_X_train))
    Cancer_EM_bic.append(EM.bic(Cancer_X_train))
    Cancer_EM_aic.append(EM.aic(Cancer_X_train))
    Cancer_EM_log.append(
        silhouette_score(Cancer_X_train, EM.predict(Cancer_X_train)))
    Cancer_EM_homogeneity_score.append(
        homogeneity_score(Cancer_y_train, EM.predict(Cancer_X_train)))
    Cancer_EM_complete_score.append(
        completeness_score(Cancer_y_train, EM.predict(Cancer_X_train)))
    Cancer_scores = cross_validate(EM, Cancer_X_train, Cancer_y_train,
                                   cv=5,
                                   scoring=make_scorer(my_custom_acc,
                                                       greater_is_better=True),
start = timer()
zh = kernel_kmeans(k, G, Z0, W)
end = timer()
Zh = ztoZ(zh)
t.add_row(["kernel k-means (k-means++)", metric.accuracy(z, zh),
           objective(Zh, G, W), end - start])

start = timer()
zh = kernel_kmeans(k, G, Z1, W)
end = timer()
Zh = ztoZ(zh)
t.add_row(["kernel k-means (spectral)", metric.accuracy(z, zh),
           objective(Zh, G, W), end - start])

start = timer()
gmm = GMM(k)
gmm.fit(X)
zh = gmm.predict(X)
end = timer()
t.add_row(["GMM", metric.accuracy(z, zh), "-", end - start])

start = timer()
km = KMeans(k)
zh = km.fit_predict(X)
end = timer()
t.add_row(["k-means", metric.accuracy(z, zh), "-", end - start])

print(t)
def cluster_silh_plot(prefix, clustermethod, drmethod, range_n_clusters,
                      X, plotdim, seed=seed):
    silhouette_avgs = []
    sample_silhouette_nvalues = []
    cluster_nlabels = []
    clusterers = []
    cluster_scores = ["method,drmethod,nclusters,score"]
    for n_clusters in range_n_clusters:
        # Initialize the clusterer with n_clusters value and a random
        # generator seed for reproducibility.
        if clustermethod == 'GM':
            name = 'GaussianMixture'
            clusterer = GaussianMixture(n_components=n_clusters,
                                        random_state=seed)
        if clustermethod == 'KM':
            name = 'KMeans'
            clusterer = KMeans(n_clusters=n_clusters, random_state=seed)
        clusterers.append(clusterer)

        # Predict cluster labels
        cluster_labels = clusterer.fit(X).predict(X)
        cluster_nlabels.append(cluster_labels)

        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the
        # formed clusters.
        silhouette_avg = silhouette_score(X, cluster_labels)
        silhouette_avgs.append(silhouette_avg)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)
        cluster_scores.append(
            "%s,%s,%d,%.10f" %
            (clustermethod, drmethod, n_clusters, silhouette_avg))

        # Compute the silhouette scores for each sample
        sample_silhouette_nvalues.append(silhouette_samples(X, cluster_labels))

    highest_score = -1
    n_clusters = None
    cluster_labels = None
    sample_silhouette_values = None
    silhouette_avg = None
    clusterer = None
    for i, v in enumerate(silhouette_avgs):
        if v > highest_score:
            n_clusters = range_n_clusters[i]
            silhouette_avg = silhouette_avgs[i]
            sample_silhouette_values = sample_silhouette_nvalues[i]
            cluster_labels = cluster_nlabels[i]
            clusterer = clusterers[i]
            highest_score = v
    print("highest silhouette score = %.10f" % (silhouette_avg))
    print("n_clusters with highest score = %d" % (n_clusters))

    print("plotting...")
    figname = "%s-%s-%s-clusters.png" % (label.replace(" ", "-"),
                                         clustermethod, drmethod)
    plot_clusters_save(prefix, clustermethod, name, X, cluster_labels,
                       n_clusters, plotdim, figname)
    figname = "%s-%s-%s-%d.png" % (label.replace(" ", "-"),
                                   clustermethod, drmethod, n_clusters)
    plot_silh_save(prefix, clustermethod, name, n_clusters, X, cluster_labels,
                   clusterer, silhouette_avg, sample_silhouette_values,
                   figname)
    with open('%s-%s-silhscores.csv' % (prefix.replace(" ", "-"), drmethod),
              "w") as f:
        for line in cluster_scores:
            f.write("%s\n" % (line))
    return cluster_labels, silhouette_avgs
for max_iter in [50, 100, 200, 300]:
    params = {'n_clusters': n_clusters, 'max_iter': max_iter}
    k_means = KMeans(**params)
    k_means.fit(df)
    df['k_means_id'] = k_means.labels_
    plt.scatter(df['x'], df['y'], c=df['k_means_id'], alpha=0.5)
    text = "K-means, n-clusters: " + str(n_clusters) + \
           ", max_iter: " + str(max_iter)
    plt.title(text)
    plt.show()

# EM clustering ==========================================
from sklearn.mixture import GaussianMixture

for n_components in [2, 3, 4, 5, 6]:
    for max_iter in [50, 100, 200, 300]:
        params = {'n_components': n_components, 'max_iter': max_iter}
        gmm = GaussianMixture(**params)
        gmm.fit(df)
        y_predict = gmm.predict(df)
        df['EM_id'] = y_predict
        plt.figure(figsize=(8, 8))
        plt.scatter(df['x'], df['y'], c=df['EM_id'], alpha=0.5)
        text = "EM, n_components: " + str(n_components) + \
               ", max_iter: " + str(max_iter)
        plt.title(text)
        plt.show()
sm_init[sm_init == 0] = 1 sm_final[sm_final == 0] = 1 init = reverse_histogram(np.log10(sm_init)) final = reverse_histogram(np.log10(sm_final)) #init = np.vstack((lf_flat, hf_i_flat)).T #final = np.vstack((lf_flat, hf_f_flat)).T m_init = np.array([[ 0., 0.], [ 54., 10.], [ 80., 75.]]) gmm = GaussianMixture(n_components=3, means_init=m_init) gmm.fit(init) means_i = gmm.means_ cov_i = gmm.covariances_ m_final = np.array([[ 0., 0.], [ 42., 17.], [ 80., 75.]]) gmm = GaussianMixture(n_components=3, means_init=m_final) gmm.fit(final) means_f = gmm.means_ cov_f = gmm.covariances_ plot_log_histogram(histo_init, means_i, vmax=100, interp="bicubic") pl.scatter(m_init[:,0], m_init[:,1], s=100, marker="^") plot_log_histogram(histo_final, means_f, vmax=100, interp="bicubic") pl.scatter(m_final[:,0], m_final[:,1], s=100, marker="^")
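# Sketch of why means_init helps above: with known approximate mode
# locations, seeding EM keeps components from swapping or merging between
# the "init" and "final" fits, so corresponding components can be compared
# directly. Illustrative synthetic data only.
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
blob_a = rng.normal([0., 0.], 2., size=(200, 2))
blob_b = rng.normal([54., 10.], 3., size=(200, 2))
pts = np.vstack([blob_a, blob_b])
gmm = GaussianMixture(n_components=2,
                      means_init=np.array([[0., 0.], [54., 10.]]))
gmm.fit(pts)
print(gmm.means_)  # components typically stay in the seeded order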
# Plot real clusters
plt.subplot(2, 2, 1)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[y.Targets], s=40)
plt.title('Real Clusters')
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')

# Plot K-means clusters
plt.subplot(2, 2, 2)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[model.labels_], s=40)
plt.title('K-Means Clustering')
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')

# General EM for GMM: standardise the features before fitting.
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
scaler.fit(X)
xsa = scaler.transform(X)
xs = pd.DataFrame(xsa, columns=X.columns)

from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=3)
gmm.fit(xs)
gmm_y = gmm.predict(xs)

plt.subplot(2, 2, 3)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[gmm_y], s=40)
plt.title('GMM Clustering')
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')

print('Observation: GMM clustering (fit with the EM algorithm) matched the '
      'true labels more closely than K-Means.')
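# To back the printed observation with a number rather than a visual
# comparison, a label-permutation-invariant score such as the adjusted Rand
# index can be computed for both clusterings (an addition, not part of the
# original script; `model` is the fitted KMeans from above).
from sklearn.metrics import adjusted_rand_score

print("K-Means ARI:", adjusted_rand_score(y.Targets, model.labels_))
print("GMM ARI:    ", adjusted_rand_score(y.Targets, gmm_y))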
def fit_gmm_to_points(points,
                      n_components,
                      mdl,
                      ps=[],
                      num_iter=100,
                      covariance_type='full',
                      min_covar=0.001,
                      init_centers=[],
                      force_radii=-1.0,
                      force_weight=-1.0,
                      mass_multiplier=1.0):
    """Fit a GMM to some points. Returns the score and the Akaike score.

    The Akaike information criterion measures the relative quality of the
    fitted GMM, taking into account both parsimony and goodness of fit.
    If no particles are provided, they will be created.

    points:          list of coordinates (python)
    n_components:    number of gaussians to create
    mdl:             IMP Model
    ps:              list of particles to be decorated. if empty, will add
    num_iter:        number of EM iterations
    covariance_type: covariance type for the gaussians.
                     options: 'full', 'diag', 'spherical'
    min_covar:       assign a minimum value to the covariance term. That is
                     used to get more spherically shaped gaussians
    init_centers:    initial coordinates of the GMM
    force_radii:     fix the radii (spheres only)
    force_weight:    fix the weights
    mass_multiplier: multiply the weights of all the gaussians by this value
    """
    new_sklearn = False
    try:
        from sklearn.mixture import GMM
    except ImportError:
        from sklearn.mixture import GaussianMixture
        new_sklearn = True

    print('creating GMM with n_components', n_components, 'n_iter', num_iter,
          'covar type', covariance_type)
    if new_sklearn:
        # aic() calls size() on points, so it needs to be a numpy array,
        # not a list
        points = np.array(points)
        weights_init = precisions_init = None
        if force_radii != -1.0:
            print('warning: radii can no longer be forced, but setting '
                  'initial values to ', force_radii)
            precisions_init = np.array([[1. / force_radii] * 3
                                        for i in range(n_components)])
        if force_weight != -1.0:
            print('warning: weights can no longer be forced, but setting '
                  'initial values to ', force_weight)
            weights_init = np.array([force_weight] * n_components)

        gmm = GaussianMixture(n_components=n_components,
                              max_iter=num_iter,
                              covariance_type=covariance_type,
                              weights_init=weights_init,
                              precisions_init=precisions_init,
                              means_init=None if init_centers == []
                                              else init_centers)
    else:
        params = 'm'
        init_params = 'm'
        if force_radii == -1.0:
            params += 'c'
            init_params += 'c'
        else:
            covariance_type = 'spherical'
            print('forcing spherical with radii', force_radii)
        if force_weight == -1.0:
            params += 'w'
            init_params += 'w'
        else:
            print('forcing weights to be', force_weight)

        gmm = GMM(n_components=n_components, n_iter=num_iter,
                  covariance_type=covariance_type, min_covar=min_covar,
                  params=params, init_params=init_params)
        if force_weight != -1.0:
            gmm.weights_ = np.array([force_weight] * n_components)
        if force_radii != -1.0:
            gmm.covars_ = np.array([[force_radii] * 3
                                    for i in range(n_components)])
        if init_centers != []:
            gmm.means_ = init_centers

    print('fitting')
    model = gmm.fit(points)
    score = gmm.score(points)
    akaikescore = model.aic(points)

    # convert format to core::Gaussian
    covars = gmm.covariances_ if new_sklearn else gmm.covars_
    for ng in range(n_components):
        covar = covars[ng]
        if covar.size == 3:
            covar = np.diag(covar).tolist()
        else:
            covar = covar.tolist()
        center = list(gmm.means_[ng])
        weight = mass_multiplier * gmm.weights_[ng]
        if ng >= len(ps):
            ps.append(IMP.Particle(mdl))
        shape = IMP.algebra.get_gaussian_from_covariance(
            covar, IMP.algebra.Vector3D(center))
        g = IMP.core.Gaussian.setup_particle(ps[ng], shape)
        IMP.atom.Mass.setup_particle(ps[ng], weight)
        IMP.core.XYZR.setup_particle(ps[ng], sqrt(max(g.get_variances())))
    return (score, akaikescore)
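# The try/except import above distills to this small compatibility shim.
# Note it prefers the modern GaussianMixture API (the reverse of the
# function above) and translates the renamed arguments (n_iter -> max_iter,
# min_covar -> reg_covar) only when falling back to the legacy GMM class.
# A sketch, not part of the original function.
def make_gmm(n_components, num_iter=100, covariance_type='full',
             min_covar=0.001):
    try:
        from sklearn.mixture import GaussianMixture
        return GaussianMixture(n_components=n_components, max_iter=num_iter,
                               covariance_type=covariance_type,
                               reg_covar=min_covar)
    except ImportError:
        from sklearn.mixture import GMM
        return GMM(n_components=n_components, n_iter=num_iter,
                   covariance_type=covariance_type, min_covar=min_covar)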
# Using the elbow method to find the optimal number of clusters wcss = [] for i in range(1, 15): kmeans = KMeans(n_clusters=i, init='k-means++', random_state=1) kmeans.fit(X) wcss.append(kmeans.inertia_) plt.plot(range(1, 15), wcss) plt.title('Finding the Best K: The Elbow Method') plt.xlabel('Number of clusters') plt.ylabel('WCSS') plt.show() # Fitting K-Means to the dataset (4 clusters) kmeans = KMeans(n_clusters=4, init='k-means++', random_state=1) y_kmeans = kmeans.fit_predict(X) # Check against known classifications cm = confusion_matrix(y_kmeans, Y) print(Y_Results) print(pd.DataFrame(cm)) # Run EM using GaussianMixture (4 clusters) EM = GaussianMixture(n_components=4, random_state=1) fit = EM.fit(X) labels = fit.predict(X) # Generate confusion matrix to compare to actual results cm = confusion_matrix(labels, Y) print(Y_Results) print(pd.DataFrame(cm))
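# Cluster ids are arbitrary, so the raw confusion matrices above depend on
# which cluster happened to get which id. One way (not in the original
# script) to align them before comparing is the Hungarian algorithm over the
# confusion matrix; this sketch assumes Y is integer-encoded as 0..k-1, to
# match the cluster ids.
import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import confusion_matrix

def align_labels(pred, truth):
    cm = confusion_matrix(pred, truth)
    row_ind, col_ind = linear_sum_assignment(-cm)  # maximize matched counts
    mapping = dict(zip(row_ind, col_ind))
    return np.array([mapping[p] for p in pred])

print(pd.DataFrame(confusion_matrix(align_labels(labels, Y), Y)))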
# Run DBSCAN with the best eps found above
db = DBSCAN(eps=eps, min_samples=3).fit(XYMatrix)
db_label = np.array([i + 1 for i in db.labels_])  # shift so noise (-1) becomes 0
score = silhouette_score(XYMatrix, db_label)
print(score)
writeHTML(clusterType="DBSCAN_final", clusterLabel=db_label)

# Compare K-means clustering with GMM clustering
kmean_classes = len(np.unique(cluster_labels))

# GMM clustering, initialised at the K-means cluster centroids
n_components = kmean_classes
gmm = GaussianMixture(n_components=kmean_classes, max_iter=20, random_state=0)
gmm.means_init = np.array(
    [XYMatrix[cluster_labels == i].mean(axis=0) for i in range(kmean_classes)])
gmm.fit(XYMatrix)
gmm_labels = gmm.predict(XYMatrix)

# Using the K-means labels as reference, compute the GMM agreement rate
train_accuracy = np.mean(gmm_labels.ravel() == cluster_labels.ravel()) * 100
print("gmm - kmeans accuracy:", train_accuracy)

# Drop the points DBSCAN labelled as noise
no_noise_matrix = np.array(XYMatrix[db_label != 0])
no_noise_label = np.array(db_label[db_label != 0])
dbscan_class = len(np.unique(no_noise_label))
gmm = GaussianMixture(n_components=dbscan_class, random_state=0)
gmm.means_init = np.array([
    no_noise_matrix[no_noise_label == i].mean(axis=0)
    for i in range(1, dbscan_class + 1)  # labels were shifted, so clusters are 1..k
])
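# The K-means-seeded GMM above distills to this pattern: compute per-cluster
# centroids from an existing labelling and hand them to GaussianMixture as
# means_init, so EM starts at (and usually stays near) the K-means solution.
# Synthetic stand-in data; in the script above XYMatrix and cluster_labels
# play these roles.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

X_demo = np.random.RandomState(0).rand(300, 2)
km_labels = KMeans(n_clusters=3, random_state=0).fit_predict(X_demo)
seed_means = np.array([X_demo[km_labels == i].mean(axis=0) for i in range(3)])
gmm = GaussianMixture(n_components=3, means_init=seed_means, random_state=0)
gmm_labels = gmm.fit(X_demo).predict(X_demo)
print("agreement with k-means:", np.mean(gmm_labels == km_labels))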
def __init__(self,
             pulse_times_A,
             pulse_times_B,
             units_A=1,
             units_B=1,
             chunk_size=5,
             plot=False,
             raise_exception=True):
    '''Class for converting timestamps between two recording systems
    (e.g. pyControl and an ephys system) using sync pulses with random
    inter-pulse intervals recorded on both systems. Typically these sync
    pulses are generated by pyControl using the Rsync hardware object and
    sent to other systems. To use the Rsync_aligner, instantiate it by
    providing the sync pulse times recorded by each system. Timestamps from
    either system can then be converted into the reference frame of the
    other using the A_to_B and B_to_A methods. If the hardware systems use
    different units to measure time this must be specified using the units
    arguments when the aligner is instantiated. When the aligner is
    instantiated it works out which pulses in each reference frame
    correspond to each other by aligning short chunks of pulse sequence A
    with B, minimising the mean squared error between inter-pulse
    intervals.

    Arguments:

    pulse_times_A: The times when sync pulses occurred, recorded by
        hardware system A.
    pulse_times_B: The times when sync pulses occurred, recorded by
        hardware system B.
    units_A: The time units used by system A expressed in milliseconds,
        e.g. if system A uses units of seconds the *units_A* argument is
        1000.
    units_B: The time units used by system B expressed in milliseconds.
    chunk_size: The number of pulses per chunk used for alignment.
    plot: Whether to plot information about the alignment.
    raise_exception: If *True* an RsyncError exception is raised if no
        match is found between the sync pulse sequences.
    '''
    # Convert all units to ms.
    pulse_times_A = pulse_times_A * units_A
    pulse_times_B = pulse_times_B * units_B
    # Evaluate inter-pulse intervals.
    intervals_A = np.diff(pulse_times_A)  # Inter-pulse intervals for sequence A.
    intervals_B = np.diff(pulse_times_B)  # Inter-pulse intervals for sequence B.
    intervals_B2 = intervals_B**2
    # Find alignments of chunks which minimise the sum of squared errors.
    chunk_starts_A = np.arange(
        0, len(pulse_times_A) - chunk_size,
        chunk_size)  # Start indices of each chunk of sequence A.
    chunk_starts_B = np.zeros(
        chunk_starts_A.shape, int)  # Start indices of corresponding chunks in B.
    chunk_min_mse = np.zeros(
        chunk_starts_A.shape)  # Mean squared error for each chunk's best alignment.
    chunk_2nd_mse = np.zeros(
        chunk_starts_A.shape)  # Mean squared error for each chunk's second best (i.e. non-matching) alignment.
    ones_chunk = np.ones(chunk_size)
    for i, csA in enumerate(chunk_starts_A):
        chunk_A = intervals_A[csA:csA + chunk_size]
        mse = (np.correlate(intervals_B2, ones_chunk, mode='valid') +
               np.sum(chunk_A**2) - 2 * np.correlate(
                   intervals_B, chunk_A, mode='valid')) / chunk_size
        chunk_starts_B[i] = np.argmin(mse)
        sorted_chunk_min_mse = np.sort(mse)
        chunk_min_mse[i] = sorted_chunk_min_mse[0]
        chunk_2nd_mse[i] = sorted_chunk_min_mse[1]
    # Assign chunks to matched and non-matched groups by fitting a
    # 2-component Gaussian mixture model to the log mse distribution of
    # best + second best alignments.
    log_mse = np.log(np.hstack([chunk_min_mse, chunk_2nd_mse]))
    log_mse = log_mse[np.isfinite(log_mse)].reshape(-1, 1)
    gmm = GaussianMixture(n_components=2, covariance_type='spherical')
    gmm.fit(log_mse)
    valid_matches = gmm.predict(log_mse) == np.argmin(
        gmm.means_)  # True for chunks which are valid matches.
    # Make arrays of corresponding times.
    cor_times_A = np.full(
        pulse_times_B.shape,
        np.nan)  # A pulse times corresponding to each B pulse.
    cor_times_B = np.full(
        pulse_times_A.shape,
        np.nan)  # B pulse times corresponding to each A pulse.
    for csA, csB, valid in zip(chunk_starts_A, chunk_starts_B, valid_matches):
        if valid:
            cor_times_A[csB:csB + chunk_size] = pulse_times_A[csA:csA + chunk_size]
            cor_times_B[csA:csA + chunk_size] = pulse_times_B[csB:csB + chunk_size]
    # Store pulse times, their correspondences and units.
    self.pulse_times_A = pulse_times_A
    self.pulse_times_B = pulse_times_B
    self.cor_times_A = cor_times_A
    self.cor_times_B = cor_times_B
    self.units_A = units_A
    self.units_B = units_B
    # Check quality of alignment.
    separation_OK = (
        np.abs(gmm.means_[0] - gmm.means_[1])[0] >  # Difference between GMM means > 3 x sum of standard deviations.
        3 * np.sum(np.sqrt(gmm.covariances_)))
    order_OK = ((np.nanmin(np.diff(cor_times_A)) > 0)
                and (np.nanmin(np.diff(cor_times_B)) > 0)
                )  # Corresponding times are monotonically increasing.
    if not (separation_OK and order_OK):
        if raise_exception:
            raise RsyncError(
                'No match found between inter-pulse interval sequences.')
        else:
            print('Rsync warning: No match found between inter-pulse '
                  'interval sequences.')
    # Plotting
    if plot:
        plt.figure(plot if type(plot) == int else 1, figsize=[7, 9]).clf()
        plt.subplot2grid((3, 3), (0, 0), rowspan=1, colspan=2)
        plt.hist(log_mse[valid_matches], 20, color='b', label='Match')
        plt.hist(log_mse[~valid_matches], 20, color='r', label='Non-match')
        plt.legend(loc='upper center')
        plt.xlabel('Log mean squared error')
        plt.ylabel('# chunks')
        plt.subplot2grid((3, 3), (0, 2), rowspan=1, colspan=1)
        timing_errors = np.diff(cor_times_A) - np.diff(pulse_times_B)
        plt.hist(timing_errors[~np.isnan(timing_errors)], 20)
        plt.xlabel('Inter-pulse interval\ndiscrepancy (ms)')
        plt.ylabel('# pulses')
        plt.subplot2grid((3, 1), (1, 0), rowspan=2, colspan=1)
        plt.plot(pulse_times_A / units_A, cor_times_B / units_B, '.',
                 markersize=2)
        plt.xlim(pulse_times_A[0] / units_A, pulse_times_A[-1] / units_A)
        plt.xlabel('pulse times A')
        plt.ylabel('pulse times B')
        plt.tight_layout()
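# The match/non-match classification above boils down to: fit a
# two-component 1-D GMM to a bimodal distribution of log errors and assign
# each point to the lower-mean ("good") component. A standalone sketch with
# synthetic data:
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
log_mse_demo = np.hstack([rng.normal(-4, 0.5, 100),    # matched chunks
                          rng.normal(2, 0.5, 100)]     # mismatches
                         ).reshape(-1, 1)
gmm = GaussianMixture(n_components=2, covariance_type='spherical')
gmm.fit(log_mse_demo)
is_match = gmm.predict(log_mse_demo) == np.argmin(gmm.means_)
print("matched:", is_match.sum(), "of", len(is_match))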
print('Unimodal Gaussian Fit: Mean {:.4}, stdev {:.4}'.format(mu, sig))
plt.hist(data, bins='auto', alpha=.3, density=True)

##############################################################################
# As expected, the result is rather silly, since we are only fitting *one*
# of the two gaussians.

##############################################################################
# Fit Gaussian Mixture Model (GMM)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Assume the data is the sum of one or more gaussians. This easily handles
# the multidimensional case as well.

gmm = GaussianMixture(n_components=2, covariance_type='spherical')
gmm.fit(data)

mu1 = gmm.means_[0, 0]
mu2 = gmm.means_[1, 0]
var1, var2 = gmm.covariances_
wgt1, wgt2 = gmm.weights_
print(
    '''Fit:
    1: Mean {:.4}, var {:.4}, weight {:.4}
    2: Mean {:.4}, var {:.4}, weight {:.4}
'''.format(mu1, var1, wgt1, mu2, var2, wgt2)
)

plt.hist(data, bins='auto', alpha=.3, density=True)
plt.vlines((mu1, mu2), ymin=0, ymax=0.35, label='Fitted Means')
plt.plot(x, norm.pdf(x, mu1, np.sqrt(var1)))
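##############################################################################
# The line above draws only the first component's pdf; the full mixture
# density is the weight-blended sum of both components. A sketch using the
# same variable names as above:

plt.plot(x, wgt1 * norm.pdf(x, mu1, np.sqrt(var1))
            + wgt2 * norm.pdf(x, mu2, np.sqrt(var2)),
         label='Mixture pdf')
plt.legend()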
def GMM(distro, n_components=2):
    """Fit a 1-D Gaussian mixture to the samples in distro and return the
    fitted model."""
    GM = GaussianMixture(n_components=n_components)
    GM.fit(np.array(distro).reshape(-1, 1))
    return GM
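# Example usage of the helper above on an assumed bimodal sample; the fitted
# model exposes the usual GaussianMixture attributes.
import numpy as np

sample = np.hstack([np.random.normal(0, 1, 500), np.random.normal(5, 1, 500)])
gm = GMM(sample, n_components=2)
print(gm.means_.ravel(), gm.weights_)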
n_clusters = int(clustering_params[0])
n_iter = int(clustering_params[1])
thresh = float(clustering_params[2])
n_restarts = int(clustering_params[3])

# Make the data array to be put through the GMM - 5 features per spike:
# 3 PCs plus scaled energy and amplitude.
this_cluster = np.where(predictions == int(clusters[0]))[0]
n_pc = 3
data = np.zeros((len(this_cluster), n_pc + 2))
data[:, 2:] = pca_slices[this_cluster, :n_pc]
data[:, 0] = energy[this_cluster] / np.max(energy[this_cluster])
data[:, 1] = np.abs(amplitudes[this_cluster]) / np.max(np.abs(amplitudes[this_cluster]))

# Cluster the data
g = GaussianMixture(n_components=n_clusters, covariance_type='full',
                    tol=thresh, max_iter=n_iter, n_init=n_restarts)
g.fit(data)

# Show the cluster plots if the solution converged
if g.converged_:
    split_predictions = g.predict(data)
    x = np.arange(len(spike_waveforms[0]) // 10) + 1
    for cluster in range(n_clusters):
        split_points = np.where(split_predictions == cluster)[0]
        # plt.figure(cluster)
        # Waveforms and times from the chosen cluster
        slices_dejittered = spike_waveforms[this_cluster, :]
        times_dejittered = spike_times[this_cluster]
        # Waveforms and times from the chosen split of the chosen cluster
        times_dejittered = times_dejittered[split_points]
        # Inter-spike intervals in ms (30 kHz sampling rate)
        ISIs = np.ediff1d(np.sort(times_dejittered)) / 30.0
        violations1 = 100.0 * np.sum(ISIs < 1.0) / split_points.shape[0]
        violations2 = 100.0 * np.sum(ISIs < 2.0) / split_points.shape[0]
        fig, ax = blech_waveforms_datashader.waveforms_datashader(
            slices_dejittered[split_points, :], x)
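# The refractory-period check above as a small reusable helper: the
# percentage of inter-spike intervals shorter than a threshold. A sketch;
# the 30.0 divisor converts 30 kHz sample indices to ms, as in the script
# above.
import numpy as np

def isi_violation_pct(spike_times_samples, threshold_ms, sampling_khz=30.0):
    isis_ms = np.ediff1d(np.sort(spike_times_samples)) / sampling_khz
    return 100.0 * np.sum(isis_ms < threshold_ms) / len(spike_times_samples)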
with open("_caption.pickle", 'rb') as f: captions_ids_train, captions_ids_test = pickle.load(f) # images_train = np.array(images_train) # images_test = np.array(images_test) ###################################################### ## GMM ## images_test = np.array(images_test) images_test = images_test.reshape((189, 64 * 64 * 3)) total_components = 50 gmm = GaussianMixture(n_components=total_components, covariance_type='diag', verbose=5, max_iter=500) gmm.fit(images_test) ####################################################### ## image interpolation ## save_dir = "checkpoint" net_rnn_name = os.path.join(save_dir, 'net_rnn.npz') net_cnn_name = os.path.join(save_dir, 'net_cnn.npz') net_g_name = os.path.join(save_dir, 'net_g.npz') net_d_name = os.path.join(save_dir, 'net_d.npz') ni = int(np.ceil(np.sqrt(batch_size))) t_real_image = tf.placeholder('float32', [batch_size, image_size, image_size, 3], name='real_image') t_wrong_image = tf.placeholder('float32',