def test_DPGMM():
    #
    # random generator
    seed = 1337
    rand_gen = numpy.random.RandomState(seed)

    verbose = True

    #
    # loading a very simple dataset
    dataset_name = 'nltcs'
    train, valid, test = dataset.load_train_val_test_csvs(dataset_name)

    #
    # this is the max number of clusters for a truncated DP
    n_components = 100
    n_iters = 1000

    # 'spherical', 'tied', 'diag', 'full'. Defaults to 'diag'.
    cov_type = 'diag'

    # a higher alpha means more clusters,
    # as the expected number of clusters is alpha*log(N)
    concentration = 1.0

    dpgmm_c = mixture.DPGMM(n_components=n_components,
                            covariance_type=cov_type,
                            random_state=rand_gen,
                            n_iter=n_iters,
                            alpha=concentration,
                            verbose=verbose)

    #
    # fitting to training set
    fit_start_t = perf_counter()
    dpgmm_c.fit(train)
    fit_end_t = perf_counter()

    #
    # getting the cluster assignment
    pred_start_t = perf_counter()
    clustering = dpgmm_c.predict(train)
    pred_end_t = perf_counter()

    print('Clustering')
    print('for instances: ', clustering.shape[0])
    print(clustering)
    print('smallest cluster', numpy.min(clustering))
    print('biggest cluster', numpy.max(clustering))
    print('clustering done in', (fit_end_t - fit_start_t), 'secs')
    print('prediction done in', (pred_end_t - pred_start_t), 'secs')

    #
    # predicting probabilities
    pred_start_t = perf_counter()
    clustering_p = dpgmm_c.predict_proba(train)
    pred_end_t = perf_counter()
    print('prediction done in', (pred_end_t - pred_start_t), 'secs')
    print(clustering_p.shape[0], clustering_p.shape[1])
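# A minimal sketch of the same truncated-DP fit on scikit-learn >= 0.20,
# where mixture.DPGMM (deprecated in 0.18) has been removed. The array `X`
# below is a hypothetical stand-in for the `train` set loaded above.
import numpy as np
from sklearn.mixture import BayesianGaussianMixture

X = np.random.RandomState(1337).rand(500, 16)  # placeholder data

bgmm = BayesianGaussianMixture(
    n_components=100,                                    # truncation level
    covariance_type='diag',
    weight_concentration_prior_type='dirichlet_process',
    weight_concentration_prior=1.0,                      # plays the role of alpha
    max_iter=1000,
    random_state=1337)
bgmm.fit(X)
clustering = bgmm.predict(X)
clustering_p = bgmm.predict_proba(X)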
def latent_cluster_estimate(SAMObject, n_components=10, X=None, plot=True,
                            alpha=10, covariance_type='diag', which_indices=(0, 1)):
    """
    Use Dirichlet Process GMMs to cluster the latent space by automatically
    estimating an effective number of clusters.

    ARG SAMObject: The SAMObject to operate on.
    ARG n_components: The number of DPGMM components to use (i.e. the max
        number of clusters). Some components will switch off.
    ARG X: If None, we'll use the SAMObject's latent space, otherwise the
        provided one.
    ARG plot: Whether to plot the result or not.
    ARG alpha: The parameter for the stick-breaking process. In theory, a
        large alpha encourages more clusters, although in practice I haven't
        seen such behaviour.
    ARG covariance_type: See DPGMM from scikit-learn.
    ARG which_indices: If plotting, which indices to plot.

    RETURN Y_: The cluster assignments for each component in the latent
        space. This is not (0, 1, ..., n_clusters), but instead
        (0, 1, ..., n_components), so that switched-off components will not
        appear in Y_.
    """
    from sklearn import mixture
    if X is None:
        X = SAMObject._get_latent()
    # Fit a Dirichlet process mixture of Gaussians
    dpgmm = mixture.DPGMM(n_components=n_components,
                          covariance_type=covariance_type,
                          n_iter=5000, alpha=alpha)
    dpgmm.fit(X)
    Y_ = dpgmm.predict(X)
    if plot:
        import itertools
        color_iter = cm.rainbow(np.linspace(0, 1, 20))
        myperm = np.random.permutation(color_iter.shape[0])
        color_iter = color_iter[myperm, :]
        marker_iter = itertools.cycle((',', '+', '.', 'o', '*', 'v', 'x', '>'))
        splot = pb.subplot(1, 1, 1)
        for i, (mean, covar, color, marker) in enumerate(zip(
                dpgmm.means_, dpgmm._get_covars(), color_iter, marker_iter)):
            # as the method will not use every component it has access to
            # unless it needs it, we shouldn't plot the redundant components.
            # if not np.any(Y_ == i):
            #     continue
            pb.scatter(X[Y_ == i, which_indices[0]],
                       X[Y_ == i, which_indices[1]],
                       s=40, color=color, marker=marker)
        pb.legend(np.unique(Y_))
        pb.draw()
        pb.show()
    return Y_
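# A short sketch of reading off the effective number of clusters, assuming
# `dpgmm` and `Y_` come from a fit like latent_cluster_estimate above:
# switched-off components simply never appear among the assignments.
import numpy as np

active = np.unique(Y_)  # labels of the components the DP actually used
print('effective clusters: %d of %d components'
      % (active.size, dpgmm.n_components))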
def clustering_DPGMM(self, n_components, alpha):
    model = mixture.DPGMM(n_components=n_components, alpha=alpha, n_iter=1000)
    model.fit(self.embedding_)
    self.label = model.predict(self.embedding_)
    return self.label, model
def select(self):
    from sklearn import mixture
    X = self.input_mtrx[self.collection_ind()]
    self.est = mixture.DPGMM(n_components=3)
    self.est.fit(X)
    labels = self.est.predict(X)
    self.labels[self.collection_ind()] = labels
def cluster_changepoints_level1(self):
    print "Level1 : Clustering changepoints in Z(t)"

    if constants.REMOTE == 1:
        if self.fit_DPGMM:
            print "DPGMM L1 - start"
            # Previously, when L0 was GMM, alpha = 0.4
            print "L1 ", str(len(self.list_of_cp) / constants.DPGMM_DIVISOR_L1), " ALPHA ", 10
            dpgmm = mixture.DPGMM(n_components=int(len(self.list_of_cp) / 6),
                                  covariance_type='diag', n_iter=1000,
                                  alpha=10, thresh=1e-7)
            print "DPGMM L1 - end"
        if self.fit_GMM:
            gmm = mixture.GMM(n_components=self.n_components_L1,
                              covariance_type='full', n_iter=1000, thresh=5e-5)
            print "GMM L1 - end"
    elif constants.REMOTE == 2:
        gmm = mixture.GMM(n_components=self.n_components_L1, covariance_type='full')
    else:
        gmm = mixture.GMM(n_components=self.n_components_L1, covariance_type='full')

    if self.fit_GMM:
        gmm.fit(self.change_pts_Z)
        Y_gmm = gmm.predict(self.change_pts_Z)
        Y = Y_gmm

    if self.fit_DPGMM:
        Y = []
        i = 0
        while True:
            print "In DPGMM Fit loop"
            dpgmm.fit(self.change_pts_Z)
            Y = dpgmm.predict(self.change_pts_Z)
            if len(set(Y)) > 1:
                break
            i += 1
            if i > 100:
                break

    self.save_cluster_metrics(self.change_pts_Z, Y, 'level1')

    for i in range(len(Y)):
        label = constants.alphabet_map[Y[i] + 1]
        self.map_cp2level1[i] = label
        utils.dict_insert_list(label, i, self.map_level12cp)

    self.generate_l2_cluster_matrices()
def simple_stats_with_minutes():
    x = numpy.loadtxt(open(FILENAME, 'rb'), delimiter=",",
                      usecols=(7, 22, 23, 24, 25, 26, 27, 28), skiprows=1)
    dpgmm = mixture.DPGMM(n_iter=100, n_components=25)
    dpgmm.fit(x)
    return _output_results(dpgmm.predict(x))
def compute_similarity(F, bound_idxs, dirichlet=False, xmeans=False, k=5, offset=4):
    """Main function to compute the segment similarity of file file_struct.

    Parameters
    ----------
    F: np.ndarray
        Matrix containing one feature vector per row.
    bound_idxs: np.ndarray
        Array with the indices of the segment boundaries.
    dirichlet: boolean
        Whether to use the Dirichlet estimator of the number of unique labels.
    xmeans: boolean
        Whether to use the xmeans estimator of the number of unique labels.
    k: int > 0
        If the other two predictors are `False`, use a fixed number of labels.
    offset: int >= 0
        Number of frames to ignore from the beginning and end of each segment.

    Returns
    -------
    labels_est: np.ndarray
        Estimated labels, containing integer identifiers.
    """
    # Get the feature segments
    feat_segments = get_feat_segments(F, bound_idxs)

    # Get the 2D-FMC segments
    fmcs = feat_segments_to_2dfmc_max(feat_segments, offset)
    if len(fmcs) == 0:
        return np.arange(len(bound_idxs) - 1)

    # Compute the labels using k-means
    if dirichlet:
        k_init = np.min([fmcs.shape[0], k])
        # Only compute the Dirichlet method if the FMC shape is small enough
        if fmcs.shape[1] > 500:
            labels_est = compute_labels_kmeans(fmcs, k=k)
        else:
            dpgmm = mixture.DPGMM(n_components=k_init, covariance_type='full')
            # dpgmm = mixture.VBGMM(n_components=k_init, covariance_type='full')
            dpgmm.fit(fmcs)
            k = len(dpgmm.means_)
            labels_est = dpgmm.predict(fmcs)
            # print("Estimated with Dirichlet Process:", k)
    if xmeans:
        xm = XMeans(fmcs, plot=False)
        k = xm.estimate_K_knee(th=0.01, maxK=8)
        labels_est = compute_labels_kmeans(fmcs, k=k)
        # print("Estimated with Xmeans:", k)
    else:
        labels_est, wfmcs = compute_labels_kmeans(fmcs, k=k)

    return labels_est, wfmcs
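# A hedged alternative to `k = len(dpgmm.means_)` above: that length is the
# full truncation level, not the number of populated components. Counting
# distinct predicted labels gives the effective k (same assumptions as above).
labels_est = dpgmm.predict(fmcs)
k = len(np.unique(labels_est))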
def fitIGMM(self, obs, IsPlot=0):
    """
    Fit the Infinite Gaussian Mixture Model, falling back to a GMM where
    applicable.

    Input Parameters
    ----------
    obs: samples generated under the acquisition function by BGSS
    IsPlot: flag variable for visualization

    Returns
    -------
    mean vector: mu_1, ..., mu_K
    """
    if self.dim <= 2:
        n_init_components = 3
    else:
        n_init_components = np.int(self.dim * 1.1)

    dpgmm = mixture.DPGMM(n_components=n_init_components,
                          covariance_type="full", min_covar=1e-3)
    dpgmm.fit(obs)

    # check if DPGMM failed (no convergence or collapsed means); if so, use GMM
    mydist = euclidean_distances(dpgmm.means_, dpgmm.means_)
    np.fill_diagonal(mydist, 99)
    if dpgmm.converged_ is False or np.min(mydist) < (0.01 * self.dim):
        dpgmm = mixture.GMM(n_components=n_init_components,
                            covariance_type="full", min_covar=1e-5)
        dpgmm.fit(obs)

    # truncate for variational inference: keep the components that together
    # account for 70% of the mixture weight
    weight = dpgmm.weights_
    weight_sorted = np.sort(weight)
    weight_sorted = weight_sorted[::-1]
    temp_cumsum = np.cumsum(weight_sorted)

    cutpoint = 0
    for idx, val in enumerate(temp_cumsum):
        if val > 0.7:
            cutpoint = weight_sorted[idx]
            break

    ClusterIndex = [idx for idx, val in enumerate(dpgmm.weights_) if val >= cutpoint]
    myMeans = dpgmm.means_[ClusterIndex]
    # dpgmm.means_ = dpgmm.means_[ClusterIndex]
    dpgmm.truncated_means_ = dpgmm.means_[ClusterIndex]

    if IsPlot == 1 and self.dim <= 2:
        visualization.plot_histogram(self, obs)
        visualization.plot_mixturemodel(dpgmm, self, obs)

    new_X = myMeans.reshape((len(ClusterIndex), -1))
    new_X = new_X.tolist()
    return new_X
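# A standalone sketch of the truncation step in fitIGMM: keep the smallest
# set of components whose weights sum to at least `mass` (0.7 above). Pure
# NumPy; the helper name is illustrative, not from the original code.
import numpy as np

def truncate_components(weights, means, mass=0.7):
    order = np.argsort(weights)[::-1]        # heaviest components first
    cumsum = np.cumsum(weights[order])
    n_keep = int(np.searchsorted(cumsum, mass)) + 1
    keep = order[:n_keep]
    return means[keep], keep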
def clusterize_dirichlet(*args):
    """ Clustering and plotting with a Dirichlet process GMM """
    ### Clustering
    try:
        from sklearn import mixture
        from scipy import linalg
        import pylab as pl
        import matplotlib as mpl
        from sklearn.decomposition import PCA
    except ImportError:
        print "You need SciPy and scikit-learn"
        sys.exit(-1)

    models = []
    for arg in args:
        dpgmm = mixture.DPGMM(n_components=15, covariance_type='full')
        dpgmm.fit(arg)
        print dpgmm
        models.append(copy.deepcopy(dpgmm))
        print raw_input("any key to pass")

    ### Plotting
    color_iter = itertools.cycle(['r', 'g', 'b', 'c', 'm'])
    for i, (clf, data) in enumerate(zip(models, args)):
        pca = PCA(n_components=2)
        X_r = pca.fit(data).transform(data)
        splot = pl.subplot(len(args), 1, 1 + i)
        pl.scatter(X_r[:, 0], X_r[:, 1])
        #pl.title('PCA of unit types / numbers')
        Y_ = clf.predict(data)
        for i, (mean, covar, color) in enumerate(zip(clf.means_,
                                                     clf._get_covars(),
                                                     color_iter)):
            v, w = linalg.eigh(covar)
            u = w[0] / linalg.norm(w[0])
            # as the DP will not use every component it has access to
            # unless it needs it, we shouldn't plot the redundant
            # components.
            if not np.any(Y_ == i):
                continue
            pl.scatter(data[Y_ == i, 0], data[Y_ == i, 1], .8, color=color)
            # Plot an ellipse to show the Gaussian component
            angle = np.arctan(u[1] / u[0])
            angle = 180 * angle / np.pi  # convert to degrees
            ell = mpl.patches.Ellipse(mean, v[0], v[1], 180 + angle, color=color)
            ell.set_clip_box(splot.bbox)
            ell.set_alpha(0.5)
            splot.add_artist(ell)
        pl.xlim(0.0, 1.0)
        pl.ylim(0.0, 1.0)
        pl.xticks(())
        pl.yticks(())
    pl.title("Dirichlet process GMM")
    pl.show()
def advanced_stats_only():
    x = numpy.loadtxt(open(FILENAME, 'rb'), delimiter=",",
                      usecols=(7, 11, 12, 14, 15, 17, 18, 20, 21, 23, 24, 25, 26, 27),
                      skiprows=1)
    dpgmm = mixture.DPGMM(n_iter=100, n_components=25, alpha=1)
    dpgmm.fit(x)
    return create_player_groups(dpgmm.predict(x))
def all_relevant_stats():
    x = numpy.loadtxt(open(FILENAME, 'rb'), delimiter=",",
                      usecols=(7, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 22,
                               23, 24, 25, 26, 27, 28),
                      skiprows=1)
    dpgmm = mixture.DPGMM(n_iter=100, n_components=25)
    dpgmm.fit(x)
    return _output_results(dpgmm.predict(x))
def tune(self, pa=None, comm=None, divergence_threshold=1e10, verbose=0):
    """
    Tune the step...
    """
    if pa is None:
        raise RuntimeError('This step method works only in pysmc.')
    ac = self.get_acceptance_rate(comm=comm)
    if ac == -1:
        return False
    self.reset_counters()
    if (self._tuned and ac >= self.adapt_lower_ac_rate and
            ac <= self.adapt_upper_ac_rate):
        return False
    use_mpi = comm is not None
    if use_mpi:
        rank = comm.Get_rank()
        size = comm.Get_size()
    else:
        rank = 0
        size = 1
    pa = pa.gather()
    # Only the root should train the mixture
    if rank == 0:
        pa.resample()
        data = [pa.particles[i]['stochastics'][self.stochastic.__name__]
                for i in range(pa.num_particles)]
        data = np.array(data, dtype='float')
        if data.ndim == 1:
            data = np.atleast_2d(data).T
        self._gmm = mixture.DPGMM(n_components=self.n_components,
                                  covariance_type=self.covariance_type,
                                  n_iter=self.n_iter)
        self.gmm.fit(data)
        Y_ = self.gmm.predict(data)
        # Count the components the DPGMM actually used, then refit a plain
        # GMM with that many components.
        n_comp = 0
        for i in range(self.n_components):
            if np.any(Y_ == i):
                n_comp += 1
        self._gmm = mixture.GMM(n_components=n_comp,
                                covariance_type=self.covariance_type,
                                n_iter=self.n_iter)
        self.gmm.fit(data)
        Y_ = self.gmm.predict(data)
        if verbose >= 2:
            for i, (mean, covar) in enumerate(
                    zip(self.gmm.means_, self.gmm._get_covars())):
                if not np.any(Y_ == i):
                    continue
                print(('\n', mean, covar))
    if use_mpi:
        self._gmm = comm.bcast(self._gmm)
    self.gmm.covars_ = self.gmm._get_covars()
    self._tuned = True
    return True
def do_dpgmm(mat, n_components):
    log.info("Using the Dirichlet Process Gaussian Mixture Model")
    log.info("Design matrix size %s. Requested components: %s",
             mat.shape, n_components)
    t0 = time.time()
    dpgmm = mixture.DPGMM(n_components=n_components,
                          covariance_type='tied', alpha=0.5)
    dpgmm.fit(mat)
    labels = dpgmm.predict(mat)
    logprobs, responsibilities = dpgmm.eval(mat)
    tf = time.time()
    log.info("Time: %s s.", tf - t0)
    return logprobs, responsibilities
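# `eval` was renamed to `score_samples` around scikit-learn 0.14, so on
# releases where the call above fails, a sketch of the equivalent line
# (same pair of per-sample log probabilities and responsibilities):
logprobs, responsibilities = dpgmm.score_samples(mat)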
def build_dpgmm(k):
    print 'building ' + gmm_type + ' Dirichlet GMM with k<=' + str(k) + ' components'
    gmm = mixture.DPGMM(n_components=k, covariance_type=gmm_type, alpha=1,
                        thresh=0.001, min_covar=0.001, n_iter=500,
                        params='wmc', init_params='wmc')
    return gmm
def gmm(input_file, Output):
    lvltrace.lvltrace("LVLEntree dans gmm unsupervised")
    print "#########################################################################################################\n"
    print "GMM"
    print "#########################################################################################################\n"
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]
    y = data[:, 0]
    n_samples, n_features = X.shape

    # Fit a mixture of Gaussians with EM using five components
    gmm = mixture.GMM(n_components=5, covariance_type='spherical',
                      init_params='wmc')
    gmm.fit(X)

    # Fit a Dirichlet process mixture of Gaussians using five components
    dpgmm = mixture.DPGMM(n_components=5, covariance_type='spherical',
                          init_params='wmc')
    dpgmm.fit(X)

    color_iter = itertools.cycle(['r', 'g', 'b', 'c', 'm', 'b', 'g', 'r',
                                  'c', 'm', 'y', 'k', 'b', 'g', 'r', 'c',
                                  'm', 'y', 'k', 'b', 'g', 'r', 'c', 'm',
                                  'y', 'k', 'b', 'g', 'r', 'c', 'm', 'y', 'k'])

    for i, (clf, title) in enumerate([(gmm, 'GMM'),
                                      (dpgmm, 'Dirichlet Process GMM')]):
        splot = pl.subplot(2, 1, 1 + i)
        Y_ = clf.predict(X)
        for i, (mean, covar, color) in enumerate(zip(clf.means_,
                                                     clf._get_covars(),
                                                     color_iter)):
            v, w = linalg.eigh(covar)
            u = w[0] / linalg.norm(w[0])
            # as the DP will not use every component it has access to
            # unless it needs it, we shouldn't plot the redundant
            # components.
            if not np.any(Y_ == i):
                continue
            pl.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)
            # Plot an ellipse to show the Gaussian component
            angle = np.arctan(u[1] / u[0])
            angle = 180 * angle / np.pi  # convert to degrees
            ell = mpl.patches.Ellipse(mean, v[0], v[1], 180 + angle, color=color)
            ell.set_clip_box(splot.bbox)
            ell.set_alpha(0.5)
            splot.add_artist(ell)
        pl.xticks(())
        pl.yticks(())
        pl.title(title)
    save = Output + "gmm.png"
    plt.savefig(save)
    lvltrace.lvltrace("LVLSortie dans gmm unsupervised")
def cluster_changepoints(self):
    """
    Clusters changepoints specified in self.list_of_cp.
    """
    print "Clustering changepoints..."
    print "L1 ", str(len(self.list_of_cp) / constants.DPGMM_DIVISOR_L1), " ALPHA: ", self.ALPHA_L1

    if constants.REMOTE == 1:
        if self.fit_DPGMM:
            dpgmm = mixture.DPGMM(n_components=int(len(self.list_of_cp) / constants.DPGMM_DIVISOR_L1),
                                  covariance_type='diag', n_iter=10000,
                                  alpha=self.ALPHA_L1, thresh=1e-4)
        if self.fit_GMM:
            gmm = mixture.GMM(n_components=self.n_components_L1,
                              covariance_type='full', n_iter=5000, thresh=0.01)
    elif constants.REMOTE == 2:
        gmm = mixture.GMM(n_components=self.n_components_L1,
                          covariance_type='full', thresh=0.01)
    else:
        gmm = mixture.GMM(n_components=self.n_components_L1, covariance_type='full')

    if self.fit_GMM:
        gmm.fit(self.changepoints)
        predictions_gmm = gmm.predict(self.changepoints)
        print "L1: Clusters in GMM", len(set(predictions_gmm))
        predictions = predictions_gmm

    if self.fit_DPGMM:
        predictions = []
        while True:
            print "Inside loop"
            dpgmm.fit(self.changepoints)
            predictions = dpgmm.predict(self.changepoints)
            if len(set(predictions)) > 1:
                break
        print "L1: Clusters in DP-GMM", len(set(predictions))

    for i in range(len(predictions)):
        label = constants.alphabet_map[predictions[i] + 1]
        self.map_cp2cluster[i] = label
        utils.dict_insert_list(label, i, self.map_level1_cp)
        demonstration = self.map_cp2demonstrations[i]
        frm = self.map_cp2frm[i]
        try:
            surgeme = self.map_frm2surgeme[demonstration][frm]
        except KeyError as e:
            print e
            sys.exit()
        utils.print_and_write(("%3d %s %s %3d %3d\n"
                               % (i, label, demonstration, frm, surgeme)), self.log)
def fit_mixture(glcm, n_components, max_components=4, type='gmm'):
    print 'preparing data ...',
    data = data_from_glcm(glcm)
    print 'done'

    print 'fitting %s ...' % type,
    if type == 'dpgmm':
        gmm = mixture.DPGMM(n_components=n_components,
                            covariance_type='spherical', alpha=0.1)
    elif type == 'gmm':
        if n_components == 0:
            aics = []
            n_comps = range(1, max_components + 1)
            print 'searching for optimal number of components: ',
            for i in n_comps:
                gmm = mixture.GMM(n_components=i, covariance_type='spherical')
                gmm.fit(data)
                aic = gmm.aic(data)
                print '(%i, %.2f)' % (i, aic),
                aics.append(aic)
            best = n_comps[np.argmin(np.array(aics))]
            print ' -> %i' % best
            gmm = mixture.GMM(n_components=best, covariance_type='spherical')
        else:
            gmm = mixture.GMM(n_components=n_components, covariance_type='spherical')
    else:
        raise ValueError('Wrong mixture type. Allowed values: dpgmm, gmm')

    gmm.fit(data)
    print 'done'
    print 'means:'
    print gmm.means_

    print 'predicting %s ...' % type,
    y_pred = gmm.predict(data)
    glcm_labs = np.zeros(glcm.shape)
    for x, y in zip(data, y_pred):
        glcm_labs[tuple(x)] = y + 1
    print 'done'

    plt.figure()
    plt.subplot(121), plt.imshow(glcm, 'gray', interpolation='nearest')
    plt.subplot(122), plt.imshow(glcm_labs, 'jet', interpolation='nearest')
    plt.show()
def classification_dp_gmm(sample=700):
    train_data_set, train_labels, test_data_set, test_labels = generate_train_data(sample)

    # Fit a Dirichlet process mixture of Gaussians
    # dpgmm = mixture.DPGMM(n_components=5, covariance_type='full', n_iter=20)
    dpgmm = mixture.DPGMM(n_components=2, covariance_type='diag', n_iter=10)
    # dpgmm = mixture.VBGMM(n_components=2, covariance_type='diag', n_iter=30)
    dpgmm.fit(train_data_set)

    # print 'train accuracy'
    y_train_pred = dpgmm.predict(train_data_set)
    train_accuracy = np.mean(y_train_pred.ravel() == train_labels.ravel())
    print train_accuracy,
    print ',',

    # print 'test accuracy'
    y_test_pred = dpgmm.predict(test_data_set)
    test_accuracy = np.mean(y_test_pred.ravel() == test_labels.ravel())
    print test_accuracy
def compute_similarity(PCP, bound_idxs, dirichlet=False, xmeans=False, k=5):
    """Main function to compute the segment similarity of file file_struct."""
    # Get PCP segments
    pcp_segments = get_pcp_segments(PCP, bound_idxs)

    # Get the 2D-FMC segments
    fmcs = pcp_segments_to_2dfmc_max(pcp_segments)
    if len(fmcs) == 0:
        return np.arange(len(bound_idxs) - 1)

    # Compute the labels using k-means
    if dirichlet:
        k_init = np.min([fmcs.shape[0], k])
        # Only compute the Dirichlet method if the FMC shape is small enough
        if fmcs.shape[1] > 500:
            labels_est = compute_labels_kmeans(fmcs, k=k)
        else:
            dpgmm = mixture.DPGMM(n_components=k_init, covariance_type='full')
            #dpgmm = mixture.VBGMM(n_components=k_init, covariance_type='full')
            dpgmm.fit(fmcs)
            k = len(dpgmm.means_)
            labels_est = dpgmm.predict(fmcs)
            #print "Estimated with Dirichlet Process:", k
    if xmeans:
        xm = XMeans(fmcs, plot=False)
        k = xm.estimate_K_knee(th=0.01, maxK=8)
        labels_est = compute_labels_kmeans(fmcs, k=k)
        #print "Estimated with Xmeans:", k
    else:
        labels_est = compute_labels_kmeans(fmcs, k=k)

    # Plot results
    #plot_pcp_wgt(PCP, bound_idxs)

    return labels_est
import numpy as np
from scipy import linalg
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import mixture
import scipy.io as sio

np.set_printoptions(threshold=np.nan)

mat_contents = sio.loadmat('Euclid.mat')
#print mat_contents.keys()
#print mat_contents['Euclid_matrix'].shape
row = mat_contents['Euclid_matrix'][:, 100]

dpgmm = mixture.DPGMM(n_components=6, covariance_type='diag', alpha=10,
                      n_iter=100, verbose=1, thresh=0.0001)
dpgmm.fit(row)
Y = dpgmm.predict(row)
Y_unique = np.unique(Y)
for point in Y_unique:
    print point
    print np.mean(row[Y == point])
print dpgmm.get_params()
print dpgmm.n_components
print dpgmm.precs_
#print Y
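# A hedged fix if this script is run on a stricter scikit-learn: `row` is
# one-dimensional, and estimators expect a 2-D (n_samples, n_features)
# array, so reshape the slice into an explicit single-feature matrix.
row = mat_contents['Euclid_matrix'][:, 100].reshape(-1, 1)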
from sklearn import mixture
import itertools
import numpy as np
from scipy import linalg
import matplotlib.pyplot as plt
import matplotlib as mpl

color_iter = itertools.cycle(['r', 'g', 'b', 'c', 'm'])

FILENAME = "mcdonalds-normalized-data-clean.tsv"

# Note: you'll have to remove the last "name" column in the file (or
# some other such thing), so that all the columns are numeric.
X = np.loadtxt(open(FILENAME, "rb"), delimiter="\t", skiprows=1)

dpgmm = mixture.DPGMM(n_components=25)
dpgmm.fit(X)
clusters = dpgmm.predict(X)

classes = [[] for i in range(25)]
for i, c in enumerate(clusters):
    classes[c].append(i)

with open('mcdonalds-normalized-data-names.tsv') as f:
    names = f.read().split('\n')[1:]

with open('chen_out', 'w') as f:
    f.write('\n\n\n'.join('\n'.join(names[i] for i in cc) for cc in classes))

for i, (clf, title) in enumerate([(dpgmm, 'Dirichlet Process GMM')]):
    splot = plt.subplot(1, 1, 1 + i)
    Y_ = clf.predict(X)
# Number of samples per component
n_samples = 500

# Generate random sample, two components
np.random.seed(0)
C = np.array([[0., -0.1], [1.7, .4]])
X = np.r_[np.dot(np.random.randn(n_samples, 2), C),
          .7 * np.random.randn(n_samples, 2) + np.array([-6, 3])]

# Fit a mixture of Gaussians with EM using five components
gmm = mixture.GMM(n_components=5, covariance_type='full')
gmm.fit(X)

# Fit a Dirichlet process mixture of Gaussians using five components
dpgmm = mixture.DPGMM(n_components=5, covariance_type='full')
dpgmm.fit(X)

color_iter = itertools.cycle(['r', 'g', 'b', 'c', 'm'])

for i, (clf, title) in enumerate([(gmm, 'GMM'),
                                  (dpgmm, 'Dirichlet Process GMM')]):
    splot = pl.subplot(2, 1, 1 + i)
    Y_ = clf.predict(X)
    for i, (mean, covar, color) in enumerate(zip(clf.means_,
                                                 clf._get_covars(),
                                                 color_iter)):
        v, w = linalg.eigh(covar)
        u = w[0] / linalg.norm(w[0])
        # as the DP will not use every component it has access to
        # unless it needs it, we shouldn't plot the redundant
        # components.
def cluster_changepoints_level2(self):
    print "Level2 : Clustering changepoints in W(t)"

    mkdir_path = constants.PATH_TO_CLUSTERING_RESULTS + self.trial
    os.mkdir(mkdir_path)
    # To put frames of milestones
    os.mkdir(mkdir_path + "/" + "milestones")

    self.file = open(mkdir_path + "/" + self.trial + "clustering.txt", "wb")
    self.metrics_picklefile = mkdir_path + "/" + self.trial + "metrics.p"

    line = self.featfile + "\n\n"
    self.file.write(line)
    line = "L1 Cluster L2 Cluster Demonstration Frame# CP# Surgeme\n"
    self.file.write(line)

    print "---Checking data representativeness ---"
    for key in sorted(self.map_level12cp.keys()):
        mkdir_l1_cluster = mkdir_path + "/" + key
        list_of_cp_key = self.map_level12cp[key]
        if self.check_pruning_condition(list_of_cp_key):
            continue
        os.mkdir(mkdir_l1_cluster)
    print "--- ---"

    for key in sorted(self.map_level12cp.keys()):
        matrix = self.l2_cluster_matrices[key]
        list_of_cp_key = self.map_level12cp[key]

        if self.check_pruning_condition(list_of_cp_key):
            self.pruned_L1_clusters.append(key)
            del self.map_level12cp[key]
            print "Pruned"
            for pruned_cp in list_of_cp_key:
                # print "Pruned: " + str(key) + " " + str(pruned_cp) + " " + str(self.map_cp2demonstrations[pruned_cp])
                self.list_of_cp.remove(pruned_cp)
            continue

        if constants.REMOTE == 1:
            gmm = mixture.GMM(n_components=min(self.n_components_L2, matrix.shape[0]),
                              covariance_type='full', n_iter=10000, thresh=5e-5)
            # Alpha didn't change between using GMM or DP-GMM for L0
            print "L2 ", str(int(np.ceil(len(list_of_cp_key) / 2.0))), " ALPHA ", 1
            dpgmm = mixture.DPGMM(n_components=int(np.ceil(len(list_of_cp_key) / 2.0)),
                                  covariance_type='diag', n_iter=1000,
                                  alpha=1, thresh=1e-7)
        elif constants.REMOTE == 2:
            gmm = mixture.GMM(n_components=self.n_components_L2, covariance_type='full')
        else:
            gmm = mixture.GMM(n_components=self.n_components_L2, covariance_type='full')

        try:
            if self.fit_GMM:
                gmm.fit(matrix)
                Y = gmm.predict(matrix)
            if self.fit_DPGMM:
                dpgmm.fit(matrix)
                Y = dpgmm.predict(matrix)
        except ValueError as e:
            print "ValueError"
            continue

        self.save_cluster_metrics(matrix, Y, 'level2_' + str(key), level2_mode=True)

        for i in range(len(Y)):
            cp = list_of_cp_key[i]
            l1_cluster = key
            l2_cluster = Y[i]
            milestone = l1_cluster + "_" + str(l2_cluster)
            demonstration = self.map_cp2demonstrations[cp]
            try:
                frm = self.map_cp2frm[cp]
                surgeme = self.map_frm2surgeme[demonstration][frm]
            except KeyError as e:
                print e
                sys.exit()

            self.map_cp2milestones[cp] = milestone
            self.file.write("%s %3d %s %3d %3d %3d\n"
                            % (l1_cluster, l2_cluster, demonstration, frm, cp, surgeme))

            if constants.REMOTE == 0:
                self.copy_frames(demonstration, frm, str(l1_cluster), str(l2_cluster), surgeme)

        if constants.REMOTE == 0:
            self.copy_milestone_frames(matrix, list_of_cp_key, gmm)
def generate_change_points_2(self):
    """
    Generates changepoints by clustering across demonstrations.
    """
    cp_index = 0
    i = 0
    big_N = None
    map_index2demonstration = {}
    map_index2frm = {}
    size_of_X = self.data_X_size[self.list_of_demonstrations[0]]

    for demonstration in self.list_of_demonstrations:
        print demonstration
        N = self.data_N[demonstration]
        start, end = utils.get_start_end_annotations(constants.PATH_TO_DATA +
            constants.ANNOTATIONS_FOLDER + demonstration + "_" + constants.CAMERA + ".p")
        for j in range(N.shape[0]):
            map_index2demonstration[i] = demonstration
            map_index2frm[i] = start + j * self.sr
            i += 1
        big_N = utils.safe_concatenate(big_N, N)

    print "Generated big_N"

    if constants.REMOTE == 1:
        if self.fit_GMM:
            gmm = mixture.GMM(n_components=self.n_components_cp,
                              covariance_type='full', thresh=0.01)
        if self.fit_DPGMM:
            # dpgmm = mixture.DPGMM(n_components = 100, covariance_type='diag', n_iter = 10000, alpha = 100, thresh= 2e-4)
            # DO NOT FIDDLE WITH PARAMS WITHOUT CONSENT :)
            avg_len = int(big_N.shape[0] / len(self.list_of_demonstrations))
            # tuned with suturing experts only for kinematics
            DP_GMM_COMPONENTS = int(avg_len / constants.DPGMM_DIVISOR)
            print "L0 ", DP_GMM_COMPONENTS, "ALPHA: ", constants.ALPHA_ZW_CP
            dpgmm = mixture.DPGMM(n_components=DP_GMM_COMPONENTS,
                                  covariance_type='diag', n_iter=1000,
                                  alpha=constants.ALPHA_ZW_CP, thresh=1e-7)
    elif constants.REMOTE == 2:
        gmm = mixture.GMM(n_components=self.n_components_cp, covariance_type='full')
    else:
        gmm = mixture.GMM(n_components=self.n_components_cp, covariance_type='full')

    if self.fit_GMM:
        start = time.time()
        gmm.fit(big_N)
        end = time.time()
        print "GMM time taken: ", str(end - start)
        Y_gmm = gmm.predict(big_N)
        print "L0: Clusters in GMM", len(set(Y_gmm))
        Y = Y_gmm

    if self.fit_DPGMM:
        start = time.time()
        dpgmm.fit(big_N)
        end = time.time()
        print "DP-GMM time taken: ", str(end - start)
        Y_dpgmm = dpgmm.predict(big_N)
        Y = Y_dpgmm
        print "L0: Clusters in DP-GMM", len(set(Y_dpgmm))

    for w in range(len(Y) - 1):
        if Y[w] != Y[w + 1]:
            change_pt = big_N[w][:size_of_X]
            self.append_cp_array(change_pt)
            self.map_cp2frm[cp_index] = map_index2frm[w]
            self.map_cp2demonstrations[cp_index] = map_index2demonstration[w]
            self.list_of_cp.append(cp_index)
            cp_index += 1

    print "Done with generating change points", len(self.list_of_cp)
np.random.seed(0)
X = np.zeros((n_samples, 2))
step = 4 * np.pi / n_samples

for i in xrange(X.shape[0]):
    x = i * step - 6
    X[i, 0] = x + np.random.normal(0, 0.1)
    X[i, 1] = 3 * (np.sin(x) + np.random.normal(0, .2))

color_iter = itertools.cycle(['r', 'g', 'b', 'c', 'm'])

for i, (clf, title) in enumerate([
        (mixture.GMM(n_components=10, covariance_type='full', n_iter=100),
         "Expectation-maximization"),
        (mixture.DPGMM(n_components=10, covariance_type='full', alpha=0.01, n_iter=100),
         "Dirichlet Process,alpha=0.01"),
        (mixture.DPGMM(n_components=10, covariance_type='diag', alpha=100., n_iter=100),
         "Dirichlet Process,alpha=100.")]):
    clf.fit(X)
    splot = pl.subplot(3, 1, 1 + i)
    Y_ = clf.predict(X)
    for i, (mean, covar, color) in enumerate(zip(clf.means_,
                                                 clf._get_covars(),
                                                 color_iter)):
        v, w = linalg.eigh(covar)
        u = w[0] / linalg.norm(w[0])
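# An illustrative sketch (numbers not from the source) of why the two alpha
# settings above behave so differently: the expected number of DP clusters
# grows roughly as alpha * log(N), capped here by the truncation at 10.
import numpy as np

N = 500
for alpha in (0.01, 100.0):
    print('alpha=%g -> expected clusters ~ %.1f' % (alpha, alpha * np.log(N)))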
                labels_dist[j] += 1
    bic.append(gmm.bic(data_use))
    if bic[-1] < lowest_bic:
        lowest_bic = bic[-1]
        best_gmm = gmm
    print n_components, "labels:", labels_unique, "labels_dist", labels_dist

clf = best_gmm
for i in range(len(bic)):
    print([i, bic[i], logl[i]])

if method == 'dpgmm':
    alpha_range = [0.001, 0.01, 0.1, 1., 10., 100., 1000., 1e6]
    n_clusters_max = 100
    for alpha in alpha_range:
        # Fit a mixture of Gaussians with a Dirichlet process prior
        dpgmm = mixture.DPGMM(n_components=n_clusters_max,
                              covariance_type='full', alpha=alpha, n_iter=1000)
        dpgmm.fit(data_use)
        labels_predict = dpgmm.predict(data_use)
        labels_unique = np.unique(labels_predict)
        # Counting the number of samples belonging to each cluster
        labels_dist = [0] * len(labels_unique)
        for i in range(len(labels_predict)):
            for j in range(len(labels_unique)):
                if labels_predict[i] == labels_unique[j]:
                    labels_dist[j] += 1
        print(["alpha: ", alpha, "labels: ", labels_unique,
               "labels_dist: ", labels_dist])
    print(["Upper bound for the number of clusters: ", n_clusters_max])
# Number of samples per component
n_samples = 500

# Generate random sample, two components
np.random.seed(0)
C = np.array([[0., -0.1], [1.7, .4]])
X = np.r_[np.dot(np.random.randn(n_samples, 2), C),
          .7 * np.random.randn(n_samples, 2) + np.array([-6, 3])]

# Fit a mixture of Gaussians with EM using five components
gmm = mixture.GMM(n_components=5, covariance_type='full')
gmm.fit(X)

# Fit a Dirichlet process mixture of Gaussians using five components
dpgmm = mixture.DPGMM(n_components=5, covariance_type='full')
dpgmm.fit(X)

color_iter = itertools.cycle(['navy', 'c', 'cornflowerblue', 'gold', 'darkorange'])

for i, (clf, title) in enumerate([(gmm, 'GMM'),
                                  (dpgmm, 'Dirichlet Process GMM')]):
    splot = plt.subplot(2, 1, 1 + i)
    Y_ = clf.predict(X)
    for i, (mean, covar, color) in enumerate(zip(clf.means_,
                                                 clf._get_covars(),
                                                 color_iter)):
        v, w = linalg.eigh(covar)
        u = w[0] / linalg.norm(w[0])
        # as the DP will not use every component it has access to
lowest_bic = np.infty
bic = []
n_components_range = range(1, 3)
cv_types = ['spherical']

for cv_type in cv_types:
    for n_components in n_components_range:
        # Fit a mixture of Gaussians with EM
        gmm = mixture.GMM(n_components=n_components, covariance_type=cv_type)
        gmm.fit(X)
        bic.append(gmm.bic(X))
        if bic[-1] < lowest_bic:
            lowest_bic = bic[-1]
            best_gmm = gmm

best_gmm = mixture.DPGMM(n_components=4)
best_gmm.fit(X)

elapsed = int(round(time.time() * 1000)) - currTime
print best_gmm
print "ELAPSED: " + str(elapsed)

bic = np.array(bic)
color_iter = itertools.cycle(['k', 'r', 'g', 'b', 'c', 'm', 'y'])
clf = best_gmm
bars = []

# Plot the BIC scores
spl = plt.subplot(2, 1, 1)
for i, (cv_type, color) in enumerate(zip(cv_types, color_iter)):
def glcm_dpgmm(img):
    # deriving glcm
    mask = (img > 0) * (img < 255)
    glcm = tools.graycomatrix_3D(img, mask=mask)
    # inds = tuple(inds.flatten())

    # processing glcm
    # glcm_gc = skimor.closing(glcm, selem=skimor.disk(1))
    # glcm_go = skimor.opening(glcm, selem=skimor.disk(1))
    # plt.figure()
    # plt.subplot(131), plt.imshow(glcm, 'gray', interpolation='nearest'), plt.title('glcm')
    # plt.subplot(132), plt.imshow(glcm_gc, 'gray', interpolation='nearest'), plt.title('glcm_gc')
    # plt.subplot(133), plt.imshow(glcm_go, 'gray', interpolation='nearest'), plt.title('glcm_go')

    # thresholding glcm
    c_t = 4
    thresh = c_t * np.mean(glcm)
    glcm_t = glcm > thresh
    glcm_to = skimor.binary_closing(glcm_t, selem=skimor.disk(3))
    glcm_to = skimor.binary_opening(glcm_to, selem=skimor.disk(3))

    # tools.blob_from_gcm(glcm_to, img, return_rvs=True, show=True, show_now=False)
    #
    # labs_im, num = skimea.label(glcm_to, return_num=True)
    #
    # labels = np.unique(labs_im)[1:]
    # for l in labels:
    #     tmp = glcm * (labs_im == l)
    #     fit_mixture(tmp, n_components=0, type='gmm')

    # synthetic glcm
    # glcm = np.array([[0, 1, 1, 2, 0, 0, 0, 0],
    #                  [1, 2, 2, 3, 1, 0, 0, 1],
    #                  [1, 3, 4, 2, 1, 0, 0, 0],
    #                  [0, 1, 3, 1, 0, 0, 0, 1],
    #                  [1, 0, 0, 0, 0, 0, 1, 3],
    #                  [0, 2, 0, 0, 0, 2, 2, 1],
    #                  [0, 0, 0, 0, 1, 3, 4, 0],
    #                  [0, 0, 0, 0, 1, 2, 0, 0]])

    # dpgmm
    glcm_o = glcm.copy()
    # glcm = glcm_go * glcm_to
    # glcm = glcm_go
    # glcm = glcm_gc
    glcm = glcm * glcm_to
    data = data_from_glcm(glcm)

    # fitting DPGMM
    # print 'fitting DPGMM ...'
    # types = ['spherical', 'tied', 'diag', 'full']
    # n_comps = range(2, 11)
    # # n_comps = range(2, 4)
    # aics = np.zeros((len(types), len(n_comps)))
    # bics = np.zeros((len(types), len(n_comps)))
    # scores = np.zeros((len(types), len(n_comps)))
    # for i, type in enumerate(types):
    #     print '\nTYPE:', type
    #     for j, n in enumerate(n_comps):
    #         # dpgmm = mixture.DPGMM(n_components=6, covariance_type='tied', alpha=0.100)
    #         dpgmm = mixture.GMM(n_components=n, covariance_type=type)
    #         dpgmm.fit(data)
    #         aic = dpgmm.aic(data)
    #         bic = dpgmm.bic(data)
    #         score = dpgmm.score(data).mean()
    #         # aics.append(aic)
    #         # bics.append(bic)
    #         # scores.append(score)
    #         aics[i, j] = aic
    #         bics[i, j] = bic
    #         scores[i, j] = score
    #         print 'n_comps=%i, score=%.2f, aic=%.2f, bic=%.2f' % (n, score, aic, bic)
    #
    # plt.figure()
    # color_iter = itertools.cycle(['r', 'g', 'b', 'c', 'm', 'y', 'k'])
    # for aic, color in zip(aics, color_iter):
    #     plt.plot(n_comps, aic, color + '-')
    # plt.legend(types)
    # plt.title('aic')
    #
    # plt.figure()
    # color_iter = itertools.cycle(['r', 'g', 'b', 'c', 'm', 'y', 'k'])
    # for bic, color in zip(bics, color_iter):
    #     plt.plot(n_comps, bic, color + '-')
    # plt.legend(types)
    # plt.title('bic')
    #
    # plt.figure()
    # color_iter = itertools.cycle(['r', 'g', 'b', 'c', 'm', 'y', 'k'])
    # for score, color in zip(scores, color_iter):
    #     plt.plot(n_comps, score, color + '-')
    # plt.legend(types)
    # plt.title('scores')

    print 'fitting DPGMM ...',
    # dpgmm = mixture.GMM(n_components=3, covariance_type='tied')
    dpgmm = mixture.DPGMM(n_components=6, covariance_type='tied', alpha=1.)
    dpgmm.fit(data)
    print 'done'
    print 'means:'
    print dpgmm.means_

    # dpgmm = mixture.GMM(n_components=3, covariance_type='tied')
    # dpgmm.fit(data)
    # print 'n_comps=3, score=%.2f, aic=%.2f, bic=%.2f' % (dpgmm.score(data).mean(), dpgmm.aic(data), dpgmm.bic(data))
    # dpgmm = mixture.GMM(n_components=4, covariance_type='tied')
    # dpgmm.fit(data)
    # print 'n_comps=4, score=%.2f, aic=%.2f, bic=%.2f' % (dpgmm.score(data).mean(), dpgmm.aic(data), dpgmm.bic(data))
    # dpgmm = mixture.GMM(n_components=5, covariance_type='tied')
    # dpgmm.fit(data)
    # print 'n_comps=5, score=%.2f, aic=%.2f, bic=%.2f' % (dpgmm.score(data).mean(), dpgmm.aic(data), dpgmm.bic(data))

    # predicting DPGMM
    print 'predicting DPGMM ...',
    data = data_from_glcm(glcm_o)
    y_pred = dpgmm.predict(data)
    glcm_labs = np.zeros(glcm.shape, dtype=np.uint8)
    for x, y in zip(data, y_pred):
        glcm_labs[tuple(x)] = int(y + 1)
    print 'done'

    # glcm_labs += 10
    inds = np.argsort(dpgmm.means_.mean(axis=1))
    glcm_labs2 = glcm_labs.copy()
    for i, l in enumerate(inds):
        glcm_labs2 = np.where(glcm_labs == l + 1, i + 1, glcm_labs2)
    glcm_labs = glcm_labs2
    # glcm_labs3 = inds[glcm_labs.flatten()].reshape(glcm_labs.shape)
    # plt.figure()
    # plt.subplot(121), plt.imshow(glcm_labs)
    # plt.subplot(122), plt.imshow(glcm_labs2)
    # plt.show()

    labint = dpgmm.predict(np.vstack((range(0, 256), range(0, 256))).T)
    labim = labint[img.flatten()].reshape(img.shape)
    # labim += 10
    labim2 = labim.copy()
    for i, l in enumerate(inds):
        labim2 = np.where(labim == l, i + 1, labim2)
    labim = labim2
    # labim3 = inds[labim.flatten()].reshape(labim.shape)
    # plt.figure()
    # plt.subplot(121), plt.imshow(labim)
    # plt.subplot(122), plt.imshow(labim2)
    # # plt.subplot(133), plt.imshow(labim3)
    # plt.show()

    labim_f = scindifil.median_filter(labim, size=3)

    plt.figure()
    plt.subplot(131), plt.imshow(img, 'gray', interpolation='nearest'), plt.axis('off')
    plt.subplot(132), plt.imshow(labim, 'jet', interpolation='nearest', vmin=0), plt.axis('off')
    plt.subplot(133), plt.imshow(labim_f, 'jet', interpolation='nearest', vmin=0), plt.axis('off')

    plt.figure()
    plt.subplot(121), plt.imshow(glcm_o, 'jet', interpolation='nearest', vmin=0), plt.axis('off')
    for c in dpgmm.means_:
        plt.plot(c[0], c[1], 'o', markerfacecolor='w', markeredgecolor='k', markersize=12)
    plt.subplot(122), plt.imshow(glcm_labs, 'jet', interpolation='nearest', vmin=0), plt.axis('off')
    for c in dpgmm.means_:
        plt.plot(c[0], c[1], 'o', markerfacecolor='w', markeredgecolor='k', markersize=12)
    plt.show()
X = np.zeros((n_samples, 2))
step = 4 * np.pi / n_samples

for i in xrange(X.shape[0]):
    x = i * step - 6
    X[i, 0] = x + np.random.normal(0, 0.1)
    X[i, 1] = 3 * (np.sin(x) + np.random.normal(0, .2))

color_iter = itertools.cycle(['r', 'g', 'b', 'c', 'm'])

for i, (clf, title) in enumerate([
        (mixture.GMM(n_components=10, covariance_type='full'),
         "Expectation-maximization"),
        (mixture.DPGMM(n_components=10, covariance_type='full', alpha=0.01),
         "Dirichlet Process,alpha=0.01"),
        (mixture.DPGMM(n_components=10, covariance_type='diag', alpha=100.),
         "Dirichlet Process,alpha=100.")]):
    clf.fit(X, n_iter=100)
    splot = pl.subplot(3, 1, 1 + i)
    Y_ = clf.predict(X)
    for i, (mean, covar, color) in enumerate(zip(clf.means_,
                                                 clf._get_covars(),
                                                 color_iter)):
        v, w = linalg.eigh(covar)
        u = w[0] / linalg.norm(w[0])
        # as the DP will not use every component it has access to
        # unless it needs it, we shouldn't plot the redundant
        # components.
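# These last two sine-wave variants are the same example against different
# scikit-learn generations (n_iter moved from fit() into the constructor).
# A hedged sketch of the equivalent trio on scikit-learn >= 0.20, where GMM
# and DPGMM are gone and alpha maps to weight_concentration_prior:
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture

models = [
    (GaussianMixture(n_components=10, covariance_type='full', max_iter=100),
     "Expectation-maximization"),
    (BayesianGaussianMixture(n_components=10, covariance_type='full',
                             weight_concentration_prior_type='dirichlet_process',
                             weight_concentration_prior=0.01, max_iter=100),
     "Dirichlet Process,alpha=0.01"),
    (BayesianGaussianMixture(n_components=10, covariance_type='diag',
                             weight_concentration_prior_type='dirichlet_process',
                             weight_concentration_prior=100., max_iter=100),
     "Dirichlet Process,alpha=100."),
]
for clf, title in models:
    clf.fit(X)  # X as constructed above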