def demo():
    np.random.seed(1)
    gmm = GMM(3, n_iter=1)
    gmm.means_ = np.array([[-1], [0], [3]])
    gmm.covars_ = np.array([[1.5], [1], [0.5]]) ** 2
    gmm.weights_ = np.array([0.3, 0.5, 0.2])
    return gmm.sample(1000)
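# Usage sketch (an addition, not part of the original): with the deprecated
# sklearn.mixture.GMM API, sample(n) returns an (n, n_features) array, so the
# draw from demo() can be inspected directly with a histogram. The newer
# GaussianMixture.sample returns an (X, y) tuple instead.
import matplotlib.pyplot as plt

X = demo()                    # (1000, 1) draws from the 3-component mixture
plt.hist(X.ravel(), bins=50)  # modes near -1, 0 and 3 should be visible
plt.show()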
def main():
    pi = np.array([0.3, 0.5, 0.2])
    mu = np.array([[1, 1], [-1, -1], [-1, 1]]) * 3
    sigma = np.array([
        [[1, 0], [0, 1]],
        [[2, 0], [0, 2]],
        [[0.5, 0], [0, 0.5]],
    ])
    X, C = generate_data(pi, mu, sigma, 1000)
    plt.scatter(X[:, 0], X[:, 1], c=C, s=100, alpha=0.5)
    plt.show()

    # sklearn
    gmm = GMM(n_components=3, covariance_type='full')
    gmm.fit(X)
    print "pi:", gmm.weights_
    print "mu:", gmm.means_
    print "sigma:", gmm.covars_

    pi2, mu2, sigma2, L = expectation_maximization(X, len(pi))
    print "pi:", pi2
    print "mu:", mu2
    print "sigma:", sigma2
    plt.plot(L)
    plt.show()
def fit_gmix(data, ngauss, n_iter, min_covar=MIN_COVAR):
    """
    gtot, T, flux
    etatot, log10(T), log10(flux)

    data is shape [npoints, ndim]
    """
    from sklearn.mixture import GMM

    print("ngauss:   ", ngauss)
    print("n_iter:   ", n_iter)
    print("min_covar:", min_covar)

    gmm = GMM(n_components=ngauss,
              n_iter=n_iter,
              min_covar=min_covar,
              covariance_type='full')
    gmm.fit(data)

    if not gmm.converged_:
        print("DID NOT CONVERGE")

    return gmm
def main():
    optparser = OptionParser()
    optparser.add_option('-e', '--key_embedder', action='store', type='str', dest='key_embedder')
    opts, args = optparser.parse_args()

    fname_embedder = 'data/dataset/model/%s_embedder.pkl' % (opts.key_embedder)
    embedder = WordEmbedder.load(fname_embedder)

    print >> sys.stderr, 'ubm_builder: [info] preparing x'
    iterator = DBTextIterator(50000000)
    x = []
    for seq in iterator:
        if len(seq) == 0:
            continue
        x.append(np.mean(embedder.embed(seq), axis=0))

    for n in [8, 4, 16, 32]:
        print >> sys.stderr, 'ubm_builder: [info] fitting model for n = %d ...' % (n),
        st = time.time()
        ubm = GMM(n_components=n)
        ubm.fit(x)
        print >> sys.stderr, ' OK (%.2f sec)' % (time.time() - st)
        cPickle.dump(ubm, open('data/dataset/gmmubm/db_%s_%d.pkl' % (opts.key_embedder, n), 'w'))
class GaussianMixtureModel(method.Method):

    def __init__(self, params):
        self.params = dict(params)
        del params['features']
        del params['labels']
        self._set_default(params, 'covariance_type', 'full')
        #self._set_default(params, 'n_iter', 200)
        self.classifier = GMM(**params)

    def __str__(self):
        return "Gaussian Mixture Model from scikit-learn.org"

    def train(self, catalog):
        featuresdata = catalog[:, self.params['features']]
        idlabel = np.array(self.params['features'])[-1] + self.params['labels'] + 1
        labelsdata = catalog[:, idlabel]
        labelsdata = labelsdata.reshape(len(labelsdata))
        self.all_labels = np.unique(labelsdata)
        self.classifier.fit(featuresdata, labelsdata)

    def predict(self, data):
        outcat = self.classifier.predict(data)
        outcat = np.unique(self.all_labels)[outcat]
        return outcat, 0.
def algo_gmm(previmage, objmask, nextimage):
    '''
    1. Form a mixture model using obj pixels.
    2. Classify every pixel in nextimage.
    3. Threshold it and classify.
    '''
    import sklearn
    rows = previmage.shape[0]
    cols = previmage.shape[1]
    print previmage.shape
    objpixels = previmage[objmask]
    bgpix = np.ones((rows, cols))
    bgpix[objmask] = 0
    bgmask = np.where(bgpix == 1)
    bgpixels = previmage[bgmask]
    print objpixels.shape

    obj_gmm_model = GMM(n_components=3)
    obj_gmm_model.fit(objpixels)
    bg_gmm_model = GMM(n_components=3)
    #bg_gmm_model.fit(bgpixels)
    print obj_gmm_model.means_

    next_ = nextimage.reshape((rows * cols, 3))
    print next_.shape
    nextlabels_obj = obj_gmm_model.predict_proba(next_)
    #nextlabels_bg = bg_gmm_model.predict_proba(next_)
    nextlabels_obj = nextlabels_obj.reshape(rows, cols, 3)
    nextlabels = obj_gmm_model.predict(next_)
    print nextlabels_obj.shape
    return nextlabels_obj
def extract_gmm_feature(data, max_length_sec=10):
    try:
        filename, lbl = data
        sr, signal = read(filename)
        if len(signal.shape) > 1:
            signal = signal[:, 0]
        signal = signal - signal.mean()
        signal = signal[:max_length_sec * sr]
        signal = np.array(remove_silence(signal, 0.005))
        if np.sum(signal) == 0.0:
            print "Empty", filename
            return filename, None, None

        mfcc = librosa.feature.mfcc(signal,
                                    n_fft=gmm_fft_points,
                                    hop_length=gmm_fft_overlap,
                                    n_mfcc=gmm_mfcc_coefficients,
                                    fmax=5000)
        #mfcc = preprocess_mfcc(mfcc)
        delta_mfcc_1 = delta(mfcc, order=1)
        delta_mfcc_2 = delta(mfcc, order=2)
        total_features = np.vstack([mfcc, delta_mfcc_1, delta_mfcc_2])
        total_features = np.transpose(total_features)
        total_features = preprocess_mfcc(total_features)
        #total_features = StandardScaler().fit_transform(total_features)

        gmm = GMM(n_components=1)
        gmm.fit(total_features)
        res_features = np.hstack([gmm.means_[0], gmm.covars_[0]])
        #print gmm.means_.shape
        #result_features = np.vstack([gmm. ])
        return filename, lbl, res_features
    except Exception, e:
        print e
        return filename, None, None
def predict(self, author_id):
    author = self.db.get_author(author_id, reduced=True)

    descriptor = self.get_matrix([author], True)
    if self.scaler:
        descriptor = self.scaler.transform(descriptor)
    if self.pca:
        descriptor = self.pca.transform(descriptor)
    descriptor = descriptor[0]

    unknown_descriptor = self.get_matrix([author], False)
    if self.scaler:
        unknown_descriptor = self.scaler.transform(unknown_descriptor)
    if self.pca:
        unknown_descriptor = self.pca.transform(unknown_descriptor)
    ud = unknown_descriptor[0]

    ws = self.bg_classifier.weights_
    ms = self.bg_classifier.means_
    cvs = self.bg_classifier.covars_
    agm = GMM(n_components=self.components, covariance_type=self.tp)
    agm.weights_, agm.means_, agm.covars_ = \
        self.em(ws, ms, cvs, [descriptor], self.r)

    # the original ended with an unreachable "return 0.100"; dropped here
    if agm.score(ud) / self.bg_classifier.score(ud) < self.threshold:
        return 1.0
    else:
        return 0.0
def _accumulate_sufficient_statistics(self, stats, obs, framelogprob,
                                      posteriors, fwdlattice, bwdlattice,
                                      params):
    super(GMMHMM, self)._accumulate_sufficient_statistics(
        stats, obs, framelogprob, posteriors, fwdlattice, bwdlattice, params)

    for state, g in enumerate(self.gmms_):
        _, lgmm_posteriors = g.score_samples(obs)
        lgmm_posteriors += np.log(posteriors[:, state][:, np.newaxis]
                                  + np.finfo(np.float).eps)
        gmm_posteriors = np.exp(lgmm_posteriors)

        tmp_gmm = GMM(g.n_components, covariance_type=g.covariance_type)
        n_features = g.means_.shape[1]
        tmp_gmm._set_covars(
            distribute_covar_matrix_to_match_covariance_type(
                np.eye(n_features), g.covariance_type, g.n_components))
        norm = tmp_gmm._do_mstep(obs, gmm_posteriors, params)

        if np.any(np.isnan(tmp_gmm.covars_)):
            raise ValueError

        stats['norm'][state] += norm
        if 'm' in params:
            stats['means'][state] += tmp_gmm.means_ * norm[:, np.newaxis]
        if 'c' in params:
            if tmp_gmm.covariance_type == 'tied':
                stats['covars'][state] += tmp_gmm.covars_ * norm.sum()
            else:
                cvnorm = np.copy(norm)
                shape = np.ones(tmp_gmm.covars_.ndim)
                shape[0] = np.shape(tmp_gmm.covars_)[0]
                cvnorm.shape = shape
                stats['covars'][state] += tmp_gmm.covars_ * cvnorm
def clusterDataSpec(data, k, algorithm):
    '''
    Cluster the given data into a number of clusters determined by BIC.
    @param data: 2D numpy array holding our data.
    @param algorithm:
    @raise LogicalError if algorithm is other than "k-means" or "GMM".
    @return The predicted labels (clusters) for every example.
    '''
    if algorithm not in ["k-means", "GMM"]:
        raise LogicalError, "Method %s: Clustering is made only through K-means or GMM." % (stack()[0][3])
    print "Clustering for k=%d." % (k)
    if algorithm == "k-means":
        data = whiten(data)  # whiten() returns the scaled data; it does not work in place
        codebook, _distortion = kmeans(data, k, 10)  # 10 iterations only to make it faster
    else:
        g = GMM(n_components=k, thresh=1e-05, covariance_type='diag', n_iter=10)
        g.fit(data)

    #print "Optimal number of clusters according to BIC: %d." % (optimalK)

    # Return predicted labels
    if algorithm == "k-means":
        return vq(data, codebook)[0]  # predictions on the same data
    else:
        return g.predict(data)  # predictions on the same data
class PcaGmm(BaseEstimator):

    def __init__(self, X_all, pca_components=12, gmm_components=4,
                 covariance_type="full", min_covar=0.1, gamma=0, C=1.0):
        self.pca_components = pca_components
        self.gmm_components = gmm_components
        self.covariance_type = covariance_type
        self.min_covar = min_covar
        self.gamma = gamma
        self.C = C
        self.X_all = X_all
        X_all = X_all[:, :pca_components]
        self.gmm = GMM(n_components=gmm_components,
                       covariance_type=covariance_type,
                       min_covar=min_covar)
        self.gmm.fit(X_all)

    def fit(self, X, y):
        X = X[:, :self.pca_components]
        X = self.gmm.predict_proba(X)
        self.svm = SVC(C=self.C, gamma=self.gamma)
        self.svm.fit(X, y)

    def predict(self, X):
        X = X[:, :self.pca_components]
        return self.svm.predict(self.gmm.predict_proba(X))

    def score(self, X, y):
        y_pred = self.predict(X)
        return accuracy_score(y, y_pred)

    def transform(self, X, y=None):
        X = X[:, :self.pca_components]
        return self.gmm.predict_proba(X)

    def __str__(self):
        return "PCA(%d)-GMM(%d, %s, %f)-SVM(C=%f, gamma=%f)" % (
            self.pca_components, self.gmm_components, self.covariance_type,
            self.min_covar, self.C, self.gamma)
class Event(object):

    def __init__(self):
        self.clusters = []
        self.gmm = None

    def draw(self):
        self.fig = plt.figure(figsize=(10, 10))
        colors = 'rbgcm'
        for i, cluster in enumerate(self.clusters):
            color = colors[i % len(colors)]
            cluster.draw(color)
        if self.gmm:
            for icircle in range(self.gmm.n_components):
                mean = self.gmm.means_[icircle]
                covar = self.gmm.covars_[icircle]
                sigma = np.sqrt(covar[0])
                g = Gaussian(mean, sigma)
                g.draw()

    def reconstruct(self, nclusters=None):
        if nclusters is None:
            nclusters = len(self.clusters)
        self.gmm = GMM(n_components=nclusters, covariance_type='spherical',
                       init_params='wc', n_iter=10)
        self.gmm.fit(self.samples)
def create_random_gmm(n_mix, n_features, covariance_type, prng=0):
    prng = check_random_state(prng)
    g = GMM(n_mix, covariance_type=covariance_type)
    g.means_ = prng.randint(-20, 20, (n_mix, n_features))
    g.covars_ = make_covar_matrix(covariance_type, n_mix, n_features)
    g.weights_ = normalized(prng.rand(n_mix))
    return g
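# Usage sketch (assumes the old GMM API, where sample(n) returns an array and
# the manually assigned means_/covars_/weights_ are used without fitting):
g = create_random_gmm(n_mix=3, n_features=2, covariance_type='diag', prng=42)
X = g.sample(500)
print(X.shape)  # (500, 2)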
def CVK(X, KRange, covar_type="diag", reps=10):
    N, M = X.shape
    T = len(KRange)
    CVE = np.zeros((T, 1))

    # K-fold crossvalidation
    CV = cross_validation.KFold(N, 5, shuffle=True)

    for t, K in enumerate(KRange):
        print("Fitting model for K={0}\n".format(K))

        # Fit Gaussian mixture model
        gmm = GMM(n_components=K, covariance_type=covar_type,
                  n_init=reps, params="wmc").fit(X)

        # For each crossvalidation fold
        for train_index, test_index in CV:
            # extract training and test set for current CV fold
            X_train = X[train_index]
            X_test = X[test_index]

            # Fit Gaussian mixture model to X_train
            gmm = GMM(n_components=K, covariance_type=covar_type,
                      n_init=reps, params="wmc").fit(X_train)

            # compute negative log likelihood of X_test
            CVE[t] += -gmm.score(X_test).sum()
            # print CVE[t]

    # Plot results
    return CVE
def gmm(X, y, M, C, K=4, cov_type="diag", reps=10):
    # Fit Gaussian mixture model
    gmm = GMM(n_components=K, covariance_type=cov_type,
              n_init=reps, params="wmc").fit(X)
    cls = gmm.predict(X)    # extract cluster labels
    cds = gmm.means_        # extract cluster centroids (means of gaussians)
    covs = gmm.covars_      # extract cluster shapes (covariances of gaussians)
    if cov_type == "diag":
        new_covs = np.zeros([K, M, M])
        count = 0
        for elem in covs:
            temp_m = np.zeros([M, M])
            for i in range(len(elem)):
                temp_m[i][i] = elem[i]
            new_covs[count] = temp_m
            count += 1
        covs = new_covs
    clusterPlot(X, cls, K, C, y, cds, covs)
def _gmm_from_memberships(data, memberships, covariance_type):
    clusters = set(memberships)
    n_clusters = len(clusters)
    gmm = GMM(n_components=n_clusters, params='m')
    gmm.weights_ = np.ones([n_clusters]) / n_clusters
    gmm.means_ = np.zeros([n_clusters, data.shape[1]])
    if covariance_type == 'diag':
        gmm.covars_ = np.zeros([n_clusters, data.shape[1]])
    if covariance_type == 'spherical':
        gmm.covars_ = np.zeros([n_clusters])
    if covariance_type == 'full':
        gmm.covars_ = np.zeros([n_clusters, data.shape[1], data.shape[1]])

    for cluster in clusters:
        cluster = int(cluster)
        indices = (memberships == cluster)
        gmm.means_[cluster, :] = data[indices, :].mean(axis=0)
        if covariance_type in ['diag', 'spherical']:
            #TODO Fix covariance calculation, for now, return cov=1
            #D = np.diag(np.cov(data[indices, :].T))
            D = np.ones([data.shape[1]])
            if covariance_type == 'spherical':
                gmm.covars_[cluster] = D.mean()
            else:
                gmm.covars_[cluster] = D
        if covariance_type == 'full':
            cov_estimator = OAS()
            cov_estimator.fit(data[indices, :])
            gmm.covars_[cluster] = cov_estimator.covariance_
    return gmm
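# A sketch of how this helper might be driven (an assumption, not the original
# caller): seed the mixture from k-means memberships, then read back hard
# assignments. KMeans is standard sklearn; data is a placeholder
# (n_samples, n_features) array.
from sklearn.cluster import KMeans

memberships = KMeans(n_clusters=3).fit_predict(data)
gmm = _gmm_from_memberships(data, memberships, covariance_type='diag')
labels = gmm.predict(data)  # hard assignments under the seeded mixture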
def plot_elbow(self, start, end):
    '''
    Fit GMM and plot elbow using AIC & BIC.
    '''
    from sklearn.mixture import GMM, DPGMM
    obs = self.X_hmm
    aics = []
    bics = []
    for i in range(start, end + 1):
        n_iter = 1000
        for j in range(1, 11):
            g = GMM(n_components=i, n_iter=n_iter)
            g.fit(obs)
            print i
            converged = g.converged_
            if converged:
                print 'j:%d' % (j)
                break
            n_iter += 1000
        aics.append(g.aic(obs))
        bics.append(g.bic(obs))
        if not converged:
            print 'Not Converged!!'
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(range(start, end + 1), aics, label='AIC')
    ax.plot(range(start, end + 1), bics, label='BIC')
    ax.set_xlabel("No. of Clusters")
    ax.set_ylabel("Information Loss")
    ax.set_xticks(range(start, end + 1), minor=True)
    ax.legend()
    ax.grid(True, which='both')
    plt.show()
def clusterEM(train_data, test_data, max):
    best = -1
    best_k = 0
    scores = []
    best_rs = -1
    for rs in range(20):
        for k in range(2, max + 1):
            em = GMM(n_components=k, random_state=rs)
            score, dur = score_clustering(em, train_data, test_data)
            if score > best:
                best = score
                best_k = k
                best_rs = rs
                print('local best k=%d, rs=%d, score=%.3f' % (best_k, best_rs, best))
    print('EM k=%d, rs=%d, score=%.3f' % (best_k, best_rs, best))

    em = GMM(n_components=best_k, random_state=best_rs).fit(train_data)
    clusters = em.predict(test_data)
    plot_clusters(test_data, clusters, 'EM Clusters c=%d' % best_k)

    for k in range(2, max + 1):
        em = GMM(n_components=k, random_state=rs)
        score, dur = score_clustering(em, train_data, test_data)
        scores.append(score)
        if k == best_k:
            print('EM duration: %d' % dur)
    print('EM k=%d, score=%.3f' % (best_k, best))
    return best_rs, scores
def cluster_and_learn_nn(train_data, train_target, test_data, test_target):
    # get cluster assignments for training and test data
    # 2 was the best k per earlier experiments
    km = KMeans(n_clusters=2, random_state=1).fit(train_data)
    train_clusters = km.predict(train_data)
    test_clusters = km.predict(test_data)

    # add the cluster assignment as a feature
    train_with_cluster = np.concatenate(
        (train_data, train_clusters.reshape(len(train_clusters), 1)), axis=1)
    test_with_cluster = np.concatenate(
        (test_data, test_clusters.reshape(len(test_clusters), 1)), axis=1)
    print('KMeans cluster NN')
    learn_nn(train_with_cluster, train_target, test_with_cluster, test_target)

    # repeat with EM
    # 4 = best c per earlier experiments
    em = GMM(n_components=4, random_state=1)
    em.fit(train_data)
    train_clusters = em.predict(train_data)
    test_clusters = em.predict(test_data)

    # add the cluster assignment as a feature
    train_with_cluster = np.concatenate(
        (train_data, train_clusters.reshape(len(train_clusters), 1)), axis=1)
    test_with_cluster = np.concatenate(
        (test_data, test_clusters.reshape(len(test_clusters), 1)), axis=1)
    print('EM cluster NN')
    learn_nn(train_with_cluster, train_target, test_with_cluster, test_target)
class OneClassGMM2(BaseClassifier):
    _predict_params = []
    _fit_params = []

    def __init__(self, *args, **kwargs):
        pass

    def fit(self, data, **kwargs):
        self.gmm = GMM_SKL(2, covariance_type='full')
        self.gmm.fit(data)
        pred = self.gmm.predict(data)
        bcnt = numpy.bincount(pred)
        self.majority_class_index = numpy.argmax(bcnt)
        self.direct_threshold = 0.5

    def predict(self, data):
        score = self.gmm.score(data)
        pred = self.gmm.predict(data)
        tmp = numpy.ones(pred.shape) * -1
        tmp[pred == self.majority_class_index] = 1
        self.score = score
        return tmp

    def decision_function(self, data):
        return -self.score
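# Minimal usage sketch (train_data/test_data are placeholder arrays of shape
# (n_samples, n_features)): points assigned to the minority component come
# back as -1, i.e. they are treated as outliers.
clf = OneClassGMM2()
clf.fit(train_data)
labels = clf.predict(test_data)  # +1 for majority-component points, -1 otherwise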
def train(self, obs):
    obs = numpy.array(obs)
    obs = obs[:, self.attr]
    num_components = 10
    try:
        gmm = GMM(n_components=num_components, covariance_type='diag')
    except:
        gmm = GMM(n_components=num_components, cvtype='diag')
    gmm.fit(obs)
    predictions = gmm.predict(obs)
    for n in range(num_components):
        indexes = numpy.where(predictions == n)[0]
        if len(indexes) > 2:
            s_obs = obs[indexes]
            self.data.append(s_obs.mean(0))
    X = numpy.array(self.data)
    try:
        self.model = svm.OneClassSVM(nu=self.nu, gamma=self.gamma)
        #self.model = svm.OneClassSVM(nu=0.1, gamma=Gamma)
        self.model.fit(X)
    except:
        print "exception in EmoModelOneClassClassifier.train()"
def profile_gmm(cache_dir, group_name, ncomponents=50, filter=None,
                ipython_profile=None):
    cache = Cache(cache_dir)

    group, colnames_group = cpa.db.group_map(group_name, reverse=True,
                                             filter=filter)
    keys = group.keys()
    subsamples = subsample(cache_dir, [group[g] for g in keys], ipython_profile)
    subsampled = np.vstack(subsamples)

    meanvector = np.mean(subsampled, 0)
    mean_centered = subsampled - meanvector

    # perform PCA
    U, s, V = linalg.svd(mean_centered, full_matrices=False)
    percvar_expl = s ** 2 / np.sum(s ** 2)
    scores = np.dot(U, np.diag(s))
    loadings = np.transpose(V)

    # Find the number of PCs required to explain x% of variance
    cutoffpercentage = 80
    percvar_cum = np.cumsum(percvar_expl)
    npc = np.nonzero(percvar_cum > float(cutoffpercentage) / 100)[0][0]
    if npc < 20:
        npc = 20

    # GMM
    gmm = GMM(ncomponents, cvtype='full')
    gmm.fit(scores[:, :npc], n_iter=100000, thresh=1e-7)

    parameters = [(cache_dir, group[g], gmm, meanvector, loadings[:, :npc])
                  for g in keys]
    variables = ['Component %d' % i for i in range(ncomponents)]
    return Profiles.compute(keys, variables, _compute_mixture_probabilities,
                            parameters, ipython_profile,
                            group_name=group_name)
def cluster_and_label(name, data_to_cluster):
    data_to_cluster = scale(data_to_cluster)
    """
    km_labels_store_file = folder + '%s_%s_%s_classifier.pickle' % (n_clusters, 'km', name)
    if load_pickled_labels:
        with open(km_labels_store_file, 'rb') as fid:
            km_labels = cPickle.load(fid)
    else:
        km = KMeans(n_clusters=n_clusters)
        km.fit(data_to_cluster)
        km_labels = km.predict(data_to_cluster)
        with open(km_labels_store_file, 'wb') as fid:
            cPickle.dump(km_labels, fid)
    make_plots('km', name, km_labels)
    make_tables('km', name, km_labels)
    """
    gmm_labels_store_file = folder + '%s_%s_%s_classifier.pickle' % (n_clusters, 'gmm', name)
    if load_pickled_labels:
        with open(gmm_labels_store_file, 'rb') as fid:
            gmm_labels = cPickle.load(fid)
    else:
        gmm = GMM(n_components=n_clusters, covariance_type='full')
        gmm.fit(data_to_cluster)
        gmm_labels = gmm.predict(data_to_cluster)
        with open(gmm_labels_store_file, 'wb') as fid:
            cPickle.dump(gmm_labels, fid)
    #make_plots('gmm', name, gmm_labels)
    make_tables('gmm', name, gmm_labels)
def __init__(self, n_components=1, covariance_type='diag',
             random_state=None, thresh=1e-2, min_covar=1e-3,
             n_iter=1000, n_init=1, params='', init_params=''):
    GMM.__init__(self, n_components, covariance_type, random_state,
                 thresh, min_covar, n_iter, n_init, params, init_params)
def test_GMM(self):
    X = [0.9, 1., 1.9, 2., 2.1, 1.1]
    gmm = GMM(n_components=2, covariance_type='spherical',
              init_params='wc', n_iter=20)
    gmm.fit(X)
    y_train_predict = gmm.predict(X)
    assert list(y_train_predict) == [1, 1, 0, 0, 0, 1] \
        or list(y_train_predict) == [0, 0, 1, 1, 1, 0]
    assert gmm.means_.mean() > 1.45 and gmm.means_.mean() < 1.55
def gmm_component_filter(self, nc=20, threshold=0.72, show=True):
    clf = GMM(nc, n_iter=500, random_state=3).fit(self.fiter.y)
    ss = clf.predict(self.fiter.y)

    self.fiter.df['p_rk_cg'] = self.fiter.df['profit_cg'].rank()
    self.fiter.df['ss'] = ss
    win_top = len(self.fiter.df['profit_cg']) - len(self.fiter.df['profit_cg']) * 0.25
    loss_top = len(self.fiter.df['profit_cg']) * 0.25
    self.fiter.df['rk'] = 0
    self.fiter.df['rk'] = np.where(self.fiter.df['p_rk_cg'] > win_top, 1,
                                   self.fiter.df['rk'])
    self.fiter.df['rk'] = np.where(self.fiter.df['p_rk_cg'] < loss_top, -1,
                                   self.fiter.df['rk'])

    xt = pd.crosstab(self.fiter.df['ss'], self.fiter.df['rk'])
    xt_pct = xt.div(xt.sum(1).astype(float), axis=0)

    if show:
        xt_pct.plot(figsize=(16, 8), kind='bar', stacked=True,
                    title=str('ss') + ' -> ' + str('result'))
        plt.xlabel(str('ss'))
        plt.ylabel(str('result'))

    ZLog.info(xt_pct[xt_pct[-1] > threshold])
    ZLog.info(xt_pct[xt_pct[1] > threshold])

    self.top_loss_ss = xt_pct[xt_pct[-1] > threshold].index
    self.top_win_ss = xt_pct[xt_pct[1] > threshold].index
    return xt, xt_pct
def get_space_color_clusters(img_df, alpha, n_components):
    img_df = img_df.copy()
    img_df[['l', 'a', 'b']] = img_df[['l', 'a', 'b']] * alpha
    mm = GMM(n_components=n_components)
    img_pred = mm.fit(img_df).predict(img_df)
    if not mm.converged_:
        LOGGER.warning(
            'Space-color mixture model did not converge for parameters alpha = {}, n_components = {}'
            .format(alpha, n_components))
    img_all = pd.concat([img_df, pd.Series(img_pred)], axis=1)
    img_all = img_all.rename(columns={0: 'c'})
    img_all.index.name = 'order'
    color_clusters = img_all.groupby('c')[['l', 'a', 'b']].mean()
    n_before = len(img_all)
    img_merged = img_all.reset_index()\
        .merge(color_clusters.reset_index(), on='c', suffixes=['', '_m'])\
        .sort('order').set_index('order')
    assert n_before == len(img_merged),\
        'Some rows were somehow lost during join: size before = {}, size after = {}'.format(n_before, len(img_merged))
    assert np.all(img_merged.apply(np.isfinite).apply(np.all)), 'Merged data frame has NA values somehow'
    return {
        'img_df': img_merged,
        'alpha': alpha,
        'n_components': n_components,
        'model': mm
    }
def GetFeatures(driverID, j):
    #print driverID
    driverDir = '../Kaggle/drivers/' + str(driverID)
    cur_driver_df = pd.DataFrame(np.zeros((200, 4000)))
    for index, tripID in enumerate(tripFiles):
        #print tripID
        trip = Trip(driverID, tripID,
                    pd.read_csv(driverDir + '/' + str(tripID) + '.csv'))
        X = trip.features
        X = X[(X.v < vlim[1]) & (X.v > vlim[0])]
        X = X[(X.acc < clim[1]) & (X.acc > clim[0])]
        X.index = range(X.shape[0])
        xN = np.asanyarray(X)

        # train GMM
        #gmms = [GMM(n_components=n, covariance_type='full').fit(xN) for n in n_components]
        #BICs = [gmm.bic(xN) for gmm in gmms]
        #i_min = np.argmin(BICs)
        #clf = gmms[i_min]
        #print '%s components' % (n_components[i_min])
        try:
            clf = GMM(n_components=5, covariance_type='full').fit(xN)
            # note: num=40 belongs to the second linspace (the original passed
            # it to meshgrid by mistake)
            X_, Y_ = np.meshgrid(np.linspace(clim[0], clim[1], num=80),
                                 np.linspace(vlim[0], vlim[1], num=40))
            XX = np.array([X_.ravel(), Y_.ravel()]).T
            Z = np.exp(clf.score(XX))
            cur_driver_df.loc[tripID] = Z
        except:
            print 'exception driver %d trip %d' % (driverID, tripID)
    cur_driver_df.loc[1:].to_csv(featuresDir + '/' + str(driverID) + '.csv',
                                 index=False)
    return 0
def _find_best_split(self):
    print "OpenCL Regression split"
    #X = [0.9, 1., 1.9, 2., 2.1, 1.1]
    X = [prediction for (feature_value, prediction) in enumerate(self._first_feature())]
    gmm = GMM(n_components=2, covariance_type='spherical',
              init_params='wc', n_iter=10)
    gmm.fit(X)
    classes = numpy.array(gmm.predict(X)).astype(numpy.float32)

    y_dim = self._seen_samples()
    x_dim = self.number_of_decision_functions
    A = numpy.empty((y_dim, x_dim)).astype(numpy.float32)
    for i, feature in enumerate(self.randomly_selected_features):
        for j, (feature_value, prediction) in enumerate(self.samples[feature]):
            A[j, i] = feature_value

    gini_matrix = RegressionTreeSecretOpenCL.opencl_calc.opencl_gini_matrix(A, classes)
    argmax = gini_matrix.argmax()
    y_max = argmax / x_dim
    x_max = argmax % x_dim
    feature_value = A[y_max, x_max]
    feature = self.randomly_selected_features[x_max]
    best_split = {
        'left': numpy.array([x[1] for x in self.samples[feature] if x[0] <= feature_value]),
        'right': numpy.array([x[1] for x in self.samples[feature] if x[0] > feature_value]),
        'threshold': feature_value,
        'feature': feature
    }
    best_split_score = gini_matrix[y_max, x_max]
    return (best_split, best_split_score)
def build_dictionary(self, features):
    """
    :param features: numpy array of shape [n_samples, n_features]
    """
    # compute mean and covariance matrix for the PCA
    pca_mean = features.mean(axis=0)
    features = features - pca_mean
    cov = np.dot(features.T, features)

    # compute PCA matrix and keep only pca_dimension dimensions
    eigvals, eigvecs = np.linalg.eig(cov)
    perm = eigvals.argsort()
    pca_transform = eigvecs[:, perm[-self.pca_dimension:]]

    # transform sample with PCA
    features = np.dot(features, pca_transform)

    # train GMM
    gmm = GMM(n_components=self.dictionary_size)
    gmm.fit(features)

    self.pca_mean = pca_mean
    self.pca_transform = pca_transform
    self.weights = gmm.weights_
    self.means = gmm.means_
    self.covariance = np.sqrt(1 / gmm.covars_)
    self.gmm = gmm
# Indicators of abnormal liver function.
# From the existing features, select "*天门冬氨酸氨基转换酶" (AST), "*丙氨酸氨基转换酶" (ALT),
# "*碱性磷酸酶" (alkaline phosphatase), "*r-谷氨酰基转换酶" (GGT), "白蛋白" (albumin) and
# "*球蛋白" (globulin) as features.
# Cluster on these features to separate normal from abnormal liver function.
def temp_feature(x, Value):
    if x < Value:
        return 0
    else:
        return 1

temp_df = df[["*天门冬氨酸氨基转换酶", "*丙氨酸氨基转换酶", "*碱性磷酸酶",
              "*r-谷氨酰基转换酶", "白蛋白", "*球蛋白"]]
temp_df["temp_lable0"] = temp_df.loc[:, "*天门冬氨酸氨基转换酶"].apply(lambda x: temp_feature(x, 40))
temp_df["temp_lable1"] = temp_df.loc[:, "*丙氨酸氨基转换酶"].apply(lambda x: temp_feature(x, 40))
temp_df["temp_lable2"] = temp_df.loc[:, "*碱性磷酸酶"].apply(lambda x: temp_feature(x, 185))
temp_df["temp_lable3"] = temp_df.loc[:, "*r-谷氨酰基转换酶"].apply(lambda x: temp_feature(x, 35))
temp_df["temp_lable4"] = temp_df.loc[:, "白蛋白"].apply(lambda x: temp_feature(x, 55))
temp_df["temp_lable5"] = temp_df.loc[:, "*球蛋白"].apply(lambda x: temp_feature(x, 35))

gmm = GMM(n_components=2).fit(temp_df)
labels = gmm.predict(temp_df)
df["liver_trouble_feature"] = labels

# Experts point out that blood glucose and uric acid (kidney function) are
# correlated: sufficiently advanced diabetes affects kidney function.
# So add an extra feature here marking people with high uric acid.
def temp_feature(x, Value):
    if x < Value:
        return 0
    else:
        return 1

df["high_feature_UA"] = df["尿酸"].apply(lambda x: temp_feature(x, 420))

# Data normalization.
# The means of the different features still differ quite a bit, so do some
# feature scaling here, compressing the data into the interval [-1, 1].
# Only process features whose names do not contain "feature"; a sketch follows.
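# A sketch of the scaling step the comment above describes (an assumption, not
# the original code): MaxAbsScaler divides each column by its maximum absolute
# value, compressing it into [-1, 1].
from sklearn.preprocessing import MaxAbsScaler

cols = [c for c in df.columns if 'feature' not in c]
df[cols] = MaxAbsScaler().fit_transform(df[cols])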
    sr, audio = read(source + path)
    #audio = (audio / 32767).astype(int)
    #print(audio)
    #print(sr)

    # extract 40 dimensional MFCC & delta MFCC features
    vector = extract_features(audio, sr)

    if features.size == 0:
        features = vector
    else:
        features = np.vstack((features, vector))

    # when features of 5 files of speaker are concatenated, then do model training
    if count == 5:
        print(np.mean(features))
        gmm = GMM(n_components=16, max_iter=200, covariance_type='diag', n_init=3)
        gmm.fit(features)

        # dumping the trained gaussian model
        picklefile = path.split("-")[0] + ".gmm"
        print(picklefile)
        cPickle.dump(gmm, open(dest + picklefile, 'wb'))
        print('+ modeling completed for speaker:', picklefile,
              " with data point = ", features.shape)
        features = np.asarray(())
        count = 0
    count = count + 1
def train(args):
    print("Loading embeddings.")
    fname = "{}/labels.csv".format(args.workDir)
    labels = pd.read_csv(fname, header=None).as_matrix()[:, 1]
    labels = map(itemgetter(1),
                 map(os.path.split,
                     map(os.path.dirname, labels)))  # Get the directory.
    fname = "{}/reps.csv".format(args.workDir)
    embeddings = pd.read_csv(fname, header=None).as_matrix()
    le = LabelEncoder().fit(labels)
    labelsNum = le.transform(labels)
    nClasses = len(le.classes_)
    print("Training for {} classes.".format(nClasses))

    if args.classifier == 'LinearSvm':
        clf = SVC(C=1, kernel='linear', probability=True)
    elif args.classifier == 'GridSearchSvm':
        print("""
        Warning: In our experiences, using a grid search over SVM hyper-parameters only
        gives marginally better performance than a linear SVM with C=1 and
        is not worth the extra computations of performing a grid search.
        """)
        param_grid = [{
            'C': [1, 10, 100, 1000],
            'kernel': ['linear']
        }, {
            'C': [1, 10, 100, 1000],
            'gamma': [0.001, 0.0001],
            'kernel': ['rbf']
        }]
        clf = GridSearchCV(SVC(C=1, probability=True), param_grid, cv=5)
    elif args.classifier == 'GMM':  # Doesn't work best
        clf = GMM(n_components=nClasses)
    # ref:
    # http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html#example-classification-plot-classifier-comparison-py
    elif args.classifier == 'RadialSvm':  # Radial Basis Function kernel
        # works better with C = 1 and gamma = 2
        clf = SVC(C=1, kernel='rbf', probability=True, gamma=2)
    elif args.classifier == 'DecisionTree':  # Doesn't work best
        clf = DecisionTreeClassifier(max_depth=20)
    elif args.classifier == 'GaussianNB':
        clf = GaussianNB()
    # ref: https://jessesw.com/Deep-Learning/
    elif args.classifier == 'DBN':
        from nolearn.dbn import DBN
        clf = DBN(
            [embeddings.shape[1], 500, labelsNum[-1:][0] + 1],  # i/p nodes, hidden nodes, o/p nodes
            learn_rates=0.3,
            # Smaller steps mean a possibly more accurate result, but the
            # training will take longer
            learn_rate_decays=0.9,
            # a factor the initial learning rate will be multiplied by
            # after each iteration of the training
            epochs=300,  # no of iterations
            # dropouts=0.25,  # Express the percentage of nodes that
            # will be randomly dropped as a decimal.
            verbose=1)

    if args.ldaDim > 0:
        clf_final = clf
        clf = Pipeline([('lda', LDA(n_components=args.ldaDim)),
                        ('clf', clf_final)])

    clf.fit(embeddings, labelsNum)

    fName = "{}/classifier.pkl".format(args.workDir)
    print("Saving classifier to '{}'".format(fName))
    with open(fName, 'w') as f:
        pickle.dump((le, clf), f)
def parcellate_region_1(roilist, sub, nClusters, scan, scan_type, savepng=0,
                        session=1, algo=0, type_cor=0):
    p_dir = '/home/ajoshi/HCP_data'
    r_factor = 3
    ref_dir = os.path.join(p_dir, 'reference')
    ref = '100307'
    fn1 = ref + '.reduce' + str(r_factor) + '.LR_mask.mat'
    fname1 = os.path.join(ref_dir, fn1)
    msk = scipy.io.loadmat(fname1)  # h5py.File(fname1)
    dfs_left = readdfs(
        os.path.join(p_dir, 'reference',
                     ref + '.aparc.a2009s.32k_fs.reduce3.' + 'left' + '.dfs'))
    dfs_left_sm = readdfs(
        os.path.join(p_dir, 'reference',
                     ref + '.aparc.a2009s.32k_fs.reduce3.very_smooth.' + 'left' + '.dfs'))
    data = scipy.io.loadmat(
        os.path.join(p_dir, sub,
                     sub + '.rfMRI_REST' + str(session) + scan +
                     '.reduce3.ftdata.NLM_11N_hvar_25.mat'))

    LR_flag = msk['LR_flag']
    LR_flag = np.squeeze(LR_flag) > 0
    data = data['ftdata_NLM']
    temp = data[LR_flag, :]
    m = np.mean(temp, 1)
    temp = temp - m[:, None]
    s = np.std(temp, 1) + 1e-16
    temp = temp / s[:, None]
    msk_small_region = np.in1d(dfs_left.labels, roilist)
    # (dfs_left.labels == 46) | (dfs_left.labels == 28) \
    # | (dfs_left.labels == 29)  # % motor
    d = temp[msk_small_region, :]
    rho = np.corrcoef(d)
    rho[~np.isfinite(rho)] = 0
    # rho = np.abs(rho)
    d_corr = temp[~msk_small_region, :]
    rho_1 = np.corrcoef(d, d_corr)
    rho_1 = rho_1[range(d.shape[0]), d.shape[0]:]
    rho_1[~np.isfinite(rho_1)] = 0

    if type_cor == 1:
        # f_rho = np.arctanh(rho_1)
        # f_rho[~np.isfinite(f_rho)] = 0
        B = np.corrcoef(rho_1)
        B[~np.isfinite(B)] = 0
        affinity_matrix = affinity_mat(B)
        affinity_matrix[~np.isfinite(affinity_matrix)] = 0
        # B = np.abs(B)

    # SC = DBSCAN()
    if algo == 0:
        SC = SpectralClustering(n_clusters=nClusters, affinity='precomputed')
        # SC = SpectralClustering(n_clusters=nClusters, gamma=0.025)
        if type_cor == 0:
            affinity_matrix = affinity_mat(rho)
            labels = SC.fit_predict(affinity_matrix)
        if type_cor == 1:
            labels = SC.fit_predict(affinity_matrix)
        # affinity_matrix = SC.fit(np.abs(d))
    elif algo == 1:
        g = nx.Graph()
        g.add_edges_from(dfs_left.faces[:, (0, 1)])
        g.add_edges_from(dfs_left.faces[:, (1, 2)])
        g.add_edges_from(dfs_left.faces[:, (2, 0)])
        Adj = nx.adjacency_matrix(g)
        AdjS = Adj[(msk_small_region), :]
        AdjS = AdjS[:, (msk_small_region)]
        AdjS = AdjS.todense()
        np.fill_diagonal(AdjS, 1)
        SC = AgglomerativeClustering(n_clusters=nClusters, connectivity=AdjS)
        labels = SC.fit_predict(rho)
    elif algo == 2:
        GM = GMM(n_components=nClusters, covariance_type='full', n_iter=100)
        GM.fit(rho)
        labels = GM.predict(rho)
    elif algo == 3:
        neighbour_correlation(rho, dfs_left_sm.faces, dfs_left_sm.vertices,
                              msk_small_region)

    if savepng > 0:
        r = dfs_left_sm
        r.labels = np.zeros([r.vertices.shape[0]])
        r.labels[msk_small_region] = labels + 1
        cent = separate(labels, r, r.vertices, nClusters)
        manual_order = np.array([0 for x in range(nClusters)])
        save = np.array([0 for x in range(nClusters)])
        for i in range(0, nClusters):
            if nClusters > 1:
                choose_vector = np.argmax(cent.transpose(), axis=1)
                save[i] = cent[choose_vector[1]][1]
                correspondence_point = find_location_smallmask(
                    r.vertices, cent[choose_vector[1]], msk_small_region)
                cent[choose_vector[1]][1] = -np.Inf
                manual_order[i] = choose_vector[1]
                if i == 0:  # change
                    correlation_within_precuneus_vector = sp.array(
                        rho[correspondence_point])
                    correlation_with_rest_vector = sp.array(
                        rho_1[correspondence_point])
                else:
                    correlation_within_precuneus_vector = sp.vstack([
                        correlation_within_precuneus_vector,
                        [rho[correspondence_point]]
                    ])
                    correlation_with_rest_vector = sp.vstack([
                        correlation_with_rest_vector,
                        [rho_1[correspondence_point]]
                    ])
            else:
                choose_vector = 0
                correspondence_point = find_location_smallmask(
                    r.vertices, cent, msk_small_region)
                manual_order[i] = choose_vector
                if i == 0:  # change
                    correlation_within_precuneus_vector = sp.array(
                        rho[correspondence_point])
                    correlation_with_rest_vector = sp.array(
                        rho_1[correspondence_point])

        manual_order = change_order(manual_order, nClusters)
        r.labels = change_labels(r.labels, manual_order, nClusters)
        new_cent = separate(r.labels, r, temp, nClusters)
        if nClusters > 1:
            for i in range(0, nClusters):
                cent[manual_order[i]][1] = save[i]
        '''mlab.triangular_mesh(r.vertices[:, 0], r.vertices[:, 1],
                             r.vertices[:, 2], r.faces,
                             representation='surface', opacity=1,
                             scalars=np.float64(r.labels))
        for i in range(nClusters):
            mlab.points3d(new_cent[i][0], new_cent[i][1], new_cent[i][2])
        mlab.gcf().scene.parallel_projection = True
        mlab.view(azimuth=0, elevation=90)
        mlab.colorbar(orientation='horizontal')
        mlab.draw()
        mlab.savefig(filename='clusters_' + str(nClusters) + '_rois_' +
                     str(roilist) + 'subject_' + sub + 'session' +
                     str(session) + '_labels.png')
        mlab.close()'''

    # return (r, correspondence_vector, msk_small_region)
    return (r, correlation_within_precuneus_vector,
            correlation_with_rest_vector, msk_small_region, new_cent)
def fit_new(self, x, label):
    self.y.append(label)
    gmm = GMM(self.gmm_order)
    gmm.fit(x)
    self.gmms.append(gmm)
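# A sketch of the matching predict step such a one-GMM-per-class wrapper
# typically pairs with (an assumption, not part of the original). On the old
# GMM API, score(x) returns per-sample log-likelihoods, so summing gives a
# total log-likelihood per stored model; numpy is assumed imported as np.
def predict_new(self, x):
    scores = [gmm.score(x).sum() for gmm in self.gmms]
    return self.y[int(np.argmax(scores))]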
def clustering_experiment(X, y, name, clusters, rdir):
    """Generate results CSVs for given datasets using the K-Means and EM
    clustering algorithms.

    Args:
        X (Numpy.Array): Attributes.
        y (Numpy.Array): Labels.
        name (str): Dataset name.
        clusters (list[int]): List of k values.
        rdir (str): Output directory.

    """
    sse = defaultdict(dict)  # sum of squared errors
    logl = defaultdict(dict)  # log-likelihood
    bic = defaultdict(dict)  # BIC for EM
    aic = defaultdict(dict)  # AIC for EM
    silhouette = defaultdict(dict)  # silhouette score
    acc = defaultdict(lambda: defaultdict(dict))  # accuracy scores
    adjmi = defaultdict(lambda: defaultdict(dict))  # adjusted mutual info
    homo = defaultdict(lambda: defaultdict(dict))  # homogeneity scores
    km = KMeans(random_state=0)  # K-Means
    gmm = GMM(random_state=0)  # Gaussian Mixture Model (EM)

    # start loop for given values of k
    print('DATASET: %s' % name)
    for k in clusters:
        print('K: %s' % k)
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)
        km.fit(X)
        gmm.fit(X)

        # calculate SSE, log-likelihood, accuracy, adjusted mutual info and homogeneity
        sse[k][name] = km.score(X)
        logl[k][name] = gmm.score(X)
        acc[k][name]['km'] = cluster_acc(y, km.predict(X))
        acc[k][name]['gmm'] = cluster_acc(y, gmm.predict(X))
        adjmi[k][name]['km'] = ami(y, km.predict(X))
        adjmi[k][name]['gmm'] = ami(y, gmm.predict(X))
        homo[k][name]['km'] = homogeneity_score(y, km.predict(X))
        homo[k][name]['gmm'] = homogeneity_score(y, gmm.predict(X))

        # calculate silhouette score for K-Means
        km_silhouette = silhouette_score(X, km.predict(X))
        silhouette[k][name] = km_silhouette

        # calculate BIC and AIC for EM
        bic[k][name] = gmm.bic(X)
        aic[k][name] = gmm.aic(X)

    # generate output dataframes
    sse = (-pd.DataFrame(sse)).T
    sse.rename(columns={name: 'sse'}, inplace=True)
    logl = pd.DataFrame(logl).T
    logl.rename(columns={name: 'log-likelihood'}, inplace=True)
    bic = pd.DataFrame(bic).T
    bic.rename(columns={name: 'bic'}, inplace=True)
    aic = pd.DataFrame(aic).T
    aic.rename(columns={name: 'aic'}, inplace=True)
    silhouette = pd.DataFrame(silhouette).T
    silhouette.rename(columns={name: 'silhouette_score'}, inplace=True)
    acc = pd.Panel(acc)
    acc = acc.loc[:, :, name].T.rename(lambda x: '{}_acc'.format(x),
                                       axis='columns')
    adjmi = pd.Panel(adjmi)
    adjmi = adjmi.loc[:, :, name].T.rename(lambda x: '{}_adjmi'.format(x),
                                           axis='columns')
    homo = pd.Panel(homo)
    homo = homo.loc[:, :, name].T.rename(lambda x: '{}_homo'.format(x),
                                         axis='columns')

    # concatenate all results
    dfs = (sse, silhouette, logl, bic, aic, acc, adjmi, homo)
    metrics = pd.concat(dfs, axis=1)
    print(metrics)
    resfile = get_abspath('{}_train_metrics.csv'.format(name), rdir)
    metrics.to_csv(resfile, index_label='k')
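# Hypothetical driver for clustering_experiment using synthetic data (the
# dataset, k values and output directory are assumptions, not the original run):
import numpy as np

X = np.random.randn(500, 10)
y = np.random.randint(0, 3, size=500)
clustering_experiment(X, y, name='synthetic', clusters=[2, 3, 4, 5], rdir='results/')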
import argos.io as io
import argos.plot as tplot
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from sklearn.mixture import GMM

traj_list = io.load("1_traj_seg.dt")
traj_list = traj_list[:1000]
X = np.fromfile("gaussian_representation.dat", dtype=float)
D = io.load_distance_matrix("distance1.npz")

no_of_cluster = 12
gmm = GMM(n_components=no_of_cluster, n_iter=1000)
labels = gmm.fit_predict(X)

# Postprocessing
clusters = [[] for i in range(no_of_cluster)]
no = len(traj_list)
for i in range(no):
    label = int(labels[i])
    clusters[label].append(traj_list[i])

silhouette = metrics.silhouette_score(D, labels, sample_size=1000)
print("Silhouette Coefficient : %.3f" % silhouette)

# Plotting Clustered Trajectories
color_list = plt.rcParams['axes.prop_cycle'].by_key()['color']
for i in range(no_of_cluster):
T_x, T_y = get_graph_segments(model.X_train_, model.full_tree_)
T_trunc_x, T_trunc_y = get_graph_segments(model.X_train_, model.cluster_graph_)

#------------------------------------------------------------
# Fit a GMM to each individual cluster
Nx = 100
Ny = 250
Xgrid = np.vstack(
    map(np.ravel,
        np.meshgrid(np.linspace(xmin, xmax, Nx),
                    np.linspace(ymin, ymax, Ny)))).T

density = np.zeros(Xgrid.shape[0])
for i in range(n_components):
    ind = (labels == i)
    gmm = GMM(4).fit(X[ind])
    dens = np.exp(gmm.score(Xgrid))
    dens /= dens.max()
    density += dens
density = density.reshape((Ny, Nx))

#----------------------------------------------------------------------
# Plot the results
fig = plt.figure(figsize=(7, 8))
fig.subplots_adjust(hspace=0, left=0.1, right=0.95, bottom=0.1, top=0.9)

ax = fig.add_subplot(311, aspect='equal')
ax.scatter(X[:, 1], X[:, 0], s=1, lw=0, c='k')
ax.set_xlim(ymin, ymax)
ax.set_ylim(xmin, xmax)
for d in dirs:
    features = []
    print d
    for i in range(2):
        f = choice(glob.glob(d + "/*.wav"))
        fs, signal = wavfile.read(f)
        mfcc = extractor.extract_differential(signal)
        features.extend(mfcc)
    mfccs.append(features)

print "start training"
gmms = []
for idx, mfcc in enumerate(mfccs):
    print idx
    gmm = GMM(32, n_iter=1000, thresh=0.001)
    gmm.fit(mfcc)
    gmms.append(gmm)
print "done training"

def cal_score(model, mfcc):
    return np.exp(sum(model.score(mfcc)) / 1000)

def pred_label(mfcc):
    scores = [cal_score(gmm, mfcc) for gmm in gmms]
    return max(enumerate(scores), key=operator.itemgetter(1))[0]
for path in file_paths:
    path = path.strip()
    print(path)

    # read the audio
    sr, audio = read(source + path)

    # extract 40 dimensional MFCC & delta MFCC features
    vector = extract_features(audio, sr)

    if features.size == 0:
        features = vector
    else:
        features = np.vstack((features, vector))

    # when features of 3 files of speaker are concatenated, then do model training
    if count == 3:
        gmm = GMM(n_components=16, n_iter=200, covariance_type='diag', n_init=3)
        gmm.fit(features)

        # dumping the trained gaussian model
        temp_path = path.strip(".wav")
        picklefile = temp_path.strip("3") + ".gmm"
        cpk.dump(gmm, open(dest + picklefile, 'wb'))
        print('+ modeling completed for speaker:', picklefile,
              " with data point = ", features.shape)
        features = np.asarray(())
    count = count + 1
import matplotlib
import matplotlib.pyplot as plt

textsize = 15
matplotlib.rcParams.update({'font.size': textsize})

plotdir = '../../plot/unsupervised/'
datadir = '../../data/unsupervised/'
preprossdatadir = '../../data/preprocess/'
source = "Cs137"  # "Co60"

with open(datadir + source + 'featuretrain.dat', 'rb') as f:
    feature = pickle.load(f)
X = feature[:, 4:]

gmm = GMM(n_components=2, covariance_type='full', max_iter=100,
          random_state=20).fit(X)
glabels = gmm.predict(X)

kmeans = KMeans(n_clusters=2, n_init=20).fit(X)
klabels = kmeans.predict(X)

density = DBSCAN(eps=0.5, min_samples=10).fit(X)
dlabels = density.labels_

# save glabel result
with open(preprossdatadir + source + 'normedwaveform0.dat', 'rb') as f:
    data = pickle.load(f)
paradf = data['para']
paradf['glabels'] = glabels
with open(datadir + source + 'testresultlabel.dat', 'wb') as f:
# TODO: Apply a PCA transformation to the sample log-data
pca_samples = pca.transform(log_samples)

# Create a DataFrame for the reduced data
reduced_data = pd.DataFrame(reduced_data, columns=['Dimension 1', 'Dimension 2'])

# Display sample log-data after applying PCA transformation in two dimensions
display(pd.DataFrame(np.round(pca_samples, 4),
                     columns=['Dimension 1', 'Dimension 2']))

# TODO: Apply your clustering algorithm of choice to the reduced data
from sklearn.mixture import GMM
clusterer = GMM(n_components=2, covariance_type='full', random_state=42)
clusterer.fit(reduced_data)

# TODO: Predict the cluster for each data point
preds = clusterer.predict(reduced_data)

# TODO: Find the cluster centers
centers = clusterer.means_

# TODO: Predict the cluster for each transformed sample data point
sample_preds = clusterer.predict(pca_samples)

# TODO: Calculate the mean silhouette coefficient for the number of clusters chosen
from sklearn.metrics import silhouette_score
score = silhouette_score(reduced_data, preds, random_state=42)
print score
train_index, test_index = next(iter(indices))

# Extract training data and labels
X_train = iris.data[train_index]
y_train = iris.target[train_index]

# Extract testing data and labels
X_test = iris.data[test_index]
y_test = iris.target[test_index]

# Extract the number of classes
num_classes = len(np.unique(y_train))

# Build GMM
classifier = GMM(n_components=num_classes, covariance_type='full',
                 init_params='wc', n_iter=20)

# Initialize the GMM means
classifier.means_ = np.array(
    [X_train[y_train == i].mean(axis=0) for i in range(num_classes)])

# Train the GMM classifier
classifier.fit(X_train)

plt.figure()
colors = 'bgr'
for i, color in enumerate(colors):
    # Extract eigenvalues and eigenvectors
    eigenvalues, eigenvectors = np.linalg.eigh(
def __init__(self, Xpoints, numMixtures):
    print "Scikits Learn Implementation Chosen"
    LikelihoodEvaluator.__init__(self, Xpoints, numMixtures)
    from sklearn.mixture import GMM as GMMEval
    self.evaluator = GMMEval(n_components=numMixtures)
    self.Xpoints = Xpoints
################## GMM #####################

# computes accuracy given the predictions and real labels
def accuracy(predictions, labels):
    batch_size = predictions.shape[0]
    sum = np.sum(predictions == labels)
    acc = (100.0 * sum) / batch_size
    return acc

n_classes = 10  # 10 genre classes

# Try GMMs using different types of covariances. I'm only using 'full' as it
# performs better, but different types can be added to try.
classifiers = dict((covar_type,
                    GMM(n_components=n_classes, covariance_type=covar_type,
                        init_params='wc', n_iter=5))
                   for covar_type in ['full'])

print("Training GMM")

for index, (name, classifier) in enumerate(classifiers.items()):
    # Since we have class labels for the training data, we can
    # initialize the GMM parameters in a supervised manner.
    classifier.means_ = np.array([train_data[train_labels == i].mean(axis=0)
                                  for i in range(n_classes)])

    # Train the other parameters using the EM algorithm.
    classifier.fit(train_data)

    # getting predictions of training set
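    # The snippet breaks off above; a sketch of the evaluation step the last
    # comment points at, reusing the accuracy() helper (test_data/test_labels
    # are assumed to exist alongside train_data/train_labels):
    y_train_pred = classifier.predict(train_data)
    print('%s train accuracy: %.1f%%' % (name, accuracy(y_train_pred, train_labels)))
    y_test_pred = classifier.predict(test_data)
    print('%s test accuracy: %.1f%%' % (name, accuracy(y_test_pred, test_labels)))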
def training(self):
    self.gmm = GMM(n_components=2, covariance_type='diag', verbose=False)
    self.gmm.fit(self.train)
    columns=[
        'job', 'marital', 'education', 'default', 'housing', 'loan',
        'contact', 'month', 'day_of_week', 'poutcome'
    ])
X = X.dropna()  # dropna() returns a new frame; assign it back
X['y'].value_counts()
X['y'] = X['y'].map({'yes': 1, 'no': 0})
y = X['y']
X = X.drop(['y'], axis=1)

pca = LinearDiscriminantAnalysis(n_components=1)
X = pca.fit_transform(X, y)

# note: train_test_split returns (X_train, X_test, y_train, y_test) in that order
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    random_state=0)

# n_components = np.arange(1, 2)
# models = [GMM(n, covariance_type='full', random_state=0).fit(X_train)
#           for n in n_components]
#
# plt.plot(n_components, [m.bic(X_train) for m in models], label='BIC')
# plt.plot(n_components, [m.aic(X_train) for m in models], label='AIC')
# plt.legend(loc='best')
# plt.xlabel('n_components')

model = GMM(3, covariance_type='full', random_state=0).fit(X)
cluster_labels = model.predict(X)
print('NMI: {}'.format(metrics.normalized_mutual_info_score(y, cluster_labels)))
print('Homogeneity: {}'.format(metrics.homogeneity_score(y, cluster_labels)))
print('Completeness: {}'.format(metrics.completeness_score(y, cluster_labels)))
#plt.savefig('ds2_gmm_rp.png')
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Neural Network Lib
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture as GMM

# Get Data
l = ['FA', 'ICA', 'PCA', 'RP']
o = []
for al in l:
    print(al)
    data = pd.read_csv('../datasets/{}_credit.csv'.format(al))
    y = data.default
    X = data.drop('default', axis=1)
    model = GMM(n_components=2).fit(X)
    o = model.predict(X)
    data['cluster_labels'] = o
    data.to_csv('../datasets/reduced_clustered_dataset_gmm_{}.csv'.format(al))
from sklearn.mixture import GMM
import numpy as np
from math import *
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats, integrate
import seaborn as sns
sns.set(color_codes=True)

# Data Generation
gmm = GMM(2, covariance_type='diag')
gmm.means_ = np.array([[1], [4]])
gmm.weights_ = np.array([0.5, 0.5])
gmm.covars_ = np.array([[1], [1]])
X = gmm.sample(1000)

# Histogram
num_bins = 50
n, bins, patches = plt.hist(X, num_bins, normed=1, facecolor='green', alpha=0.5)
plt.show()

#################################################
# Gibbs Sampling Algorithm
#################################################
poids = 0.5

# Initialization
theta_p = []
theta_p.append(-0.2)
count = 1
# Extracting features for each speaker (5 files per speaker)
features = np.asarray(())
for path in file_paths:
    path = path.strip()
    print path

    # read the audio
    sr, audio = read(source + path)

    # extract 40 dimensional MFCC & delta MFCC features
    vector = extract_features(audio, sr)

    if features.size == 0:
        features = vector
    else:
        features = np.vstack((features, vector))

    # when features of 5 files of speaker are concatenated, then do model training
    # -> if count == 5:  (edited below to 15)
    if count == 15:
        gmm = GMM(n_components=16, covariance_type='diag', n_init=3)
        gmm.fit(features)

        # dumping the trained gaussian model
        picklefile = path.split("-")[0] + ".gmm"
        cPickle.dump(gmm, open(dest + picklefile, 'w'))
        print '+ modeling completed for speaker:', picklefile, " with data point = ", features.shape
        features = np.asarray(())
        count = 0
    count = count + 1
def get_gmm(data, tdata, num_classes):
    gmm = GMM(n_components=num_classes).fit(data)
    lout = gmm.predict(data)
    lout2 = gmm.predict(tdata)
    return lout.reshape(lout.shape[0], 1), lout2.reshape(lout2.shape[0], 1)
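# Usage sketch: append the predicted cluster id as an extra input column
# (train_x/test_x are placeholder (n_samples, n_features) arrays):
train_clusters, test_clusters = get_gmm(train_x, test_x, num_classes=10)
train_aug = np.hstack([train_x, train_clusters])
test_aug = np.hstack([test_x, test_clusters])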
def run_all_classifiers(X_train, X_test, y_train, y_test, print_output_scores_to_csv=False,
                        output_scores_csv_file_suffix='', print_only_table=False):
    """
    The list of all classifiers was generated by running the following commented code.

    Args:
        X_train, X_test, y_train, y_test: The train and test datasets.
        print_output_scores_to_csv: If True, the Precision, Recall, F1-Score and Support
            for both classes will be printed to a file with the current date and time.
        output_scores_csv_file_suffix: Suffix to be added to the csv file just before the
            .csv extension. Normally describing the run that is being performed.

    Returns:
        dataset: Returns output scores dataset.

    """
    assert isinstance(X_train, pd.core.frame.DataFrame)
    assert isinstance(X_test, pd.core.frame.DataFrame)
    assert isinstance(y_train, pd.core.frame.Series)
    assert isinstance(y_test, pd.core.frame.Series)
    assert isinstance(print_output_scores_to_csv, bool)
    assert isinstance(output_scores_csv_file_suffix, object)

    import time

    # https://stackoverflow.com/questions/42160313/how-to-list-all-classification-regression-clustering-algorithms-in-scikit-learn
    #from sklearn.utils.testing import all_estimators
    #estimators = all_estimators()
    #for name, class_ in estimators:
    #    log_print(name)

    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.ensemble import BaggingClassifier
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.gaussian_process import GaussianProcessClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.linear_model import SGDClassifier
    from sklearn.mixture import BayesianGaussianMixture
    from sklearn.mixture import DPGMM
    from sklearn.mixture import GaussianMixture
    from sklearn.mixture import GMM
    from sklearn.mixture import VBGMM
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.naive_bayes import GaussianNB
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.neural_network import MLPClassifier
    from sklearn.semi_supervised import LabelPropagation
    from sklearn.semi_supervised import LabelSpreading
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier
    #from xgboost import XGBClassifier

    models = []
    models.append(('AdaBoostClassifier', AdaBoostClassifier()))
    models.append(('BaggingClassifier', BaggingClassifier()))
    models.append(('BayesianGaussianMixture', BayesianGaussianMixture()))
    models.append(('BernoulliNB', BernoulliNB()))
    models.append(('CalibratedClassifierCV', CalibratedClassifierCV()))
    models.append(('DPGMM', DPGMM()))
    models.append(('DecisionTreeClassifier', DecisionTreeClassifier(random_state=SEED)))
    models.append(('ExtraTreesClassifier', ExtraTreesClassifier(random_state=SEED)))
    models.append(('GMM', GMM()))
    models.append(('GaussianMixture', GaussianMixture()))
    models.append(('GaussianNB', GaussianNB()))
    models.append(('GaussianProcessClassifier', GaussianProcessClassifier()))
    models.append(('GradientBoostingClassifier', GradientBoostingClassifier()))
    models.append(('KNeighborsClassifier', KNeighborsClassifier()))
    models.append(('LabelPropagation', LabelPropagation()))
    models.append(('LabelSpreading', LabelSpreading()))
    models.append(('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis()))
    models.append(('LogisticRegression', LogisticRegression()))
    models.append(('LogisticRegressionCV', LogisticRegressionCV()))
    models.append(('MLPClassifier', MLPClassifier()))
    #models.append(('MultinomialNB', MultinomialNB()))
    #models.append(('NuSVC', NuSVC()))
    models.append(('QuadraticDiscriminantAnalysis', QuadraticDiscriminantAnalysis()))
    models.append(('RandomForestClassifier', RandomForestClassifier(random_state=SEED)))
    models.append(('SGDClassifier', SGDClassifier()))
    models.append(('SVC', SVC()))
    models.append(('VBGMM', VBGMM()))
    #models.append(('XGBClassifier', XGBClassifier()))

    output_scores_df = fit_predict_plot(X_train, X_test, y_train, y_test,
                                        models, print_only_table)

    if print_output_scores_to_csv:
        # the original was missing the closing parenthesis on this call
        output_scores_df.to_csv(time.strftime(
            'output_scores' + str(output_scores_csv_file_suffix) + '.csv'))

    return output_scores_df

def run_all_classifiers(X_train, X_test, y_train, y_test, print_details=True):
    """
    Run all classifiers of sklearn.

    Args:
        X_train, X_test, y_train, y_test: The train and test datasets.
        print_details: If True, print details of all models and save a csv table;
            if False, print only a table with a summary of the models.

    Returns:
        dataset: Returns output scores dataset.

    """
    assert isinstance(X_train, pd.core.frame.DataFrame)
    assert isinstance(X_test, pd.core.frame.DataFrame)
    assert isinstance(y_train, pd.core.frame.Series)
    assert isinstance(y_test, pd.core.frame.Series)
    assert isinstance(print_details, bool)

    log_method_execution_time(log_funcname())

    from sklearn.utils.testing import all_estimators
    import sklearn.metrics
    import time
    from src.util.acq_util import RANDOM_SEED

    # https://stackoverflow.com/questions/42160313/how-to-list-all-classification-regression-clustering-algorithms-in-scikit-learn
    #from xgboost import XGBClassifier
    #models.append(('XGBClassifier', XGBClassifier()))

    models = all_estimators(type_filter='classifier')
    output_scores_dataset = pd.DataFrame(
        index=['Precision 0', 'Recall 0', 'F1-Score 0', 'Support 0',
               'Precision 1', 'Recall 1', 'F1-Score 1', 'Support 1'],
        columns=list(zip(*models))[0])

    for name, model in models:
        if print_details is True:
            print('------------------------------------------------------------------------------')
            print(name)
            print('------------------------------------------------------------------------------')

        if (name == 'MultinomialNB' or name == 'NuSVC'
                or name == 'RadiusNeighborsClassifier'
                or name == 'GaussianProcessClassifier'):
            continue

        model = model()
        if 'random_state' in model.get_params():
            model.random_state = SEED

        # Fitting the model.
        model.fit(X_train, y_train)

        # Measuring accuracy.
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        output_scores_dataset = class_compute_accuracy(
            y_train, y_train_pred, output_scores_dataset,
            ['Accuracy on the train set', name], print_details)
        output_scores_dataset = class_compute_accuracy(
            y_test, y_test_pred, output_scores_dataset,
            ['Accuracy on the test set', name], print_details)

        # Plotting confusion matrix.
        output_scores_dataset = class_compute_plot_confusion_matrix(
            y_test, y_test_pred, output_scores_dataset, name, print_details)

        # Showing classification report.
        if print_details is True:
            print(sklearn.metrics.classification_report(y_test, y_test_pred))

        # Printing scores to output dataset.
        output_scores_dataset = class_compute_recall_precision_f1(
            y_test, y_test_pred, output_scores_dataset, name)

    # Can use idxmax with axis=1 to find the column with the greatest value on each row.
    output_scores_dataset['Max Value'] = output_scores_dataset.apply(max, axis=1)
    #output_scores_dataset['Max Classifier'] = output_scores_dataset.idxmax(axis=1)

    if print_details is True:
        output_scores_dataset.to_csv('output_scores' + '.csv')

    return output_scores_dataset

def train_test_split_for_classification(dataset, label, test_size, random_state=SEED):
    """
    Selects X and y, considering that y has been renamed to label.
    """
    from sklearn.model_selection import train_test_split

    assert isinstance(dataset, pd.core.frame.DataFrame)
    assert isinstance(test_size, float)
    assert isinstance(random_state, int)

    X = dataset.loc[:, dataset.columns != label]
    y = dataset[label]  # the original used a stray global g_label here
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y)
    log_print('X_train: {}'.format(X_train.shape))
    log_print('y_train: {}'.format(y_train.shape))
    log_print('X_test: {}'.format(X_test.shape))
    log_print('y_test: {}'.format(y_test.shape))
    return (X_train, X_test, y_train, y_test)
def gmm_entropy(points, n_est=None, n_components=None):
    r"""
    Use sklearn.mixture.BayesianGaussianMixture to estimate entropy.

    *points* are the data points in the sample.

    *n_est* are the number of points to use in the estimation; default is
    10,000 points, or 0 for all the points.

    *n_components* are the number of Gaussians in the mixture. Default is
    $5 \sqrt{d}$ where $d$ is the number of dimensions.

    Returns estimated entropy and uncertainty in the estimate.

    This method uses BayesianGaussianMixture from scikit-learn to build a
    model of the point distribution, then uses Monte Carlo sampling to
    determine the entropy of that distribution. The entropy uncertainty is
    computed from the variance in the MC sample scaled by the number of
    samples. This does not incorporate any uncertainty in the sampling that
    generated the point distribution or the uncertainty in the GMM used to
    model that distribution.
    """
    #from sklearn.mixture import GaussianMixture as GMM
    from sklearn.mixture import BayesianGaussianMixture as GMM
    n, d = points.shape

    # Default to the full set
    if n_est is None:
        n_est = 10000
    elif n_est == 0:
        n_est = n

    # reduce size of draw to n_est
    if n_est >= n:
        x = points
        n_est = n
    else:
        x = points[permutation(n)[:n_est]]
        n = n_est

    if n_components is None:
        n_components = int(5 * sqrt(d))

    ## Standardization doesn't seem to help
    ## Note: sigma may be zero
    #x, mu, sigma = standardize(x)  # if standardized
    predictor = GMM(
        n_components=n_components,
        covariance_type='full',
        #verbose=True,
        max_iter=1000)
    predictor.fit(x)
    eval_x, _ = predictor.sample(n_est)
    weight_x = predictor.score_samples(eval_x)
    H = -np.mean(weight_x)
    #with np.errstate(divide='ignore'):
    #    H = H + np.sum(np.log(sigma))  # if standardized
    dH = np.std(weight_x, ddof=1) / sqrt(n)

    ## cross-check against own calcs
    #alt = GaussianMixture(predictor.weights_, mu=predictor.means_, sigma=predictor.covariances_)
    #print("alt", H, alt.entropy())
    #print(np.vstack((weight_x[:10], alt.logpdf(eval_x[:10]))).T)

    return H / LN2, dH / LN2
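# Sanity-check sketch (an addition, not part of the original): for a
# d-dimensional standard normal the entropy is (d/2) * log2(2*pi*e) bits,
# i.e. about 4.094 bits for d = 2, so the estimate should land near that.
import numpy as np

x = np.random.randn(10000, 2)
H, dH = gmm_entropy(x)
print(H, "+/-", dH)  # expect roughly 4.094 bits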
def wnn_entropy(points, k=None, weights=True, n_est=None, gmm=None):
    r"""
    Weighted Kozachenko-Leonenko nearest-neighbour entropy calculation.

    *k* is the number of neighbours to consider, with default $k = n^{1/3}$.

    *n_est* is the number of points to use for estimating the entropy,
    with default $n_{\rm est} = n$.

    *weights* is True for default weights, False for unweighted (using the
    distance to the kth neighbour only), or a vector of weights of length *k*.

    *gmm* is the number of gaussians to use to model the distribution using
    a gaussian mixture model. Default is 0, and the points represent an
    empirical distribution.

    Returns entropy H in bits and its uncertainty.

    Berrett, T. B., Samworth, R. J., Yuan, M., 2016. Efficient multivariate
    entropy estimation via k-nearest neighbour distances.
    DOI:10.1214/18-AOS1688 https://arxiv.org/abs/1606.00304
    """
    from sklearn.neighbors import NearestNeighbors
    n, d = points.shape

    # Default to the full set
    if n_est is None:
        n_est = 10000
    elif n_est == 0:
        n_est = n

    # reduce size of draw to n_est
    if n_est >= n:
        x = points
        n_est = n
    else:
        x = points[permutation(n)[:n_est]]
        n = n_est

    # Default k based on n
    if k is None:
        # Private communication: cube root of n is a good choice for k
        # Personal observation: k should be much bigger than d
        k = max(int(n**(1 / 3)), 3 * d)

    # If weights are given then use them (setting the appropriate k),
    # otherwise use the default weights.
    if isinstance(weights, bool):
        weights = _wnn_weights(k, d, weights)
    else:
        k = len(weights)
    #print("weights", weights, sum(weights))

    # select knn algorithm
    algorithm = 'auto'
    #algorithm = 'kd_tree'
    #algorithm = 'ball_tree'
    #algorithm = 'brute'

    n_components = 0 if gmm is None else gmm

    # H = 1/n sum_i=1^n sum_j=1^k w_j log E_{j,i}
    # E_{j,i} = e^-Psi(j) V_d (n-1) z_{j,i}^d = C z^d
    # logC = -Psi(j) + log(V_d) + log(n-1)
    # H = 1/n sum sum w_j logC + d/n sum sum w_j log(z)
    #   = sum w_j logC + d/n sum sum w_j log(z)
    #   = A + d/n B
    # H^2 = 1/n sum
    Psi = digamma(np.arange(1, k + 1))
    logVd = d / 2 * log(pi) - gammaln(1 + d / 2)
    logC = -Psi + logVd + log(n - 1)

    # TODO: standardizing points doesn't work.
    # Standardize the data so that distances conform. This is equivalent to
    # a u-substitution u = sigma x + mu, so the integral needs to be corrected
    # for dU = det(sigma) dx. Since the standardization squishes the dimensions
    # independently, sigma is a diagonal matrix, with the determinant equal to
    # the product of the diagonal elements.
    #x, mu, sigma = standardize(x)  # Note: sigma may be zero
    #detDU = np.prod(sigma)
    detDU = 1.

    if n_components > 0:
        # Use Gaussian mixture to model the distribution
        from sklearn.mixture import GaussianMixture as GMM
        predictor = GMM(n_components=gmm, covariance_type='full')
        predictor.fit(x)
        eval_x, _ = predictor.sample(n_est)
        #weight_x = predictor.score_samples(eval_x)
        skip = 0
    else:
        # Empirical distribution
        # TODO: should we use the full draw for kNN and a subset for eval points?
        # Choose a subset for evaluating the entropy estimate, if desired
        #print(n_est, n)
        #eval_x = x if n_est >= n else x[permutation(n)[:n_est]]
        eval_x = x
        #weight_x = 1
        skip = 1

    tree = NearestNeighbors(algorithm=algorithm, n_neighbors=k + skip)
    tree.fit(x)
    dist, _ind = tree.kneighbors(eval_x, n_neighbors=k + skip,
                                 return_distance=True)
    # Remove first column. Since test points are in x, the first column will
    # be a point from x with distance 0, and can be ignored.
    if skip:
        dist = dist[:, skip:]
This can be problematic for MCMC runs where a # step is rejected, and therefore identical points are in the distribution. # Ignore them by replacing these points with nan and using nanmean. # TODO: need proper analysis of duplicated points in MCMC chain dist[dist == 0] = nan logdist = log(dist) H_unweighted = logC + d * np.nanmean(logdist, axis=0) H = np.dot(H_unweighted, weights)[0] Hsq_k = np.nanmean((logC[-1] + d * logdist[:, -1])**2) # TODO: abs shouldn't be needed? if Hsq_k < H**2: print("warning: avg(H^2) < avg(H)^2") dH = sqrt(abs(Hsq_k - H**2) / n_est) #print("unweighted", H_unweighted) #print("weighted", H, Hsq_k, H**2, dH, detDU, LN2) return H * detDU / LN2, dH * detDU / LN2
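# A hedged usage sketch along the same lines: the kNN estimator checked
# against the analytic entropy of a standard normal, where det(Sigma) = 1 so
# H = (d/2) log2(2 pi e) bits. Assumes wnn_entropy and numpy (as np) are in
# scope; the sample size is illustrative only.
def _check_wnn_entropy():
    rng = np.random.RandomState(0)
    d = 2
    draw = rng.multivariate_normal(np.zeros(d), np.eye(d), size=10000)
    H_true = 0.5*d*np.log2(2*np.pi*np.e)
    H_est, dH = wnn_entropy(draw)
    print("analytic %.3f  estimated %.3f +/- %.3f bits" % (H_true, H_est, dH))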
from sklearn.model_selection import GridSearchCV
import sys
import time

out = '../results/clustering/'

perm_x, perm_y, housing_x, housing_y = load_data()  # perm, housing
# np.reshape(perm_y, 30000, order='F')
# raise Exception('Remove this line to run code')

SSE = defaultdict(dict)   # sum of squared errors
ll = defaultdict(dict)    # log likelihood
acc = defaultdict(lambda: defaultdict(dict))
adjMI = defaultdict(lambda: defaultdict(dict))

km = kmeans(random_state=5)
gmm = GMM(random_state=5)

st = time.time()
print(len(clusters))
for k in clusters:
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    km.fit(perm_x)
    gmm.fit(perm_x)
    SSE[k]['perm'] = km.score(perm_x)
    ll[k]['perm'] = gmm.score(perm_x)
    acc[k]['perm']['Kmeans'] = cluster_acc(perm_y, km.predict(perm_x))
    acc[k]['perm']['GMM'] = cluster_acc(perm_y, gmm.predict(perm_x))
    adjMI[k]['perm']['Kmeans'] = ami(perm_y, km.predict(perm_x))
    adjMI[k]['perm']['GMM'] = ami(perm_y, gmm.predict(perm_x))
def train_from_images(self, filenames):
    raw_patches, raw_unspread_patches, raw_unspread_patches_padded, raw_originals = \
        self.random_patches_from_images(filenames)
    if len(raw_patches) == 0:
        raise Exception("No patches found, maybe your thresholds are too strict?")
    # Also store these in "settings"

    mixtures = []
    llhs = []
    for i in range(1):
        mixture = ag.stats.BernoulliMixture(self.num_parts, raw_patches, init_seed=0+i)
        mixture.run_EM(1e-8, min_probability=self.settings['min_probability'])
        mixtures.append(mixture)
        llhs.append(mixture.loglikelihood)

    best_i = np.argmax(llhs)
    mixture = mixtures[best_i]

    ag.info("Done.")

    counts = np.bincount(mixture.mixture_components(), minlength=self.num_parts)
    print(counts)
    print('Total', np.sum(counts))
    from scipy.stats.mstats import mquantiles
    print(mquantiles(counts))

    # Reject weak parts
    scores = np.empty(self.num_parts)
    for i in range(self.num_parts):
        part = mixture.templates[i]
        sh = part.shape
        p = part.reshape((sh[0]*sh[1], sh[2]))

        pec = p.mean(axis=0)

        N = np.sum(p * np.log(p/pec) + (1-p) * np.log((1-p)/(1-pec)))
        D = np.sqrt(np.sum(np.log(p/pec * (1-pec)/(1-p))**2 * p * (1-p)))
        # Old:
        #D = np.sqrt(np.sum(np.log(p/(1-p))**2 * p * (1-p)))

        scores[i] = N/D

        # Require at least 20 occurrences
        #if counts[i] < 5:
        #    scores[i] = 0

    # Only keep with a certain score
    if not self.settings['bedges']['contrast_insensitive']:
        visparts = mixture.remix(raw_originals)
    else:
        visparts = np.empty((self.num_parts,) + raw_originals.shape[1:])
        #self.extra['originals'] = []

        # Improved visparts
        comps = mixture.mixture_components()
        for i in range(self.num_parts):
            ims = raw_originals[comps == i].copy()
            #self.extra['originals'].append(ims)

            # Stretch them all out
            #for j in xrange(len(ims)):
            #    ims[j] = (ims[j] - ims[j].min()) / (ims[j].max() - ims[j].min())

            # Now, run a GMM with NM components on this and take the most common
            NM = 2

            from sklearn.mixture import GMM
            gmix = GMM(n_components=NM)
            gmix.fit(ims.reshape((ims.shape[0], -1)))

            visparts[i] = gmix.means_[gmix.weights_.argmax()].reshape(ims.shape[1:])

    # Unspread parts
    unspread_parts_all = mixture.remix(raw_unspread_patches)
    unspread_parts_padded_all = mixture.remix(raw_unspread_patches_padded)

    # The parts to keep
    ok = (scores > 1) & (counts >= 10)

    #if 'originals' in self.extra:
    #    self.extra['originals'] = list(itr.compress(self.extra['originals'], ok))

    scores = scores[ok]
    counts = counts[ok]

    self.parts = mixture.templates[ok]
    self.unspread_parts = unspread_parts_all[ok]
    self.unspread_parts_padded = unspread_parts_padded_all[ok]
    self.visparts = visparts[ok]
    self.num_parts = self.parts.shape[0]  # Update num_parts

    # Store the stuff in the instance
    #self.parts = mixture.templates
    #self.visparts = mixture.remix(raw_originals)

    # Sort the parts according to orientation, for better diagnostics
    if 1:
        E = self.parts.shape[-1]
        # The eight compass directions. Note: the original listed [-1, 1]
        # twice; the eighth direction should presumably be [-1, -1].
        ang = np.array([[0, -1], [1, -1], [1, 0], [1, 1],
                        [0, 1], [-1, 1], [-1, 0], [-1, -1]])
        nang = ang / np.expand_dims(np.sqrt(ang[:, 0]**2 + ang[:, 1]**2), 1)

        orrs = np.apply_over_axes(np.mean, self.parts, [1, 2]).reshape((self.num_parts, -1))
        if E == 8:
            orrs = orrs[..., :4] + orrs[..., 4:]
            nang = nang[:4]
        norrs = orrs / np.expand_dims(orrs.sum(axis=1), 1)
        dirs = (np.expand_dims(norrs, -1) * nang).sum(axis=1)
        self.orientations = np.asarray([math.atan2(x[1], x[0]) for x in dirs])
        II = np.argsort(self.orientations)

    # Note: the orientation sort above is immediately overridden by the
    # score sort below.
    II = np.argsort(scores)

    scores = scores[II]
    counts = counts[II]

    self.extra['scores'] = scores
    self.extra['counts'] = counts
    #self.extra['originals'] = [self.extra['originals'][ii] for ii in II]

    # Now resort the parts according to this sorting
    self.orientations = self.orientations[II]
    self.parts = self.parts[II]
    self.unspread_parts = self.unspread_parts[II]
    self.unspread_parts_padded = self.unspread_parts_padded[II]
    self.visparts = self.visparts[II]

    self._preprocess_logs()
            lamb[i] = l
            break
        v_old = v_new
    return (V_est, lamb)


cov1 = 0.25 * np.identity(5)
cov2 = np.identity(5)
Kd = [[-0.5, 1.1, 0.2, -0.9, 0.2],
      [0.2, -0.1, 0.5, -0.8, 1.0],
      [-0.3, 0.2, 0.9, 0.7, 1.0],
      [0.2, 0.9, 0.1, -0.4, 0.5]]

dataset1 = datacreation(Kd, cov1)
dataset2 = datacreation(Kd, cov2)

components = 4

# fit Gaussian mixtures using the EM algorithm
gmm1 = GMM(n_components=components)
gmm1.fit(dataset1)
gmm2 = GMM(n_components=components)
gmm2.fit(dataset2)

print('Predicted means and covariances of 1st mixture = \n', gmm1.means_)
print('\n')
print(gmm1.covars_)
print('Predicted means and covariances of 2nd mixture = \n', gmm2.means_)
print('\n')
print(gmm2.covars_)

# tensor method
X = datacreation(Kd, cov1)
mu = calculate_first_moment(X)
Sigma = calculate_second_moment(X)
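# calculate_first_moment and calculate_second_moment are not defined in this
# snippet. For reference, a minimal sketch consistent with how they are used
# above (the empirical first moment E[x] and second moment E[x x^T]) is:
def calculate_first_moment(X):
    # E[x]: mean over the samples, shape (d,)
    return X.mean(axis=0)

def calculate_second_moment(X):
    # E[x x^T]: average outer product over the samples, shape (d, d)
    return X.T @ X / X.shape[0]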
X_2D = model.transform(X_iris)
iris['PCA1'] = X_2D[:, 0]
iris['PCA2'] = X_2D[:, 1]
sns.lmplot("PCA1", "PCA2", hue='species', data=iris, fit_reg=False)

print("#---------------------------------------#")
print("               Clustering                ")
print("#---------------------------------------#")
print("\n")

# Gaussian mixture model (GMM)
# covariance_type='full': each component has its own full covariance matrix
from sklearn.mixture import GMM
model = GMM(n_components=3, covariance_type='full')
model.fit(X_iris)
y_gmm = model.predict(X_iris)

iris['cluster'] = y_gmm
sns.lmplot("PCA1", "PCA2", data=iris, hue='species', col='cluster', fit_reg=False)

print("#---------------------------------------#")
print("           Hand-written digits           ")
print("#---------------------------------------#")
print("\n")
import cv2
from matplotlib import pyplot as plt

# Use plant cells to demo the GMM on 2 components
# Use BSE_Image to demo it on 4 components
# Use alloy.jpg to demonstrate BIC and how 2 is optimal for alloy

img = cv2.imread("images/BSE.tif")
plt.imshow(img)

# Convert MxNx3 image into Kx3 where K=MxN
img2 = img.reshape((-1, 3))  # -1 means infer the size, in this case MxN

from sklearn.mixture import GaussianMixture as GMM

# covariance choices: full, tied, diag, spherical
gmm_model = GMM(n_components=4, covariance_type='tied').fit(img2)  # tied works better than full
gmm_labels = gmm_model.predict(img2)

# Put the labels back into the original shape so we can reconstruct the segmented image
original_shape = img.shape
segmented = gmm_labels.reshape(original_shape[0], original_shape[1])
plt.imshow(segmented)
#cv2.imwrite("images/segmented.jpg", segmented)

##############################################################
# How to know the best number of components?
# Use the Bayesian information criterion (BIC) to find the best number of components.

import numpy as np
import cv2

img = cv2.imread("images/BSE.tif")
img2 = img.reshape((-1, 3))
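# A hedged sketch of the BIC sweep the comments above set up: fit mixtures
# with increasing n_components and pick the value that minimizes gmm.bic(img2).
# The range 1..9 is illustrative; 'tied' matches the segmentation above.
n_components = np.arange(1, 10)
gmm_models = [GMM(n, covariance_type='tied').fit(img2) for n in n_components]
bic_values = [m.bic(img2) for m in gmm_models]
plt.plot(n_components, bic_values, marker='o')
plt.xlabel('n_components')
plt.ylabel('BIC')
plt.show()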
def __do_perform(self, custom_out=None, main_experiment=None):
    # './output/ICA/clustering/{}', ICAExperiment
    if custom_out is not None:
        # if not os.path.exists(custom_out):
        #     os.makedirs(custom_out)
        self._old_out = self._out  # './output/ICA/{}'
        self._out = custom_out     # './output/ICA/clustering/{}'
    elif self._old_out is not None:
        self._out = self._old_out

    if main_experiment is not None:
        self.log("Performing {} as part of {}".format(
            self.experiment_name(), main_experiment.experiment_name()))  # 'clustering', 'ICA'
    else:
        self.log("Performing {}".format(self.experiment_name()))

    # Adapted from https://github.com/JonathanTay/CS-7641-assignment-3/blob/master/clustering.py
    # %% Data for 1-3
    sse = defaultdict(list)
    ll = defaultdict(list)
    bic = defaultdict(list)
    sil = defaultdict(lambda: defaultdict(list))
    sil_s = np.empty(shape=(2*len(self._clusters)*self._details.ds.training_x.shape[0], 4),
                     dtype='<U21')
    acc = defaultdict(lambda: defaultdict(float))
    adj_mi = defaultdict(lambda: defaultdict(float))

    km = kmeans(random_state=self._details.seed)
    gmm = GMM(random_state=self._details.seed)

    st = clock()
    j = 0
    for k in self._clusters:
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)
        # cluster the ICA-transformed input features with kMeans and GMM for varying k
        km.fit(self._details.ds.training_x)
        gmm.fit(self._details.ds.training_x)

        # assign each ICA-transformed input feature a cluster label
        km_labels = km.predict(self._details.ds.training_x)
        gmm_labels = gmm.predict(self._details.ds.training_x)

        # mean silhouette score over all ICA-transformed input features
        sil[k]['Kmeans'] = sil_score(self._details.ds.training_x, km_labels)
        sil[k]['GMM'] = sil_score(self._details.ds.training_x, gmm_labels)

        # silhouette score for each ICA-transformed input feature
        km_sil_samples = sil_samples(self._details.ds.training_x, km_labels)
        gmm_sil_samples = sil_samples(self._details.ds.training_x, gmm_labels)

        # There has got to be a better way to do this, but I can't brain right now.
        # Record the silhouette score x for each instance i given its label
        # km_labels[i], for each k.
        for i, x in enumerate(km_sil_samples):
            sil_s[j] = [k, 'Kmeans', round(x, 6), km_labels[i]]
            j += 1
        for i, x in enumerate(gmm_sil_samples):
            sil_s[j] = [k, 'GMM', round(x, 6), gmm_labels[i]]
            j += 1

        # km.score is the opposite of the value of X on the k-means objective,
        # i.e. the negated sum of squared distances to the nearest centroid
        sse[k] = [km.score(self._details.ds.training_x)]
        # per-sample average log-likelihood
        ll[k] = [gmm.score(self._details.ds.training_x)]
        # Bayesian information criterion on the input X (lower is better)
        bic[k] = [gmm.bic(self._details.ds.training_x)]

        # accuracy of the clustering on the ICA-transformed data against the
        # original y-labels, treating each cluster as predicting its majority y-label
        acc[k]['Kmeans'] = cluster_acc(self._details.ds.training_y, km_labels)
        acc[k]['GMM'] = cluster_acc(self._details.ds.training_y, gmm_labels)

        # adjusted mutual information between the true labels and the cluster
        # labels (how well the clustering matches the truth)
        adj_mi[k]['Kmeans'] = ami(self._details.ds.training_y, km_labels)
        adj_mi[k]['GMM'] = ami(self._details.ds.training_y, gmm_labels)

        self.log("Cluster: {}, time: {}".format(k, clock() - st))

    sse = (-pd.DataFrame(sse)).T
    sse.index.name = 'k'
    sse.columns = ['{} sse (left)'.format(self._details.ds_readable_name)]  # Bank sse (left)

    ll = pd.DataFrame(ll).T
    ll.index.name = 'k'
    ll.columns = ['{} log-likelihood'.format(self._details.ds_readable_name)]  # Bank log-likelihood

    bic = pd.DataFrame(bic).T
    bic.index.name = 'k'
    bic.columns = ['{} BIC'.format(self._details.ds_readable_name)]  # Bank BIC

    sil = pd.DataFrame(sil).T
    sil_s = pd.DataFrame(sil_s, columns=['k', 'type', 'score', 'label']).set_index('k')  #.T
    # sil_s = sil_s.T
    acc = pd.DataFrame(acc).T
    adj_mi = pd.DataFrame(adj_mi).T
    sil.index.name = 'k'
    sil_s.index.name = 'k'
    acc.index.name = 'k'
    adj_mi.index.name = 'k'

    # write scores to files
    sse.to_csv(self._out.format('{}_sse.csv'.format(self._details.ds_name)))
    ll.to_csv(self._out.format('{}_logliklihood.csv'.format(self._details.ds_name)))
    bic.to_csv(self._out.format('{}_bic.csv'.format(self._details.ds_name)))
    sil.to_csv(self._out.format('{}_sil_score.csv'.format(self._details.ds_name)))
    sil_s.to_csv(self._out.format('{}_sil_samples.csv'.format(self._details.ds_name)))
    acc.to_csv(self._out.format('{}_acc.csv'.format(self._details.ds_name)))
    adj_mi.to_csv(self._out.format('{}_adj_mi.csv'.format(self._details.ds_name)))

    # %% NN fit data (2,3)
    # train a NN on the clustered data; in an sklearn Pipeline, KMeans acts as
    # a transformer, so the NN sees the k cluster-distance features rather
    # than the raw input features
    grid = {'km__n_clusters': self._clusters,
            'NN__alpha': self._nn_reg,
            'NN__hidden_layer_sizes': self._nn_arch}
    mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True,
                        random_state=self._details.seed)
    km = kmeans(random_state=self._details.seed, n_jobs=self._details.threads)
    pipe = Pipeline([('km', km), ('NN', mlp)], memory=experiments.pipeline_memory)
    gs, _ = self.gs_with_best_estimator(pipe, grid, type='kmeans')  # write the best NN to file
    self.log("KMeans grid search complete")

    tmp = pd.DataFrame(gs.cv_results_)
    # write grid search results --> bank_cluster_kmeans.csv
    tmp.to_csv(self._out.format('{}_cluster_kmeans.csv'.format(self._details.ds_name)))

    grid = {'gmm__n_components': self._clusters,
            'NN__alpha': self._nn_reg,
            'NN__hidden_layer_sizes': self._nn_arch}
    mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True,
                        random_state=self._details.seed)
    gmm = CustomGMM(random_state=self._details.seed)
    pipe = Pipeline([('gmm', gmm), ('NN', mlp)], memory=experiments.pipeline_memory)
    gs, _ = self.gs_with_best_estimator(pipe, grid, type='gmm')  # write the best NN to file
    self.log("GMM search complete")

    tmp = pd.DataFrame(gs.cv_results_)
    # write grid search results --> bank_cluster_GMM.csv
    tmp.to_csv(self._out.format('{}_cluster_GMM.csv'.format(self._details.ds_name)))

    # %% For chart 4/5
    # t-SNE dimensionality reduction of the training data, used for the 2D
    # visualization written below
    self._details.ds.training_x2D = TSNE(
        verbose=10, random_state=self._details.seed).fit_transform(
            self._details.ds.training_x)

    # prepare NN-learnable data: t-SNE'd input features + label
    ds_2d = pd.DataFrame(np.hstack((self._details.ds.training_x2D,
                                    np.atleast_2d(self._details.ds.training_y).T)),
                         columns=['x', 'y', 'target'])
    ds_2d.to_csv(self._out.format('{}_2D.csv'.format(self._details.ds_name)))  # --> bank_2D.csv
    self.log("Done")
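# cluster_acc is imported elsewhere and not shown here. A minimal sketch that
# matches the behaviour described in the comments above (each cluster predicts
# its majority true label, then score the accuracy) could look like this:
from collections import Counter

def cluster_acc(y_true, cluster_labels):
    y_true = np.asarray(y_true)
    y_pred = np.empty_like(y_true)
    for c in np.unique(cluster_labels):
        mask = cluster_labels == c
        # majority true label within this cluster
        y_pred[mask] = Counter(y_true[mask].tolist()).most_common(1)[0][0]
    return np.mean(y_pred == y_true)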