def Get_row_similarity(idx):
    row = mat_contents['Euclid_matrix'][:, idx]
    #row = np.square(row)
    min_covariance = np.min(np.abs(np.diff(row))) / 100
    bic_vals = np.array([])
    prediction_table = {}
    bic_table = {}

    for m in range(2, 11):
        gmm = GMM(n_components=m, covariance_type='diag', init_params='wc',
                  min_covar=min_covariance)
        gmm.fit(row)
        cluster_mean = gmm.means_
        sorted_indices = np.argsort(cluster_mean, axis=None)  # get index of 2 smallest mean value
        bic_rating = round(gmm.bic(row), 2)
        prediction_results = gmm.predict(row)
        if cluster_mean.shape[0] != np.unique(prediction_results).shape[0]:
            continue
        #print 'shape : ' , np.unique(prediction_results).shape[0]
        #if np.unique(prediction_results).shape[0] == 1: continue
        #if np.unique(prediction_results).shape[0] > 1:
        #    print 'Mis-match break : ' , cluster_mean.shape[0], np.unique(prediction_results).shape[0]
        #    break
        prediction_table[m] = [prediction_results, sorted_indices, cluster_mean]
        bic_vals = np.append(bic_vals, bic_rating)
        bic_table[bic_rating] = m

    min_bic_vals = np.min(bic_vals)
    assignment = prediction_table[bic_table[min_bic_vals]][0]
    two_smallest_indices = prediction_table[bic_table[min_bic_vals]][1]
    all_means = prediction_table[bic_table[min_bic_vals]][2]
    unique_assignments = np.unique(assignment)

    cluster_1 = row[assignment == two_smallest_indices[0]]
    cluster_2 = row[assignment == two_smallest_indices[1]]
    if len(two_smallest_indices) > 2:
        cluster_3 = row[assignment == two_smallest_indices[2]]

    # Make sure cluster_1 is not too small
    if len(two_smallest_indices) > 2:
        print str(idx) + ' cluster merged due small cluster 1'
        if len(cluster_1) < 10:
            cluster_1 = np.append(cluster_1, cluster_2)
            cluster_2 = cluster_3

    # Make sure cluster_1 and 2 are not too close
    mean_1 = np.mean(cluster_1)
    mean_2 = np.mean(cluster_2)
    std_1 = np.std(cluster_1)
    #print 'mean 1 : ' , mean_1
    #print 'mean 2 : ' , mean_2
    #print 'top : ' , mean_1 + std_1/100.0
    #print 'bottom : ' , mean_1 - std_1/100.0
    if ((mean_2 < mean_1 + std_1) and (mean_2 > mean_1 - std_1)):
        print str(idx) + ' cluster merged due to mean proximity'
        total_cluster = np.sort(np.append(cluster_1, cluster_2))
        cluster_1 = total_cluster[0:len(total_cluster) - 4]
        cluster_2 = total_cluster[-int(np.floor(len(total_cluster) / 3.0)):len(total_cluster)]

    cluster_1 = np.sort(cluster_1)
    cluster_2 = np.sort(cluster_2)
    cluster_len = len(cluster_1)
    first_half_len = int(np.ceil(cluster_len / 2.0))
    cluster_2_half_len = int(np.floor(len(cluster_2) / 2))
    second_half_len = cluster_len - first_half_len + len(cluster_2)
    first_half = cluster_1[0:first_half_len]
    two_half = cluster_2[0:cluster_2_half_len]
    second_rest = cluster_1[first_half_len:cluster_len]
    X = np.expand_dims(
        np.append(np.append(np.append(cluster_1, two_half), second_rest), cluster_2), 0)
    Y = np.transpose(
        np.append(np.ones(cluster_len + cluster_2_half_len), np.zeros(second_half_len)))
    # print 'Cluster 1 mean : ', np.mean(cluster_1)
    # print 'Cluster 2 mean : ', np.mean(cluster_2)
    # print 'Cluster 1 :', cluster_1
    # print 'Cluster 2 :', cluster_2
    # print int(cluster_len - 10)
    # print 'cluster len : ' , cluster_1.shape
    # print 'first_half_len : ' , first_half_len
    # print second_rest
    # print '2nd rest : ' , second_rest.shape
    # print 'Cluster 2 half: ', two_half
    # print X.shape
    # print X
    # print Y.shape
    # print Y

    logreg = linear_model.LogisticRegression(C=1e5)
    logreg.fit(np.transpose(X), Y)
    prob_matrix = logreg.predict_proba(np.expand_dims(row, 1))
    row_similarity = np.expand_dims(prob_matrix[:, 1], 0)
    #print prob_matrix

    lower_range = np.min(row)
    upper_range = np.max(row)
    increment = (upper_range - lower_range) / 20.0
    logistic_range = np.arange(lower_range, upper_range, increment)
    logistic_range = np.expand_dims(logistic_range, 1)
    sigmoid = logreg.predict_proba(logistic_range)
    sigmoid = sigmoid[:, 1]

    all_means = np.transpose(np.sort(all_means, 0))
    print str(idx) + ' all means : ', all_means

    f, (ax1, ax2) = plt.subplots(2, 1, sharey=False)
    ax1.set_title('Histogram and the likelihood drop')
    histVals = ax1.hist(row, 80)
    max_hist = np.max(histVals[0])
    ax1.plot(logistic_range, max_hist * sigmoid)
    ax1.plot([np.median(row), np.median(row)], [0, max_hist])
    ax1.text(increment, max_hist - 2, all_means)
    #ax2.plot(row_similarity)
    ax2.set_title('BIC model selection')
    ax2.plot(range(2, bic_vals.shape[0] + 2), bic_vals)
    #plt.show()
    plt.savefig('histogram_graphs/output_' + str(idx) + '.png')
    plt.close(f)
    return row_similarity
def fit_new(self, x, label):
    self.y.append(label)
    gmm = GMM(self.gmm_order)
    gmm.fit(x)
    self.gmms.append(gmm)
def trainModelFV_LOOCV_Classifiers(self, extension='*.txt'): """ This method contains the entire module required for training the bag of visual words model Use of helper functions will be extensive. """ print('trainModelFV_LOOCV_Classifiers') names = ["Linear SVM"] classifiers = [SVC(kernel='linear')] self.name_dict, self.number_dict, self.count_class = self.file_helper.getLabelsFromFile( self.label_path) # read file. prepare file lists. self.images, self.trainImageCount = self.file_helper.getFilesFromDirectory( self.base_path, self.datasets, extension) self.parameters += 'Classifier Parameters\n' self.parameters += '%s' % self.classifier_helper.clf features_nd = np.asarray(self.images) #features_nd.sort(axis=0) loo = LeaveOneOut() predictions = {} p = {} l = [] hits = {} for name in names: predictions[name] = [] p[name] = [] hits[name] = 0 c = 0 for train, test in loo.split(features_nd): feature_test_file = str(features_nd[test][0][0]) class_name_test = feature_test_file.split(os.sep)[-2] c += 1 currenInvDate = datetime.datetime.now().strftime( "%d/%m/%Y %H:%M:%S") print('Step: %i/%i - %s - %s' % (c, features_nd.shape[0], currenInvDate, feature_test_file)) # if c == 1 or c % 25 == 0: # self.mail_helper.sendMail("Progress: %s - %s" % (self.test_name, self.OsName), "Samples processed: %i" % c) self.descriptor_list = [] self.train_labels = [] for feature in features_nd[train]: feature = feature[0] label_number = self.number_dict[feature.split(os.sep)[-2]] self.train_labels = np.append(self.train_labels, label_number) des = self.file_helper.loadFeaturesFromFile(feature) self.descriptor_list.append(des) # format data as nd array self.classifier_helper.formatND(self.descriptor_list) gmm = GMM(n_components=self.no_clusters, covariance_type='diag') gmm.fit(self.classifier_helper.descriptor_vstack) fv_dim = self.no_clusters + 2 * self.no_clusters * self.classifier_helper.descriptor_vstack.shape[ 1] print(fv_dim) n_videos = train.shape[0] features = np.array([np.zeros(fv_dim) for i in range(n_videos)]) count = 0 for i in range(n_videos): len_video = len(self.descriptor_list[i]) fv = fisher_vector( self.classifier_helper.descriptor_vstack[count:count + len_video], gmm) features[i] = fv count += len_video print(features.shape) print('Data normalization.') scaler = StandardScaler() # train normalization features = scaler.fit_transform(features) features = power_normalize(features, 0.5) features = L2_normalize(features) # real label l.extend([self.number_dict[feature_test_file.split(os.sep)[-2]]]) # test features feature_test = self.file_helper.loadFeaturesFromFile( feature_test_file) test_fv = fisher_vector(feature_test, gmm) # train normalization test_fv = test_fv.reshape(1, -1) test_fv = scaler.transform(test_fv) test_fv = power_normalize(test_fv, 0.5) test_fv = L2_normalize(test_fv) # train classifiers for name, clf in zip(names, classifiers): print(name) clf.fit(features, self.train_labels) cl = int(clf.predict(test_fv)[0]) class_name_predict = self.name_dict[str(cl)] if class_name_test == class_name_predict: hits[name] += 1 # predicted label p[name].extend([cl]) predictions[name].append({ 'image': feature_test_file, 'class': cl, 'object_name': self.name_dict[str(cl)] }) msg_progress = '' for name in names: msg_progress += 'Classifier: %s - Hits:%i/%i - Accuracy: %.4f\n' % ( name.ljust(20), hits[name], c, hits[name] / c) print(msg_progress) print('\n\n') if c == 1 or c % 25 == 0: self.mail_helper.sendMail( "Progress: %s - %s" % (self.test_name, self.OsName), msg_progress) for name in names: 
print(name) self.saveResults(predictions[name], p[name], l, features_nd.shape[0], classifier_name=name)
def __do_perform(self, custom_out=None, main_experiment=None): if custom_out is not None: # if not os.path.exists(custom_out): # os.makedirs(custom_out) self._old_out = self._out self._out = custom_out elif self._old_out is not None: self._out = self._old_out if main_experiment is not None: self.log("Performing {} as part of {}".format( self.experiment_name(), main_experiment.experiment_name())) else: self.log("Performing {}".format(self.experiment_name())) # Adapted from https://github.com/JonathanTay/CS-7641-assignment-3/blob/master/clustering.py # %% Data for 1-3 sse = defaultdict(list) ll = defaultdict(list) bic = defaultdict(list) sil = defaultdict(lambda: defaultdict(list)) sil_s = np.empty(shape=(2 * len(self._clusters) * self._details.ds.training_x.shape[0], 4), dtype='<U21') acc = defaultdict(lambda: defaultdict(float)) adj_mi = defaultdict(lambda: defaultdict(float)) km = kmeans(random_state=self._details.seed) gmm = GMM(random_state=self._details.seed) st = clock() j = 0 for k in self._clusters: km.set_params(n_clusters=k) gmm.set_params(n_components=k) km.fit(self._details.ds.training_x) gmm.fit(self._details.ds.training_x) km_labels = km.predict(self._details.ds.training_x) gmm_labels = gmm.predict(self._details.ds.training_x) sil[k]['Kmeans'] = sil_score(self._details.ds.training_x, km_labels) sil[k]['GMM'] = sil_score(self._details.ds.training_x, gmm_labels) km_sil_samples = sil_samples(self._details.ds.training_x, km_labels) gmm_sil_samples = sil_samples(self._details.ds.training_x, gmm_labels) # There has got to be a better way to do this, but I can't brain right now for i, x in enumerate(km_sil_samples): sil_s[j] = [k, 'Kmeans', round(x, 6), km_labels[i]] j += 1 for i, x in enumerate(gmm_sil_samples): sil_s[j] = [k, 'GMM', round(x, 6), gmm_labels[i]] j += 1 sse[k] = [km.score(self._details.ds.training_x)] ll[k] = [gmm.score(self._details.ds.training_x)] bic[k] = [gmm.bic(self._details.ds.training_x)] acc[k]['Kmeans'] = cluster_acc(self._details.ds.training_y, km_labels) acc[k]['GMM'] = cluster_acc(self._details.ds.training_y, gmm_labels) adj_mi[k]['Kmeans'] = ami(self._details.ds.training_y, km_labels) adj_mi[k]['GMM'] = ami(self._details.ds.training_y, gmm_labels) self.log("Cluster: {}, time: {}".format(k, clock() - st)) sse = (-pd.DataFrame(sse)).T sse.index.name = 'k' sse.columns = ['{} sse (left)'.format(self._details.ds_readable_name)] ll = pd.DataFrame(ll).T ll.index.name = 'k' ll.columns = [ '{} log-likelihood'.format(self._details.ds_readable_name) ] bic = pd.DataFrame(bic).T bic.index.name = 'k' bic.columns = ['{} BIC'.format(self._details.ds_readable_name)] sil = pd.DataFrame(sil).T sil_s = pd.DataFrame(sil_s, columns=['k', 'type', 'score', 'label']).set_index('k') #.T # sil_s = sil_s.T acc = pd.DataFrame(acc).T adj_mi = pd.DataFrame(adj_mi).T sil.index.name = 'k' sil_s.index.name = 'k' acc.index.name = 'k' adj_mi.index.name = 'k' sse.to_csv(self._out.format('{}_sse.csv'.format( self._details.ds_name))) ll.to_csv( self._out.format('{}_logliklihood.csv'.format( self._details.ds_name))) bic.to_csv(self._out.format('{}_bic.csv'.format( self._details.ds_name))) sil.to_csv( self._out.format('{}_sil_score.csv'.format(self._details.ds_name))) sil_s.to_csv( self._out.format('{}_sil_samples.csv'.format( self._details.ds_name))) acc.to_csv(self._out.format('{}_acc.csv'.format( self._details.ds_name))) adj_mi.to_csv( self._out.format('{}_adj_mi.csv'.format(self._details.ds_name))) # %% NN fit data (2,3) grid = { 'km__n_clusters': self._clusters, 'NN__alpha': self._nn_reg, 
'NN__hidden_layer_sizes': self._nn_arch } mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=self._details.seed) km = kmeans(random_state=self._details.seed, n_jobs=self._details.threads) pipe = Pipeline([('km', km), ('NN', mlp)], memory=experiments.pipeline_memory) gs, _ = self.gs_with_best_estimator(pipe, grid, type='kmeans') self.log("KMmeans Grid search complete") tmp = pd.DataFrame(gs.cv_results_) tmp.to_csv( self._out.format('{}_cluster_kmeans.csv'.format( self._details.ds_name))) grid = { 'gmm__n_components': self._clusters, 'NN__alpha': self._nn_reg, 'NN__hidden_layer_sizes': self._nn_arch } mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=self._details.seed) gmm = CustomGMM(random_state=self._details.seed) pipe = Pipeline([('gmm', gmm), ('NN', mlp)], memory=experiments.pipeline_memory) gs, _ = self.gs_with_best_estimator(pipe, grid, type='gmm') self.log("GMM search complete") tmp = pd.DataFrame(gs.cv_results_) tmp.to_csv( self._out.format('{}_cluster_GMM.csv'.format( self._details.ds_name))) # %% For chart 4/5 self._details.ds.training_x2D = TSNE( verbose=10, random_state=self._details.seed).fit_transform( self._details.ds.training_x) ds_2d = pd.DataFrame(np.hstack( (self._details.ds.training_x2D, np.atleast_2d(self._details.ds.training_y).T)), columns=['x', 'y', 'target']) ds_2d.to_csv( self._out.format('{}_2D.csv'.format(self._details.ds_name))) self.log("Done")
def gmm(nclusters, coords, n_init=50, n_iter=500):
    est = GMM(n_components=nclusters, n_init=n_init, n_iter=n_iter)
    est.fit(coords)
    return Partition(est.predict(coords))
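
# A minimal sketch (not part of the original code) of the same helper against the
# current scikit-learn API: sklearn.mixture.GMM was deprecated in 0.18 and removed
# in 0.20 in favour of GaussianMixture, which renames n_iter to max_iter. The
# Partition wrapper is assumed to be the one used above; gmm_modern is an
# illustrative name, not from the original.
from sklearn.mixture import GaussianMixture


def gmm_modern(nclusters, coords, n_init=50, max_iter=500):
    # fit a mixture and return the hard cluster assignment, as gmm() does above
    est = GaussianMixture(n_components=nclusters, n_init=n_init, max_iter=max_iter)
    est.fit(coords)
    return Partition(est.predict(coords))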
# CWLM MODEL TRAINING
K = 3
cwlm = CWLM(n_components=K, eta=7, plot=True, n_init=10, tol=1e-10,
            init_params='gmm', smoothing=False)
cwlm.fit(X, y)

mu_est = cwlm.means_
mu_ext = np.concatenate((np.ones((K, 1)), mu_est), axis=1)
W_clust = cwlm.reg_weights_

y_, score = cwlm.predict_score(X_tst, y_tst)
print('\nR2 score = {}'.format(score))

# GMM TRAINING
gmm = GMM(n_components=K)
gmm.fit(X)

# Depict the likelihoods on Y for the clusterwise linear model
rx = np.ones((100, 2))
r = np.linspace(X.min(), X.max(), 100)
rx[:, 1] = r
c = ['r', 'g', 'b']

fig = plt.figure(1)
y_est = []

# Depict all clustering approaches for the clusterwise linear model
fig = plt.figure(2)
print "Raw ", rf.feature_importances_ print "Normed ", rf_norm.feature_importances_ print "Extra ", rf_extra.feature_importances_ """ #----------------------------------------------------------------------- ############################### ### Gaussian Mixture Models ### ############################### #Initialize index at which we want to split training and cross-validation data sets i_divide = 10000 #Initialize GMM object gmm_2 = GMM(n_components=2, covariance_type='full') gmm_3 = GMM(n_components=4, covariance_type='full') #Fit the data gmm_2.fit(X_norm[:i_divide]) gmm_3.fit(X_norm[:i_divide]) #Print means of the fit print headers[headers != 'BSN'] print "2 models ", gmm_2.means_ print "3 models ", gmm_3.means_ #Print BIC and score print "2 models ", gmm_2.bic(X_norm[i_divide:]) print "3 models ", gmm_3.bic(X_norm[i_divide:])
# patch: <class 'numpy.ndarray'>; (16020,); need to reshape it to (16020, 1)
# Create ndarray of the data from above lists
patch = np.stack(patch, axis=0).reshape(-1, 1)
x = np.stack(x, axis=0).reshape(-1, 1)
y = np.stack(y, axis=0).reshape(-1, 1)
d = np.stack(d, axis=0).reshape(-1, 1)
sad = np.stack(sad, axis=0).reshape(-1, 1)
blm = np.stack(blm, axis=0).reshape(-1, 1)
sbm = np.stack(sbm, axis=0).reshape(-1, 1)
gcs = np.stack(gcs, axis=0).reshape(-1, 1)

gmm = GMM(
    n_components=3,
    max_iter=100000,
    tol=1e-10,
    covariance_type='full',
    random_state=50,
).fit(patch)

# label: <class 'numpy.ndarray'>; (16020,); need to reshape it to (16020, 1)
label = gmm.predict(patch.reshape(-1, 1))
label = label.reshape(-1, 1)

pat_lab = np.column_stack((x, y, d, patch, label, sad, blm, sbm, gcs))
print(type(pat_lab))
print(pat_lab.shape)
np.save('Patch9_Lab', pat_lab)
np.savetxt('Patch_Labs.csv', pat_lab, delimiter=',')
def Fit(self, Mask):
    optionDict = self.opts.__dict__
    self.toFit = GMM(**optionDict)
    return self.toFit.fit(self.MaskToMatrix(Mask))
    res = np.array(res)
    res = res[idxs]
    images, residuals = get_images(res)
    values = latents
elif clusteron == "embed":
    values = np.array([embed[0], embed[1]]).T
    #idxs = embed[3]
    #idxs = [int(idx) for idx in idxs]
    #reals = reals[idxs]
    #residuals = residuals[idx]
    #images, residuals = get_images(res)

print(values.shape)
#n_components = 2
print("Making GMM")
gmm = GMM(n_components=n_components).fit(values)
print("Predicting")
labels = gmm.predict(values)
#plt.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis')
plt.scatter(embed[0], embed[1], c=labels, s=10, cmap='viridis')
plt.savefig(plot_fn)


def make_batch(arr, save_fn, n=128):
    arr = np.array(arr)
    ims = arr[:n]
    ims = ims.reshape((-1, 96, 96, NBANDS))  # reshape returns a new array, so keep the result
    saver.save_images(ims, save_fn)


print("writing clusters")
covar_type = 'full'  # you can try out 'diag' as well
reps = 15            # number of fits with different initializations; the best result is kept

# Allocate variables
BIC = np.zeros((T, 1))
AIC = np.zeros((T, 1))
CVE = np.zeros((T, 1))

# K-fold cross-validation
CV = cross_validation.KFold(N, 10, shuffle=True)

for t, K in enumerate(KRange):
    print('Fitting model for K={0}\n'.format(K))

    # Fit Gaussian mixture model
    gmm = GMM(n_components=K, covariance_type=covar_type,
              n_init=reps, params='wmc').fit(X)

    # Get BIC and AIC
    BIC[t, 0] = gmm.bic(X)
    AIC[t, 0] = gmm.aic(X)

    cds = gmm.means_  # extract cluster centroids (means of the Gaussians)
    print(cds)
    cls = gmm.predict(X)  # extract cluster labels
    print(cls)

    # For each cross-validation fold
    for train_index, test_index in CV:
        # extract training and test set for the current CV fold
        X_train = X[train_index]
        X_test = X[test_index]

        # Fit Gaussian mixture model to X_train
        gmm = GMM(n_components=K,
# plt.ylabel('WCSS')
# plt.rc('xtick', labelsize=10)
# plt.rc('ytick', labelsize=10)
# plt.savefig(os.getcwd() + '/plots/' + str(n_latent)+"d_"+ 'k-means_WCSS_clusters' +'.png')
# plt.close()

# Running and plotting k-means clustering with different cluster numbers
kmeans_plot(par[:, 1:], X, 2)
kmeans_plot(par[:, 1:], X, 3)
kmeans_plot(par[:, 1:], X, 4)

# GMM cluster - similar method as k-means for clustering, assigning labels,
# ordering labels and saving to lists.
n_components = np.arange(1, 11, 1)
cov_type = 'diag'
models = [
    GMM(n, covariance_type=cov_type, random_state=0).fit(X)
    for n in n_components
]
aic_list.append([m.aic(X) for m in models])
bic_list.append([m.bic(X) for m in models])

# fig = plt.figure(161, figsize=(12,10))
# plt.plot(n_components, [m.bic(X) for m in models], label='BIC')
# plt.plot(n_components, [m.aic(X) for m in models], label='AIC')
# plt.legend(loc='best')
# plt.xlabel('Number of Components')
# plt.rc('xtick', labelsize=10)
# plt.rc('ytick', labelsize=10)
# fig.suptitle('GMM BIC/AIC values', fontsize=16)
# plt.savefig(os.getcwd() + '/plots/' + str(n_latent)+"d_"+'gmm_bic-aic_'+cov_type+'.png')
# plt.close()
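
# A possible follow-up (sketch, not in the original code): select the component
# count with the lowest BIC from the models fitted above and take its hard
# assignments. n_components, models and X are reused from the snippet; the
# variable names introduced here are illustrative.
best_idx = int(np.argmin([m.bic(X) for m in models]))
best_gmm = models[best_idx]
best_k = n_components[best_idx]        # number of mixture components chosen by BIC
cluster_labels = best_gmm.predict(X)   # hard cluster assignments from the selected model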
# (25%) sets.
skf = StratifiedKFold(iris.target, n_folds=4)
# Only take the first fold.
train_index, test_index = next(iter(skf))

X_train = iris.data[train_index]
y_train = iris.target[train_index]
X_test = iris.data[test_index]
y_test = iris.target[test_index]

n_classes = len(np.unique(y_train))

# Try GMMs using different types of covariances.
classifiers = dict((covar_type,
                    GMM(n_components=n_classes,
                        covariance_type=covar_type,
                        init_params='wc',
                        n_iter=20))
                   for covar_type in ['spherical', 'diag', 'tied', 'full'])

n_classifiers = len(classifiers)

plt.figure(figsize=(3 * n_classifiers / 2, 6))
plt.subplots_adjust(bottom=.01, top=0.95, hspace=.15, wspace=.05,
                    left=.01, right=.99)

for index, (name, classifier) in enumerate(classifiers.items()):
    # Since we have class labels for the training data, we can
rng = np.random.RandomState(13)
X_stretched = np.dot(X, rng.randn(2, 2))
kmeans = KMeans(n_clusters=4, random_state=0)
plot_kmeans(kmeans, X_stretched)
plt.show()

print("#---------------------------------------#")
print(" Generalizing E-M: Gaussian mixture models")
print(" expectation-maximization approach ")
print("#---------------------------------------#")
print("\n")

from sklearn.mixture import GMM

gmm = GMM(n_components=4).fit(X)
labels = gmm.predict(X)
plt.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap="viridis")
plt.show()

probs = gmm.predict_proba(X)
print(probs[:5].round(3))

from matplotlib.patches import Ellipse


def draw_ellipse(position, covariance, ax=None, **kwargs):
    '''Draw an ellipse with a given position and covariance'''
    ax = ax or plt.gca()
def new_component_weighted_EM(comp_list, X, W, num_iter=5, debug_ax=None, clamp_existing_params=True): """ Given a comp_list, add a new component, fit with weighted em """ if True: # first component LLs go unchanged lnpdf_C, sample_C, lnpdf_Cplus, init_params = \ components.make_new_component_mixture_lnpdf(comp_list) ll_q0 = lnpdf_C(X) D = X.shape[1] # resample X's zs = np.random.choice(len(W), size=len(W), p=W/len(W)) X = X[zs,:] # initialize the new component --- this mean may be too conservative init_mu = np.mean(X, 0) #np.sum(X*W[:,None], 0) / np.sum(W) init_var = np.var(X, 0) # #init_var = np.sum((X-init_mu)**2 * W[:,None], 0) / np.sum(W) ##init_mu = sample_C(1000).mean(0) #init_var = np.var(sample_C(1000), 0) init_lnstd = .5*np.log(init_var) lam = np.concatenate([init_mu, init_lnstd]) #lam = comp_list[-1][1] #lam[:D] = X[np.argmax(W)] rho = .1 # em iterations for i in xrange(num_iter): if debug_ax is not None: ii, jj = DEBUG_I, DEBUG_J debug_ax.scatter(lam[:D][ii], lam[:D][jj], s=100, c='red') debug_ax.text(lam[:D][ii], lam[:D][jj], "iter %d"%i, fontsize=14) print "iter %d rho = %2.5f"%(i, rho) # compute lngamma_old = ln P(Z=0 | X) and lngamma_new ll_new = misc.mvn_diag_logpdf(X, lam[:D], lam[D:]) lnjoint = np.column_stack([ ll_q0 + np.log(1.-rho), ll_new + np.log(rho) ]) lngammas = lnjoint - scpm.logsumexp(lnjoint, 1, keepdims=True) ll_marg = scpm.logsumexp(lnjoint, 1).mean() print "iter %d ll = %2.5f"%(i, ll_marg) #print " rho = ", rho, lngammas # weighted M step ws = np.exp(lngammas) #* W[:,None] rho_new = np.sum(ws, 0) + 1. rho = (rho_new / np.sum(rho_new))[1] #print ws new_mu = np.sum(X * ws[:,1,None], 0) / np.sum(ws[:,1,None]) new_var = np.sum((X-new_mu)**2 * ws[:,1,None], 0) / np.sum(ws[:,1,None]) lam = np.concatenate([new_mu, np.log(np.sqrt(new_var))]) return rho, lam # initialize model object num_comp = len(comp_list) + 1 mod = GMM(n_components=num_comp, covariance_type='diag', n_iter=1) mod.fit(X) # initialize all of the means, covs, pis existing_means, covars, _, _, _, pis = \ components.comp_list_to_matrices(comp_list) existing_covars = np.array([np.diag(c) for c in covars]) # initialize model mod.weights_[:-1] = pis * .5 mod.weights_[-1] = .5 # set the first mean to be the location of the highest weighted w... # set covar to be marginal covariance for the whole thing mod.means_[-1,:] = X[W.argmax(), :] init_c = mog.mog_covariance(existing_means, covars, pis) mod.covars_[-1,:] = np.diag(init_c) def clamp_params(mod): mod.means_[:-1,:] = existing_means mod.covars_[:-1,:] = existing_covars #mod.covars_[-1,:] = np.diag(init_c) #todo experiment with clamping covariance! clamp_params(mod) # run importance weighted EM prev_ll = -np.inf for i in xrange(num_iter): if debug_ax is not None: ii, jj = DEBUG_I, DEBUG_J for mi in xrange(mod.means_.shape[0]): debug_ax.scatter(mod.means_[mi,ii], mod.means_[mi,jj], s=100, c='red') debug_ax.text(mod.means_[mi,ii], mod.means_[mi,jj], "iter %d"%i, fontsize=14) log_likelihoods, responsibilities = mod.score_samples(X) current_log_likelihood = log_likelihoods.mean() weighted_responsibilities = responsibilities * W[:,None] mod._do_mstep(X, weighted_responsibilities, mod.params, mod.min_covar) if clamp_existing_params: clamp_params(mod) if (current_log_likelihood - prev_ll) < (1e-10*np.abs(prev_ll)): print " current ll increase too small (after %d iters) "%i break print "ll = %2.4f"%current_log_likelihood prev_ll = current_log_likelihood return mod
def init_gmm(self, nb_gaussians):
    return GMM(n_components=nb_gaussians,
               covariance_type='full',
               random_state=self.seed,
               warm_start=self.warm_start,
               n_init=self.nb_em_init)
print(train.shape)
print('Fitting PCA')
pca = PCA(n_components=6, random_state=0).fit(train)
train_pca = pca.transform(train)
print('Fitted with explained variance of ', pca.explained_variance_ratio_.sum())
pickle.dump(pca, open(pcasave, 'wb'))
print(train_pca.shape)

train_time_fft = np.concatenate((train_pca, time_features), axis=1)
print(train_time_fft.shape)

print('Fitting GMM')
gmm = GMM(n_components=int(args.n_clusters), random_state=0).fit(train_time_fft)
# kmeans = KMeans(n_clusters=int(args.n_clusters), random_state=0).fit(train_time_fft)
labels = gmm.predict(train_time_fft)
db_score = sklearn.metrics.davies_bouldin_score(train_time_fft, labels)
print("Converged: ", str(gmm.converged_))
print('Fitted with DB score: ', db_score)
pickle.dump(gmm, open(gmmsave, 'wb'))

labels = np.reshape(labels, (fft_features.shape[0], fft_features.shape[1]))
unique_elements, counts_elements = np.unique(labels, return_counts=True)
print(labels.shape)
print("Unique elements: ", unique_elements)
print("Frequency: ", counts_elements)


def list2string(list):
def main(dataset_names=None, estimator_type="gmm", mc_iterations=20, n_folds=5, n_ensemble=100, seed_num=42): if dataset_names is None: # All the datasets used in Li2014 datasets_li2014 = [ 'abalone', 'balance-scale', 'credit-approval', 'dermatology', 'ecoli', 'german', 'heart-statlog', 'hepatitis', 'horse', 'ionosphere', 'lung-cancer', 'libras-movement', 'mushroom', 'diabetes', 'landsat-satellite', 'segment', 'spambase', 'wdbc', 'wpbc', 'yeast' ] datasets_hempstalk2008 = [ 'diabetes', 'ecoli', 'glass', 'heart-statlog', 'ionosphere', 'iris', 'letter', 'mfeat-karhunen', 'mfeat-morphological', 'mfeat-zernike', 'optdigits', 'pendigits', 'sonar', 'vehicle', 'waveform-5000' ] datasets_others = [ 'diabetes', 'ecoli', 'glass', 'heart-statlog', 'ionosphere', 'iris', 'letter', 'mfeat-karhunen', 'mfeat-morphological', 'mfeat-zernike', 'optdigits', 'pendigits', 'sonar', 'vehicle', 'waveform-5000', 'scene-classification', 'tic-tac', 'autos', 'car', 'cleveland', 'dermatology', 'flare', 'page-blocks', 'segment', 'shuttle', 'vowel', 'zoo', 'abalone', 'balance-scale', 'credit-approval', 'german', 'hepatitis', 'lung-cancer' ] # Datasets that we can add but need to be reduced datasets_to_add = ['MNIST'] dataset_names = list( set(datasets_li2014 + datasets_hempstalk2008 + datasets_others)) # Diary to save the partial and final results diary = Diary(name='results_Li2014', path='results', overwrite=False, fig_format='svg') # Hyperparameters for this experiment (folds, iterations, seed) diary.add_notebook('parameters', verbose=True) # Summary for each dataset diary.add_notebook('datasets', verbose=False) # Partial results for validation diary.add_notebook('validation', verbose=True) # Final results diary.add_notebook('summary', verbose=True) columns = ['dataset', 'method', 'mc', 'test_fold', 'acc', 'logloss'] df = MyDataFrame(columns=columns) diary.add_entry('parameters', [ 'seed', seed_num, 'mc_it', mc_iterations, 'n_folds', n_folds, 'n_ensemble', n_ensemble, 'estimator_type', estimator_type ]) data = Data(dataset_names=dataset_names) for name, dataset in data.datasets.iteritems(): if name in ['letter', 'shuttle']: dataset.reduce_number_instances(0.1) export_datasets_description_to_latex(data, path=diary.path) for i, (name, dataset) in enumerate(data.datasets.iteritems()): np.random.seed(seed_num) dataset.print_summary() diary.add_entry('datasets', [dataset.__str__()]) for mc in np.arange(mc_iterations): skf = StratifiedKFold(dataset.target, n_folds=n_folds, shuffle=True) test_folds = skf.test_folds for test_fold in np.arange(n_folds): x_train, y_train, x_test, y_test = separate_sets( dataset.data, dataset.target, test_fold, test_folds) # Binary discriminative classifier sv = SVC(kernel='linear', probability=True) # Density estimator for the background check if estimator_type == "svm": gamma = 1.0 / x_train.shape[1] est = OneClassSVM(nu=0.1, gamma=gamma) elif estimator_type == "gmm": est = GMM(n_components=1) elif estimator_type == "gmm3": est = GMM(n_components=3) elif estimator_type == "mymvn": est = MyMultivariateNormal() # Multiclass discriminative model with one-vs-one binary class. 
ovo = OvoClassifier(base_classifier=sv) classifier = ConfidentClassifier(classifier=ovo, estimator=est, mu=0.5, m=0.5) ensemble = Ensemble(base_classifier=classifier, n_ensemble=n_ensemble) # classifier = ConfidentClassifier(classifier=sv, # estimator=est, mu=0.5, # m=0.5) # ovo = OvoClassifier(base_classifier=classifier) # ensemble = Ensemble(base_classifier=ovo, # n_ensemble=n_ensemble) xs_bootstrap, ys_bootstrap = ensemble.fit(x_train, y_train) accuracy = ensemble.accuracy(x_test, y_test) log_loss = ensemble.log_loss(x_test, y_test) diary.add_entry('validation', [ 'dataset', name, 'method', 'our', 'mc', mc, 'test_fold', test_fold, 'acc', accuracy, 'logloss', log_loss ]) df = df.append_rows( [[name, 'our', mc, test_fold, accuracy, log_loss]]) # Li2014: EP-CC model # The classification confidence is used in learning the weights # of the base classifier as well as in weighted voting. ensemble_li = Ensemble(n_ensemble=n_ensemble, lambd=1e-8) ensemble_li.fit(x_train, y_train, xs=xs_bootstrap, ys=ys_bootstrap) accuracy_li = ensemble_li.accuracy(x_test, y_test) log_loss_li = ensemble_li.log_loss(x_test, y_test) diary.add_entry('validation', [ 'dataset', name, 'method', 'Li2014', 'mc', mc, 'test_fold', test_fold, 'acc', accuracy_li, 'logloss', log_loss_li ]) df = df.append_rows( [[name, 'Li2014', mc, test_fold, accuracy_li, log_loss_li]]) export_summary(df, diary)
pop_all = pd.read_csv("/neurospin/brainomics/2016_schizConnect/analysis/NUSDAST/VBM/population.csv")

k_range = [1, 2, 3, 5, 8]

#mod = KMeans(n_clusters=3)
#mod = AgglomerativeClustering(n_clusters=4)
#mod.fit(U_all_scz[:,k_range])
#labels_all_scz = mod.labels_
mod = GMM(n_components=3)
labels_all_scz = mod.fit_predict(U_all_scz[:, k_range])

df = pd.DataFrame()
df["labels"] = labels_all_scz
df["age"] = pop_all["age"].values[y_all == 1]
df["sex"] = pop_all["sex_num"].values[y_all == 1]
for i in k_range:
    df["U%s" % i] = U_all_scz[:, i]

LABELS_DICT = {0: "cluster 1", 1: "cluster 2", 2: "cluster 3"}
df["labels_name"] = df["labels"].map(LABELS_DICT)
    features = mfcc.mfcc(audio, sr, 0.025, 0.02, 13, appendEnergy=False)
    # python_speech_features.base.mfcc(signal, samplerate=16000, winlen=0.025, winstep=0.01,
    #     numcep=13, nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97,
    #     ceplifter=22, appendEnergy=True, winfunc=<function <lambda>>)
    print(features.shape)
    features = preprocessing.scale(features)
    print(features)
    return features


source = "Dataset/Sad"
dest = "Model/"
files = [
    os.path.join(source, f) for f in os.listdir(source) if f.endswith('.wav')
]

features = np.asarray(())
for f in files:
    sr, audio = read(f)
    vector = get_MFCC(sr, audio)
    if features.size == 0:
        features = vector
    else:
        features = np.vstack((features, vector))

gmm = GMM(n_components=8, max_iter=200, covariance_type='diag', n_init=3)
gmm.fit(features)

picklefile = f.split("/")[-2].split(".wav")[0] + ".gmm"  # model saved as male.gmm
cPickle.dump(gmm, open(dest + picklefile, 'w'))
print 'modeling completed for emotion:', picklefile
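
# Sketch (not part of the original script): at prediction time, per-emotion models
# trained as above are usually compared by log-likelihood on a new clip. dest,
# read, get_MFCC and np are reused from the surrounding code; the test file name
# and variable names introduced here are hypothetical.
model_files = [os.path.join(dest, fn) for fn in os.listdir(dest) if fn.endswith('.gmm')]
models = [cPickle.load(open(mf, 'r')) for mf in model_files]
sr_t, audio_t = read('Dataset/test_clip.wav')  # hypothetical test clip
test_features = get_MFCC(sr_t, audio_t)
scores = [np.sum(m.score(test_features)) for m in models]
print 'predicted emotion model:', model_files[int(np.argmax(scores))]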
#print(videoPath)
#fourcc = cv2.VideoWriter_fourcc('X','V','I','D')
#video = cv2.VideoWriter(videoPath, fourcc,20,(width,height))
#
#for i,t in enumerate(movieTimes):
#    imgPath = os.path.join("./img/" + str(i) + ".png")
#    img = cv2.imread(imgPath)
#    img = cv2.cvtColor(img,cv2.COLOR_RGB2BGR)
#    video.write(img)
#print('done')
#cv2.destroyAllWindows()
#video.release()

# Gaussian Mixture Model
# fit GMM
gmm = GMM(n_components=2).fit(CC_scaled)
cov = gmm.covariances_
prob_distr = gmm.predict_proba(CC_scaled)

# determine to which of the two gaussians each data point belongs
# by looking at the probability distribution
if gmm.weights_[0] < gmm.weights_[1]:
    gauss1_idx = [
        i for i in range(len(prob_distr))
        if prob_distr[i][0] >= prob_distr[i][1]
    ]
    gauss2_idx = [
        j for j in range(len(prob_distr))
        if prob_distr[j][1] >= prob_distr[j][0]
    ]
else:
    gauss1_idx = [
from sklearn.mixture import GMM
import numpy as np
import pandas as pd
from sklearn import metrics

balance_data = pd.read_csv('seeds.csv', sep=',', header=None)
X = balance_data.values[:, 0:6]
Y = balance_data.values[:, 7]

estimator = GMM(n_components=3)
estimator.fit(X)
Y_pred = estimator.predict(X)

# adjusted_rand_score compares the EM clustering against the true labels
print("Adjusted Rand score:", metrics.adjusted_rand_score(Y, Y_pred))
np.savetxt('em.csv', Y_pred)
import csv
import codecs
from sklearn.mixture import GMM

# This file draws the distribution curve
y_true = list()
y_predicted = list()
with codecs.open("./codelabel_result/bi2vec_spring_8.csv", "r") as f_csv:
    reader = csv.reader(f_csv)
    for i, row in enumerate(reader):
        if i == 0:
            continue
        if int(row[2]) == 1 or int(row[2]) == -1:
            y_true.append(int(row[2]))
            predicted = -1
            print row[5]
            if float(row[5]) > 0.3:
                predicted = 1
            y_predicted.append(predicted)

# print len(y_true)
# np_y_true = np.array(y_true)
# print np_y_true
# np_y_predicted = np.array(y_predicted)
# print np_y_predicted
# print precision_score(np_y_true,np_y_predicted,average="weighted")
# print recall_score(np_y_true,np_y_predicted,average="weighted")

GMM = GMM(n_components=2, init_params="wc", n_iter=20)  # note: rebinds the name GMM to a model instance
# computes accuracy given the predictions and real labels
def accuracy(predictions, labels):
    batch_size = predictions.shape[0]
    sum = np.sum(predictions == labels)
    acc = (100.0 * sum) / batch_size
    return acc


n_classes = 10  # 10 genre classes

# Try GMMs using different types of covariances. Only 'full' is used here as it
# performs better, but other types can be added to the list to try them.
classifiers = dict((covar_type,
                    GMM(n_components=n_classes,
                        covariance_type=covar_type,
                        init_params='wc',
                        n_iter=5))
                   for covar_type in ['full'])

print("Training GMM")
for index, (name, classifier) in enumerate(classifiers.items()):
    # Since we have class labels for the training data, we can
    # initialize the GMM parameters in a supervised manner.
    classifier.means_ = np.array(
        [train_data[train_labels == i].mean(axis=0) for i in range(n_classes)])

    # Train the other parameters using the EM algorithm.
    classifier.fit(train_data)

    # getting predictions on the training set
    train_predictions = classifier.predict(train_data)
def wemd_from_pred_samples(y_pred):
    gmm = GMM(covariance_type="diag")
    gmm = gmm.fit(y_pred)
    y_s, _ = gmm.sample(len(y_pred))
    return wemd_from_samples(y_s, y_pred)
def GMM_func(X_train, Y_train, X_test, Y_test, n_classes,
             show_results=False, fplt=False, colors='rgbym',
             select_classifier=2):
    temp1 = X_train[:, 0]
    temp2 = X_train[:, 1]
    temp1 = np.reshape(temp1, (X_train.shape[0], 1))
    temp2 = np.reshape(temp2, (X_train.shape[0], 1))
    mean1 = np.array([temp1[Y_train == i].mean() for i in range(n_classes)])
    mean2 = np.array([temp2[Y_train == i].mean() for i in range(n_classes)])
    #print mean1
    #print mean2
    mean_vector = np.zeros((n_classes, 2))
    mean_vector[:, 0] = mean1
    mean_vector[:, 1] = mean2
    #print mean_vector

    # Try GMMs using different types of classifiers.
    # Since we have class labels for the training data, we can
    # initialize the GMM parameters in a supervised manner.
    if select_classifier == 1:
        classifier1 = GMM(n_components=n_classes,
                          covariance_type='full',
                          init_params='wc',
                          n_iter=200)
        classifier1.means_ = mean_vector
        classifier1.fit(X_train)
        if fplt:
            w_factor = 0.5 / classifier1.weights_.max()
            for pos, covar, w, color in zip(classifier1.means_,
                                            classifier1.covars_,
                                            classifier1.weights_, colors):
                draw_ellipse(pos, covar, alpha=w * w_factor, clr=color)
        Y_pred = classifier1.predict(X_test)

    if select_classifier == 2:
        classifier2 = GaussianMixture(n_components=n_classes,
                                      means_init=mean_vector,
                                      covariance_type='full',
                                      max_iter=5000)
        classifier2.fit(X_train)
        if fplt:
            w_factor = 0.8 / classifier2.weights_.max()
            for pos, covar, w, color in zip(classifier2.means_,
                                            classifier2.covariances_,
                                            classifier2.weights_, colors):
                draw_ellipse(pos, covar, alpha=w * w_factor, clr=color)
        Y_pred = classifier2.predict(X_test)

    Y_pred = np.reshape(Y_pred, (Y_test.shape[0], 1))
    Y_pred[Y_pred != 0] = 1
    Y_test[Y_test != 0] = 1
    #print Y_pred
    #print Y_test

    confusion = confusion_matrix(Y_test, Y_pred)
    eps = 1e-9
    # print confusion
    accuracy = 0
    if float(np.sum(confusion)) != 0:
        accuracy = float(confusion[0, 0] + confusion[1, 1]) / float(np.sum(confusion))
    specificity = float(confusion[0, 0]) / float(confusion[0, 0] + confusion[0, 1] + eps)
    sensitivity = float(confusion[1, 1]) / float(confusion[1, 1] + confusion[1, 0] + eps)
    precision = float(confusion[1, 1]) / float(confusion[1, 1] + confusion[0, 1] + eps)

    if show_results:
        print("\nGlobal Accuracy: " + str(accuracy))
        print("Specificity: " + str(specificity))
        print("Sensitivity: " + str(sensitivity))
        print("Precision: " + str(precision))

    return accuracy
def trainModelFV_LOOCV_Fusion(self, extension='*.*'): """ This method contains the entire module required for training the Bag of Poses model Use of helper functions will be extensive. """ self.name_dict, self.number_dict, self.count_class = self.file_helper.getLabelsFromFile( self.label_path) # read file. prepare file lists. self.files1, self.trainFilesCount1 = self.file_helper.getFilesFromDirectory( self.base_path, self.datasets, extension) self.files2, self.trainFilesCount2 = self.file_helper.getFilesFromDirectory( self.base_path2, self.datasets, extension) save = True self.parameters += 'Classifier Parameters\n' self.parameters += '%s' % self.classifier_helper.clf features_nd1 = np.asarray(self.files1) features_nd2 = np.asarray(self.files2) features_nd1.sort(axis=0) features_nd2.sort(axis=0) # build GMMs self.descriptor_list1 = [] self.descriptor_list2 = [] for f in features_nd1: feature = f[0] des1 = self.file_helper.loadFeaturesFromFile(feature) self.descriptor_list1.append(des1) for f in features_nd2: feature = f[0] des2 = self.file_helper.loadFeaturesFromFile(feature) self.descriptor_list2.append(des2) ft1 = self.classifier_helper.formatND(self.descriptor_list1) ft2 = self.classifier_helper.formatND(self.descriptor_list2) gmm1 = GMM(n_components=self.no_clusters, covariance_type='diag', verbose=0) gmm1.fit(ft1) gmm2 = GMM(n_components=self.no_clusters, covariance_type='diag', verbose=0) gmm2.fit(ft2) # Train Classifier loo = LeaveOneOut() predictions = [] pre = [] lab = [] hits = 0 c = 0 for train, test in loo.split(features_nd1): feature_test_file1 = str(features_nd1[test][0][0]) feature_test_file2 = str(features_nd2[test][0][0]) class_name_test = feature_test_file1.split(os.sep)[-2] c += 1 currenInvDate = datetime.datetime.now().strftime( "%d/%m/%Y %H:%M:%S") print('Step: %i/%i - %s\n%s\n%s' % (c, features_nd1.shape[0], currenInvDate, feature_test_file1, feature_test_file2)) if c == 1 or c % 25 == 0: self.mail_helper.sendMail( "Progress: %s - %s" % (self.test_name, self.OsName), "Samples processed: %i" % c) self.descriptor_list1 = [] self.descriptor_list2 = [] self.train_labels = [] for feature in features_nd1[train]: feature = feature[0] label_number = self.number_dict[feature.split(os.sep)[-2]] self.train_labels = np.append(self.train_labels, label_number) des1 = self.file_helper.loadFeaturesFromFile(feature) self.descriptor_list1.append(des1) for feature in features_nd2[train]: feature = feature[0] des2 = self.file_helper.loadFeaturesFromFile(feature) self.descriptor_list2.append(des2) # format data as nd array ft1 = self.classifier_helper.formatND(self.descriptor_list1) ft2 = self.classifier_helper.formatND(self.descriptor_list2) fv_dim1 = self.no_clusters + 2 * self.no_clusters * ft1.shape[1] fv_dim2 = self.no_clusters + 2 * self.no_clusters * ft2.shape[1] print(fv_dim1, fv_dim2) n_videos = train.shape[0] features1 = np.array([np.zeros(fv_dim1) for i in range(n_videos)]) features2 = np.array([np.zeros(fv_dim2) for i in range(n_videos)]) count1 = 0 count2 = 0 for i in range(n_videos): len_video1 = len(self.descriptor_list1[i]) fv1 = fisher_vector(ft1[count1:count1 + len_video1], gmm1) features1[i] = fv1 count1 += len_video1 len_video2 = len(self.descriptor_list2[i]) fv2 = fisher_vector(ft2[count2:count2 + len_video2], gmm2) features2[i] = fv2 count2 += len_video2 print(features1.shape) print('Data normalization. 
1') scaler1 = StandardScaler() # train normalization features1 = scaler1.fit_transform(features1) features1 = power_normalize(features1, 0.5) features1 = L2_normalize(features1) print(features2.shape) print('Data normalization. 2') scaler2 = StandardScaler() # train normalization features2 = scaler2.fit_transform(features2) features2 = power_normalize(features2, 0.5) features2 = L2_normalize(features2) # real label lab.extend( [self.number_dict[feature_test_file1.split(os.sep)[-2]]]) # test features 1 feature_test1 = self.file_helper.loadFeaturesFromFile( feature_test_file1) test_fv1 = fisher_vector(feature_test1, gmm1) # train normalization test_fv1 = test_fv1.reshape(1, -1) test_fv1 = scaler1.transform(test_fv1) test_fv1 = power_normalize(test_fv1, 0.5) test_fv1 = L2_normalize(test_fv1) # test features 2 feature_test2 = self.file_helper.loadFeaturesFromFile( feature_test_file2) test_fv2 = fisher_vector(feature_test2, gmm2) # train normalization test_fv2 = test_fv2.reshape(1, -1) test_fv2 = scaler2.transform(test_fv2) test_fv2 = power_normalize(test_fv2, 0.5) test_fv2 = L2_normalize(test_fv2) ## concatenate two fv test feature_test = np.concatenate((test_fv1, test_fv2), axis=1).reshape(1, -1) ## concatenate two fv train feature_train = np.concatenate((features1, features2), axis=1) # train classifiers self.classifier_helper.clf.fit(feature_train, self.train_labels) cl = int(self.classifier_helper.clf.predict(feature_test)[0]) class_name_predict = self.name_dict[str(cl)] if class_name_test == class_name_predict: hits += 1 error = c - hits msg_progress = 'Hits: %i/%i - Accuracy: %.4f - Error: %i\n\n' % ( hits, c, hits / c, error) print(msg_progress) if c % 25 == 0: self.mail_helper.sendMail( "Progress: %s - %s" % (self.test_name, self.OsName), msg_progress) if error > 40: save = False print('Error excedded') break # predicted label pre.extend([cl]) predictions.append({ 'image1': feature_test_file1, 'image2': feature_test_file2, 'class': cl, 'object_name': self.name_dict[str(cl)] }) if save: self.saveResults(predictions, pre, lab, features_nd1.shape[0])
X = mog.mog_samples(1000, means,
                    np.array([np.linalg.cholesky(c) for c in covars]), pis)
W = np.ones(X.shape[0])
lam0 = np.concatenate([means[0], .5 * np.log(np.diag(covars[0]))])
comp_list = [(1., lam0)]
mod = new_component_weighted_EM(comp_list, X, W, num_iter=50)
print "GT Means : ", means
print "Inferred Means : ", mod.means_
print "Inferred Covars : ", mod.covars_
print "Inferred Weights: ", mod.weights_
print "\n"

gmod = GMM(n_components=2)
gmod.fit(X)
print "full em means : ", gmod.means_
print "full em weights : ", gmod.weights_
print "full em covars : ", gmod.covars_

# test higher level method
import matplotlib.pyplot as plt; plt.ion()
import seaborn as sns
import autil.util.plots as pu

fig, ax = plt.figure(figsize=(8, 8)), plt.gca()
pu.plot_isocontours(ax, lambda x: np.exp(lnpdf(x, 0)), fill=True)
new_comp_list = fit_new_component(comp_list, lnpdf, df=10000,
                                  num_samples=1000,
                                  importance_dist='tmixture',
def FV_LOOCV_Features(self): """ This method contains the entire module required for training the bag of visual words model Use of helper functions will be extensive. """ self.name_dict, self.number_dict, self.count_class = self.file_helper.getLabelsFromFile( self.label_path) for count, base_path in enumerate(self.base_path): # read file. prepare file lists. self.images[count], self.trainImageCount[count] = \ self.file_helper.getFilesFromDirectory(base_path, self.datasets, self.features_file_filter[count]) features_nd = {} # Initialize features nd array for count, _ in enumerate(self.base_path): features_nd[count] = np.asarray(self.images[count]) features_nd[count].sort(axis=0) self.descriptor_list = {} labels_train = [] ft = {} gmm = {} fv_dim = {} features = {} n_videos = features_nd[0].shape[0] scaler_des = {} # Read features from files and compute Gaussian Mixture Models (GMM) for count, _ in enumerate(self.base_path): self.descriptor_list[count] = [] for feature in features_nd[count]: feature = feature[0] if count == 0: label_number = self.number_dict[feature.split(os.sep)[-2]] label_name = self.name_dict[str(label_number)] labels_train = np.append(labels_train, label_name) des = self.file_helper.loadFeaturesFromFile(feature) self.descriptor_list[count].append(des) ft[count] = self.classifier_helper.formatND( self.descriptor_list[count]) # train normalization scaler_des[count] = StandardScaler() ft[count] = scaler_des[count].fit_transform(ft[count]) gmm[count] = GMM(n_components=self.no_clusters, covariance_type='diag', verbose=0) gmm[count].fit(ft[count]) fv_dim[count] = self.no_clusters + 2 * self.no_clusters * ft[ count].shape[1] print(fv_dim[count]) features[count] = np.array( [np.zeros(fv_dim[count]) for i in range(n_videos)]) len_video = {} fv = {} scaler = {} # Compute Fisher Vector from Descriptors using GMM for count, _ in enumerate(self.base_path): count_videos = 0 for i in range(n_videos): len_video[count] = len(self.descriptor_list[count][i]) fv[count] = fisher_vector( ft[count][count_videos:count_videos + len_video[count]], gmm[count]) features[count][i] = fv[count] count_videos += len_video[count] # Perform FV Normalization for count, _ in enumerate(self.base_path): print(features[count].shape) print('Data normalization. %i' % count) scaler[count] = StandardScaler() # train normalization features[count] = scaler[count].fit_transform(features[count]) features[count] = power_normalize(features[count], 0.5) features[count] = L2_normalize(features[count]) # Concatenate FV for each feature type feature_train = features[0] for count in range(1, len(self.base_path)): feature_train = np.concatenate((feature_train, features[count]), axis=1) return feature_train, labels_train
def trainingGMMHMM(
        dataset,                # training dataset
        n_c,                    # number of HMM components (i.e. hidden states)
        n_m,                    # number of GMM mixtures (i.e. Gaussians) per state
        start_prob_prior=None,  # prior of the start hidden-state probabilities
        trans_mat_prior=None,   # prior of the transition matrix
        start_prob=None,        # the start hidden-state probabilities
        trans_mat=None,         # the transition matrix
        gmms=None,              # parameters of the per-state GMMs
        covar_type='full',
        n_i=50):
    # Initiation of the dataset.
    # d = Dataset(dataset)
    X = dataset.getDataset()

    # Initiation of the GMMs.
    _GMMs = []
    if gmms is None:
        _GMMs = None
    else:
        for gmm in gmms:
            _GMM = GMM(n_components=n_m, covariance_type=covar_type)
            _GMM.covars_ = np.array(gmm["covars"])
            _GMM.means_ = np.array(gmm["means"])
            _GMM.weights_ = np.array(gmm["weights"])
            _GMMs.append(_GMM)

    # Initiation of the GMMHMM.
    model = GMMHMM(startprob_prior=np.array(start_prob_prior),
                   transmat_prior=np.array(trans_mat_prior),
                   startprob=np.array(start_prob),
                   transmat=np.array(trans_mat),
                   gmms=_GMMs,
                   n_components=n_c,
                   n_mix=n_m,
                   covariance_type=covar_type,
                   n_iter=n_i)

    # Training.
    model.fit(X)

    # The result.
    new_gmmhmm = {
        "nComponent": n_c,
        "nMix": n_m,
        "covarianceType": covar_type,
        "hmmParams": {
            "startProb": model.startprob_.tolist(),
            "transMat": model.transmat_.tolist()
        },
        "gmmParams": {
            "nMix": n_m,
            "covarianceType": covar_type,
            "params": []
        }
    }

    # model.gmms_ holds one GMM per hidden state, so iterate over the n_c states
    for i in range(0, n_c):
        gaussian_model = {
            "covars": model.gmms_[i].covars_.tolist(),
            "means": model.gmms_[i].means_.tolist(),
            "weights": model.gmms_[i].weights_.tolist()
        }
        new_gmmhmm["gmmParams"]["params"].append(gaussian_model)

    return new_gmmhmm
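
# Usage sketch (not from the original source): the gmms argument mirrors the
# structure this function itself writes into new_gmmhmm["gmmParams"]["params"],
# i.e. one dict of "means"/"covars"/"weights" per hidden state. All numeric values
# below are illustrative placeholders for a 1-feature, 2-state, 2-mixture model,
# and `dataset` is assumed to be the same kind of object (with getDataset())
# used by the surrounding code.
initial_gmms = [
    {"means": [[0.0], [1.0]], "covars": [[[1.0]], [[1.0]]], "weights": [0.5, 0.5]},
    {"means": [[2.0], [3.0]], "covars": [[[1.0]], [[1.0]]], "weights": [0.5, 0.5]},
]
trained = trainingGMMHMM(dataset,
                         n_c=2,   # two hidden states
                         n_m=2,   # two Gaussians per state
                         start_prob_prior=[0.5, 0.5],
                         trans_mat_prior=[[0.5, 0.5], [0.5, 0.5]],
                         start_prob=[0.5, 0.5],
                         trans_mat=[[0.5, 0.5], [0.5, 0.5]],
                         gmms=initial_gmms)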