def main(fname, N, n, params):
    """Fit a Gaussian mixture with EM to samples of the model stored in @fname.

    N and n are forwarded unchanged to the model's sample() method, and
    params.seed (converted with int()) seeds the RNG before EM runs.  The
    column-wise absolute and relative errors of the recovered means are
    printed on one line.
    """
    model = GaussianMixtureModel.from_file(fname)
    k, d = model.k, model.d
    M, S, w = model.means, model.sigmas, model.weights

    X = model.sample(N, n)

    # Set seed for the algorithm (done after the data has been drawn)
    sc.random.seed(int(params.seed))

    em = GaussianMixtureEM(k, d)
    # True parameters bundled together; not read again below.
    O = (M, S, w)

    def report(i, O_, lhood):
        # Per-iteration callback: only unpacks the current estimate.
        means_i, _, _ = O_

    lhood, Z, O_ = em.run(X, None, report)
    M_, S_, w_ = O_

    # presumably reorders the estimate to best match M -- confirm against
    # closest_permuted_matrix
    M_ = closest_permuted_matrix(M.T, M_.T).T

    # Table
    print("%s %s" % (column_aerr(M, M_), column_rerr(M, M_)))
def main(prefix, N, n, delta, params):
    """Recover mixture means from sample moments and log the resulting errors.

    prefix -- path handed to GaussianMixtureModel.from_file
    N, n   -- forwarded unchanged to the model's sample() method
    delta  -- forwarded to recover_components as its `delta` keyword
    params -- object exposing a `seed` attribute (converted with int())

    Everything is recorded through the module-level `logger`; the column-wise
    absolute and relative errors of the recovered means are also printed.
    """
    gmm = GaussianMixtureModel.from_file(prefix)
    k, d, M, w = gmm.k, gmm.d, gmm.means, gmm.weights
    logger.add("M", M)
    logger.add_consts("M", M, k, 2)
    logger.add("w_min", w.min())
    logger.add("w_max", w.max())

    X = gmm.sample(N, n)
    logger.add("k", k)
    logger.add("d", d)
    logger.add("n", n)

    # Set seed for the algorithm
    seed = int(params.seed)
    sc.random.seed(seed)
    logger.add("seed", seed)

    P, T = sample_moments(X, k)
    Pe, Te = exact_moments(M, w)

    # Only the recovery step itself is timed.
    start = time.time()
    M_ = recover_components(k, P, T, Pe, Te, delta=delta)
    stop = time.time()
    logger.add("time", stop - start)

    # presumably reorders the estimate to best match M -- confirm against
    # closest_permuted_matrix
    M_ = closest_permuted_matrix(M.T, M_.T).T
    # Record the *estimate* under "M_"; the true means are already logged
    # under "M" above.
    logger.add("M_", M_)

    # Error data
    logger.add_err("M", M, M_)
    logger.add_err("M", M, M_, 'col')

    print("%s %s" % (column_aerr(M, M_), column_rerr(M, M_)))
def test_gaussian_em():
    """Check that EM recovers the parameters of a freshly generated mixture"""
    fname = "gmm-3-10-0.7.npz"
    model = GaussianMixtureModel.generate(fname, 3, 3)
    k, d = model.k, model.d
    M, S, w = model.means, model.sigmas, model.weights

    N, n = 1e6, 1e5
    X = model.sample(N, n)

    em = GaussianMixtureEM(k, d)

    def report(i, O_, lhood):
        # Per-iteration callback: only unpacks the current estimate.
        means_i, _, _ = O_

    lhood, Z, O_ = em.run(X, None, report)
    M_, S_, w_ = O_

    # Match the estimated means and weights against the true ones before
    # measuring the error.
    M_ = closest_permuted_matrix(M, M_)
    w_ = closest_permuted_vector(w, w_)

    print("%s %s" % (w, w_))

    means_rel_err = norm(M - M_) / norm(M)
    sigma_abs_err = abs(S - S_)
    weights_err = norm(w - w_)

    print(means_rel_err)
    print(sigma_abs_err.max())
    print(weights_err)

    assert means_rel_err < 1e-1
    assert (sigma_abs_err < 1).all()
    assert weights_err < 1e-2
def test_gaussian_em():
    """Generate a small mixture, run EM on its samples and check the fit.

    The means must match to 10% relative error, every covariance entry to
    within 1, and the weights to within 1e-2 in norm.
    """
    fname = "gmm-3-10-0.7.npz"
    truth = GaussianMixtureModel.generate(fname, 3, 3)
    k, d = truth.k, truth.d
    M, S, w = truth.means, truth.sigmas, truth.weights

    N, n = 1e6, 1e5
    X = truth.sample(N, n)

    solver = GaussianMixtureEM(k, d)

    def report(i, O_, lhood):
        # Callback required by run(); it just unpacks the estimate.
        current_means, _, _ = O_

    lhood, Z, O_ = solver.run(X, None, report)
    M_, S_, w_ = O_
    M_ = closest_permuted_matrix(M, M_)
    w_ = closest_permuted_vector(w, w_)

    print("%s %s" % (w, w_))

    rel_err_M = norm(M - M_) / norm(M)
    abs_err_S = abs(S - S_)
    err_w = norm(w - w_)
    for value in (rel_err_M, abs_err_S.max(), err_w):
        print(value)

    assert rel_err_M < 1e-1
    assert (abs_err_S < 1).all()
    assert err_w < 1e-2
def main(fname, N, n, params):
    """Run GMM EM on the data in @fname, logging errors and timings.

    N and n are forwarded unchanged to the model's sample() method and
    params.seed (converted with int()) seeds the RNG.  Per-iteration error
    and elapsed time, the final estimate and its errors all go to the
    module-level `logger`; the column errors are also printed.
    """
    model = GaussianMixtureModel.from_file(fname)
    k, d = model.k, model.d
    M, S, w = model.means, model.sigmas, model.weights
    logger.add("M", M)

    X = model.sample(N, n)
    for key, value in (("k", k), ("d", d), ("n", n)):
        logger.add(key, value)

    # Set seed for the algorithm
    seed = int(params.seed)
    sc.random.seed(seed)
    logger.add("seed", seed)

    em = GaussianMixtureEM(k, d)
    # True parameters bundled together; not read again below.
    O = (M, S, w)

    t0 = time.time()

    def report(i, O_, lhood):
        # Log the column error of the current means and the time since t0.
        means_i, _, _ = O_
        logger.add_err("M_t%d" % (i), M, means_i, 'col')
        logger.add("time_%d" % (i), time.time() - t0)

    lhood, Z, O_ = em.run(X, None, report)
    logger.add("time", time.time() - t0)

    M_, S_, w_ = O_
    M_ = closest_permuted_matrix(M.T, M_.T).T
    logger.add("M_", M_)

    # Table
    logger.add_err("M", M, M_, 2)
    logger.add_err("M", M, M_, 'col')

    print("%s %s" % (column_aerr(M, M_), column_rerr(M, M_)))
def test_gaussian_em():
    """Test Gaussian EM on samples drawn from a stored mixture model.

    Loads the model from ./test-data, runs EM on its samples while logging
    per-iteration error and timing through `logger`, then asserts that the
    means (10% relative error), every covariance entry (within 1) and the
    weights (1e-3 in norm) are recovered.
    """
    fname = "./test-data/gmm-3-10-0.7.npz"
    gmm = GaussianMixtureModel.from_file(fname)
    k, d, M, S, w = gmm.k, gmm.d, gmm.means, gmm.sigmas, gmm.weights

    N, n = 1e6, 1e4
    X = gmm.sample(N, n)

    algo = GaussianMixtureEM(k, d)

    start = time.time()

    def report(i, O_, lhood):
        # Log the column error of the current means and the elapsed time.
        M_, _, _ = O_
        logger.add_err("M_t%d" % (i), M, M_, 'col')
        logger.add("time_%d" % (i), time.time() - start)

    lhood, Z, O_ = algo.run(X, None, report)
    logger.add("time", time.time() - start)

    M_, S_, w_ = O_
    M_ = closest_permuted_matrix(M, M_)
    w_ = closest_permuted_vector(w, w_)

    sigma_abs_err = abs(S - S_)

    print(norm(M - M_) / norm(M))
    print(sigma_abs_err)
    print(norm(w - w_))

    assert norm(M - M_) / norm(M) < 1e-1
    # The comparison is elementwise, so it must be reduced with .all():
    # asserting on a multi-element array raises ValueError instead of testing.
    assert (sigma_abs_err < 1).all()
    assert norm(w - w_) < 1e-3
def compare_error_bounds(model_fname, log_fname, delta=0.1):
    """Print theoretical error bounds next to the errors found in a log.

    The model is read from model_fname and the logged quantities from
    log_fname.  One table row is printed per quantity (W, Tw, L, v, mu) and
    the list of actual/bound ratios is returned in that same order.
    """
    model = GaussianMixtureModel.from_file(model_fname)
    k, d, M, w = model.k, model.d, model.means, model.weights
    P, T = exact_moments(M, w)

    log_data = sc.load(log_fname)

    # TODO: Use concentration bounds on aerr_P12
    norm_M = log_data["norm_M_2"]
    sk_M = log_data["s_k_M"]
    err_P = log_data["aerr_P_2"]
    err_T = log_data["aerr_T"]
    norm_P = log_data["norm_Pe_2"]
    sk_P = log_data["s_k_P"]
    norm_T = log_data["norm_Te"]

    w_min = min(w)

    # TODO: Ah, not computing sigma2!
    # alpha_P and \beta_P
    alpha_P = err_P / sk_P
    one_minus_alpha = 1 - alpha_P
    beta_P = alpha_P / one_minus_alpha

    # W
    bound_W = 2 / sqrt(sk_P) * beta_P
    err_W = log_data["aerr_W_2"]

    # Tw
    bound_Tw = (
        1 / sqrt(sk_M * one_minus_alpha) * err_T
        + norm_T / sk_M
        * (1 + 1 / sqrt(one_minus_alpha) + 1 / one_minus_alpha)
        * err_W
    )
    err_Tw = log_data["aerr_Tw"]

    # lambda: the bound is the measured Tw error itself
    bound_L = err_Tw
    err_L = log_data["aerr_lambda"]

    # v
    sep_M = column_sep(M)
    sep_Tw = delta / (sqrt(sc.e) * k**2 * (1 + sqrt(2 * log(k / delta)))) * sep_M
    bound_v = 4 * sqrt(2) * err_Tw / sep_Tw
    err_v = log_data["aerr_v_col"]

    # mu
    norm_Wt = sqrt(norm_P + err_P)
    bound_Wt = 2 * norm_Wt * beta_P
    bound_mu = bound_L + (1 + 1 / sqrt(w_min)) * norm_Wt * bound_v + bound_Wt
    err_mu = log_data["aerr_M_col"]

    print("A\t\tbound\t\tactual")
    print("W\t\t%f\t\t%f" % (bound_W, err_W))
    print("Tw\t\t%f\t\t%f" % (bound_Tw, err_Tw))
    print("L\t\t%f\t\t%f" % (bound_L, err_L))
    print("v\t\t%f\t\t%f" % (bound_v, err_v))
    print("mu\t\t%f\t\t%f" % (bound_mu, err_mu))

    return [
        err_W / bound_W,
        err_Tw / bound_Tw,
        err_L / bound_L,
        err_v / bound_v,
        err_mu / bound_mu,
    ]
def test_exact_recovery():
    """Recovery from exact moments must reproduce the means almost exactly.

    The exact moments are passed to recover_components both as the
    "observed" and as the exact pair, and the relative error of the result
    must stay below 1e-3.
    """
    fname = "./test-data/gmm-3-10-0.7.npz"
    model = GaussianMixtureModel.from_file(fname)
    k, d = model.k, model.d
    A, w = model.means, model.weights

    P, T = exact_moments(A, w)

    A_ = recover_components(k, P, T, P, T, delta=0.01)
    A_ = closest_permuted_matrix(A.T, A_.T).T

    rel_err = norm(A - A_) / norm(A)
    print(rel_err)
    print(A)
    print(A_)

    assert rel_err < 1e-3
def check(k, d):
    """Compare empirical moments of sampled data against the exact ones.

    Generates a model with three sets of means, draws 1e5 samples and asserts
    that the first moments and the pairwise moments match to 1e-2 and a
    randomly projected triple moment to 1e-1 (all in norm).
    """
    model = GaussianMixtureModel.generate(k, d)
    M1, M2, M3 = model.means
    w = model.weights
    x1, x2, x3 = model.sample(1e5)

    # Get the first moments of the data
    exact_means = [M.dot(w) for M in (M1, M2, M3)]
    sample_means = [x.mean(axis=0) for x in (x1, x2, x3)]
    mean_errs = [
        norm(exact - sampled)
        for exact, sampled in zip(exact_means, sample_means)
    ]

    print("%s %s %s" % tuple(mean_errs))
    for err in mean_errs:
        assert err < 1e-02

    # Get pairwise estimates
    P12, P13, P123 = spectral.mixture.exact_moments(w, M1, M2, M3)
    P12_ = sd.Pairs(x1, x2)
    P13_ = sd.Pairs(x1, x3)
    pair_errs = (norm(P12 - P12_), norm(P13 - P13_))

    print("%s %s" % pair_errs)
    for err in pair_errs:
        assert err < 1e-02

    eta = sc.randn(d)

    # Get triple estimates
    # The exact triple moment is rebuilt here for this particular eta,
    # replacing the P123 unpacked above.
    projected_weights = M3.T.dot(eta) * w
    P123 = M1.dot(diag(projected_weights).dot(M2.T))
    P123_ = sd.Triples(x1, x2, x3, eta)
    err123 = norm(P123 - P123_)

    print(err123)
    assert err123 < 1e-01
def test_sample_recovery():
    """Recovery from sample moments must land within 50% relative error.

    Draws 10**5 samples from the stored model, computes both sample and
    exact moments, and checks the means returned by recover_components.
    """
    fname = "./test-data/gmm-3-10-0.7.npz"
    model = GaussianMixtureModel.from_file(fname)
    k, d = model.k, model.d
    A, w = model.means, model.weights

    X = model.sample(10**5)
    P, T = sample_moments(X, k)
    Pe, Te = exact_moments(A, w)

    # The model object is not needed past this point.
    del model

    A_ = recover_components(k, P, T, Pe, Te)
    A_ = closest_permuted_matrix(A.T, A_.T).T

    rel_err = norm(A - A_) / norm(A)
    print(rel_err)
    print(A)
    print(A_)

    assert rel_err < 5e-1
def main(fname, dataset_type, N, k, d, params):
    """Generate a dataset of the requested type for file fname.

    dataset_type -- "gmm" builds a GaussianMixtureModel, "mvgmm" a
                    MultiViewGaussianMixtureModel (with params.views views);
                    any other value raises NotImplementedError
    N            -- forwarded to the model's sample() method
    k, d         -- forwarded to the model's generate() constructor
    params       -- supplies means, cov, weights and sigma2; when cov is
                    "spherical" and sigma2 is positive, params.cov is
                    overwritten in place with stacked sigma2 * eye(d) blocks

    The generated model is sampled and then saved via its save() method.
    """
    if dataset_type not in ("gmm", "mvgmm"):
        raise NotImplementedError

    if dataset_type == "gmm":
        if params.cov == "spherical" and params.sigma2 > 0:
            # One sigma2 * I block per component
            per_component = [params.sigma2 * eye(d)] * k
            params.cov = array(per_component)

        model = GaussianMixtureModel.generate(
            fname, k, d, params.means, params.cov, params.weights
        )
    else:
        views = params.views
        if params.cov == "spherical" and params.sigma2 > 0:
            # One sigma2 * I block per component, repeated for every view
            per_view = [[params.sigma2 * eye(d)] * k] * views
            params.cov = array(per_view)

        model = MultiViewGaussianMixtureModel.generate(
            fname, k, d, views, params.means, params.cov, params.weights
        )

    model.sample(N)
    model.save()
def main(fname, N, n, params):
    """Run EM for a Gaussian mixture on samples of the model stored in @fname.

    N, n   -- forwarded unchanged to the model's sample() method
    params -- object exposing a `seed` attribute (converted with int())

    Prints the column-wise absolute and relative errors of the recovered
    means on a single line.
    """
    truth = GaussianMixtureModel.from_file(fname)
    k, d = truth.k, truth.d
    M, S, w = truth.means, truth.sigmas, truth.weights

    X = truth.sample(N, n)

    # Set seed for the algorithm
    sc.random.seed(int(params.seed))

    solver = GaussianMixtureEM(k, d)
    # Reference parameters kept together; nothing below reads this tuple.
    O = (M, S, w)

    def report(i, O_, lhood):
        # Callback required by run(); it just unpacks the estimate.
        current_means, _, _ = O_

    lhood, Z, O_ = solver.run(X, None, report)
    M_, S_, w_ = O_
    M_ = closest_permuted_matrix(M.T, M_.T).T

    # Table
    abs_err = column_aerr(M, M_)
    rel_err = column_rerr(M, M_)
    print("%s %s" % (abs_err, rel_err))
def _label_gmm_decisions(decision):
    """Map GMM component indices to the dataset's label strings.

    Index 0 becomes 'ped ' and index 1 becomes 'bic ' (the trailing space is
    part of the labels these are compared against).  Any other index is
    skipped, so the result can be shorter than `decision`.
    """
    names = {0: 'ped ', 1: 'bic '}
    return np.array(
        [names[int(idx)] for idx in decision if int(idx) in names])


def _cnn_accuracy(net, samples, is_bike, batch_size):
    """Return the fraction of samples `net` classifies correctly.

    The samples are pushed through the network in full batches of
    `batch_size`; a trailing partial batch is not evaluated.  A sample is
    predicted to be a bike when output column 1 exceeds column 0.  Only the
    labels of the evaluated samples take part in the comparison, so a set
    size that is not a multiple of `batch_size` no longer compares arrays of
    different lengths.
    """
    n_eval = (samples.shape[0] // batch_size) * batch_size
    scores = np.zeros((n_eval, 2))
    for start in range(0, n_eval, batch_size):
        stop = start + batch_size
        scores[start:stop] = net(samples[start:stop]).detach().numpy()
    predicted_bike = scores[:, 0] < scores[:, 1]
    return np.mean(predicted_bike == is_bike[:n_eval])


def main(args):
    """Evaluate the pedestrian-vs-bicycle classifier selected by args.model.

    args.model == 'GMM' -- classify the PCA-reduced, vectorized data with
        GMM.GaussianMixtureModel and print train and test accuracy.
        args.see_features / args.see_weights additionally open the feature
        and weight viewers.
    args.model == 'CNN' -- load the saved network from 'net.model' and print
        its train and test accuracy.
    Any other value loads and preprocesses the data and then does nothing.
    """
    # Load data
    # NOTE(review): every "train" array is read from a test_* file and every
    # "test" array from a train_* file.  The swap is applied consistently, so
    # it may be deliberate -- confirm before changing the paths.
    trainDataPed = np.load('data/processed/test_data_ped.npy').astype(
        np.float32)
    trainDataBic = np.load('data/processed/test_data_bic.npy').astype(
        np.float32)
    trainLabelPed = np.load('data/processed/test_label_ped.npy')
    trainLabelBic = np.load('data/processed/test_label_bic.npy')

    testDataPed = np.load('data/processed/train_data_ped.npy').astype(
        np.float32)
    testDataBic = np.load('data/processed/train_data_bic.npy').astype(
        np.float32)
    testLabelPed = np.load('data/processed/train_label_ped.npy')
    testLabelBic = np.load('data/processed/train_label_bic.npy')

    # Downsample by a factor of 2
    trainDataPed_ds = utils.downsampler_2(trainDataPed)
    trainDataBic_ds = utils.downsampler_2(trainDataBic)
    testDataPed_ds = utils.downsampler_2(testDataPed)
    testDataBic_ds = utils.downsampler_2(testDataBic)

    # Vectorize "image" data: one row per sample, column-major flattening
    trainDataPedVec = trainDataPed_ds.reshape(
        (trainDataPed_ds.shape[0], -1), order='F')
    trainDataBicVec = trainDataBic_ds.reshape(
        (trainDataBic_ds.shape[0], -1), order='F')
    testDataPedVec = testDataPed_ds.reshape(
        (testDataPed_ds.shape[0], -1), order='F')
    testDataBicVec = testDataBic_ds.reshape(
        (testDataBic_ds.shape[0], -1), order='F')

    # =========================================================================
    # 1) Gaussian Mixture Model
    # =========================================================================
    if args.model == 'GMM':
        print("[GMM] Begin GMM Training & Testing")
        nFeatures = 16

        # Produce full set
        fullSet = np.concatenate((trainDataPedVec, trainDataBicVec), axis=0)
        fullSetLabel = np.concatenate((trainLabelPed, trainLabelBic), axis=0)

        trainingDataMean = np.mean(fullSet, axis=0)
        weights, features = pca.PCA(fullSet - trainingDataMean, nFeatures)

        if args.see_features:
            print(args.see_features)
            mv.feature_viewer(
                features.reshape((nFeatures, 200, 72), order='F'),
                nFeatures,
                trainDataBic_ds.shape[1],
                trainDataBic_ds.shape[2],
                title='GMM Features')
        if args.see_weights:
            print(args.see_weights)
            mv.weight_viewer(weights, fullSetLabel)

        # Generate mean and covariance for bike and pedestrian class
        gmm_classifier = GMM.GaussianMixtureModel(fullSet, nFeatures, 2, 1000)
        results = gmm_classifier.fit(fullSet)

        # Make a decision: highest-scoring component along axis 0
        decisionLabeled = _label_gmm_decisions(np.argmax(results, axis=0))

        # Calculate Statistics
        train_accuracy = np.mean(decisionLabeled == fullSetLabel)
        print("Training set accuracy: ", train_accuracy)

        # -- Now test
        testFullSet = np.concatenate((testDataPedVec, testDataBicVec), axis=0)
        testFullSetLabel = np.concatenate(
            (testLabelPed, testLabelBic), axis=0)

        # NOTE(review): the test set also goes through fit(); confirm that
        # fit() only scores its input and does not re-estimate the model.
        testResults = gmm_classifier.fit(testFullSet)

        # Make a decision
        testDecisionLabeled = _label_gmm_decisions(
            np.argmax(testResults, axis=0))

        # Calculate Statistics
        test_accuracy = np.mean(testDecisionLabeled == testFullSetLabel)
        print("[GMM] Testing set accuracy: ", test_accuracy)

    # =========================================================================
    # 2) Convolutional Neural Net
    # =========================================================================
    elif args.model == 'CNN':
        print("[CNN] Begin CNN Training & Testing")

        # Produce train set
        trainSet = np.concatenate((trainDataPed_ds, trainDataBic_ds), axis=0)
        trainSetLabel = np.concatenate((trainLabelPed, trainLabelBic), axis=0)
        # Convert train set to torch tensor
        trainSet = torch.tensor(trainSet, dtype=torch.float32)
        # Answer to the question: Is it a bike?
        trainSetLabel_binary = np.array(
            [int('bic ' == elem) for elem in trainSetLabel])
        trainSetLabel_bool = np.array(
            [('bic ' == elem) for elem in trainSetLabel])

        # Produce test set
        testSet = np.concatenate((testDataPed_ds, testDataBic_ds), axis=0)
        testSetLabel = np.concatenate((testLabelPed, testLabelBic), axis=0)
        # Convert test set to torch tensor
        testSet = torch.tensor(testSet, dtype=torch.float32)
        # Answer to the question: Is it a bike?
        testSetLabel_bool = np.array(
            ['bic ' == elem for elem in testSetLabel])

        # Flip to True to retrain instead of loading the saved weights.
        train_flag = False
        if train_flag:
            _, net = CNN.fit(trainSet, trainSetLabel_binary, testSet, 10)
        else:
            loss_fn = torch.nn.CrossEntropyLoss()
            in_size = 0
            out_size = 2
            net = CNN.NeuralNet(0.03, loss_fn, in_size, out_size)
            net.load_state_dict(torch.load('net.model'))
            net.eval()

        batch_size = 10

        # Evaluate and decide - Train
        train_accuracy = _cnn_accuracy(
            net, trainSet, trainSetLabel_bool, batch_size)
        print("[CNN] Training set accuracy: ", train_accuracy)

        # Evaluate and decide - Test
        test_accuracy = _cnn_accuracy(
            net, testSet, testSetLabel_bool, batch_size)
        print("[CNN] Testing set accuracy: ", test_accuracy)