def fit(self): l = len(self.samples) print "Data Size: ", l self.samples = numpy.asarray(self.samples) print "start fitting..." self.means, self.covars, self.weights, self.logl = pypr_GMM.em_gm( self.samples, K=self.n_comp, max_iter=self.n_iter, verbose=True, diag_add=1e-9 )
def fit(self): l = len(self.samples) print("Data Size: ", l) self.samples = numpy.asarray(self.samples) print("start fitting...") self.means, self.covars, \ self.weights, self.logl = pypr_GMM.em_gm(self.samples, K=self.n_comp, max_iter=self.n_iter, verbose=True, diag_add=1e-9)
def fit(self, data):
    '''Fit a GMM to data; fall back to a single Gaussian when k == 1.'''
    if self.k > 1:
        self.mu, self.sigma, self.pi, ll = gmm.em_gm(np.array(data), K=self.k)
    else:
        self.mu = [np.mean(data, axis=0)]
        self.sigma = [np.cov(data, rowvar=0)]
        self.pi = np.array([1.0])
    self.updateInvSigma()
    return self
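# A minimal scoring sketch to pair with fit() above, assuming the same
# attribute layout (self.mu, self.sigma, self.pi as per-component lists);
# logpdf is a hypothetical helper, not part of the original class.
def logpdf(self, x):
    # log p(x) = log sum_k pi_k * N(x | mu_k, sigma_k), evaluated directly
    x = np.asarray(x)
    log_terms = []
    for mu, sigma, pi_k in zip(self.mu, self.sigma, self.pi):
        d = x - mu
        _, logdet = np.linalg.slogdet(sigma)
        quad = d @ np.linalg.solve(sigma, d)  # squared Mahalanobis distance
        log_terms.append(np.log(pi_k)
                         - 0.5 * (len(x) * np.log(2 * np.pi) + logdet + quad))
    return np.logaddexp.reduce(log_terms)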
# (this snippet begins mid-statement; the opening of the print()/format
# string is truncated in the source)
#     ...and {} components: aic = {} bic = {} likelihood = {}""".format(
#         c, n, mGMM.aic(dataset), mGMM.bic(dataset),
#         mGMM.score_samples(dataset)[0].sum()))

############## best: 12 components and full covariance
import pypr.clustering.gmm as gmm

def iter_plot(cen_lst, cov_lst, itr):
    # For plotting EM progress
    if itr % 2 == 0:
        for i in range(len(cen_lst)):
            x, y = gmm.gauss_ellipse_2d(cen_lst[i], cov_lst[i])
            plt.plot(x, y, 'k', linewidth=0.5)

cen_lst, cov_lst, p_k, logL = gmm.em_gm(dataset, K=9, max_iter=400, verbose=True)
print("Log likelihood (how well the data fits the model) =", logL)

self_centroids = [np.array([77, 75]), np.array([335, 75]), np.array([693, 75]),
                  np.array([77, 350]), np.array([335, 350]), np.array([693, 350]),
                  np.array([77, 741]), np.array([335, 741]), np.array([693, 741])]
cen_lst, cov_lst, p_k, logL = gmm.em_gm(dataset, K=9, max_iter=400, verbose=True,
                                        init_kw={'cluster_init': 'custom',
                                                 'cntr_lst': self_centroids})
print("Log likelihood (how well the data fits the model) =", logL)
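# Follow-up sketch (our addition, not from the source): hard-assign each
# sample in `dataset` to the component with the highest responsibility,
# using only the parameters returned by em_gm above.
def responsibilities(X, cen_lst, cov_lst, p_k):
    # unnormalized log responsibilities: log(pi_k) + log N(x | mu_k, Sigma_k)
    N, D = X.shape
    logr = np.zeros((N, len(cen_lst)))
    for k, (mu, cov) in enumerate(zip(cen_lst, cov_lst)):
        diff = X - mu
        _, logdet = np.linalg.slogdet(cov)
        quad = np.einsum('ni,ni->n', diff, np.linalg.solve(cov, diff.T).T)
        logr[:, k] = np.log(p_k[k]) - 0.5 * (D * np.log(2 * np.pi) + logdet + quad)
    return logr

labels = responsibilities(dataset, cen_lst, cov_lst, p_k).argmax(axis=1)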
ccov = [array([[1, 0.4], [0.4, 1]]), diag((1, 2)), diag((0.4, 0.1))]  # Covariance matrices
T = gmm.sample_gaussian_mixture(centroids, ccov, mc, samples=500)  # training set
V = gmm.sample_gaussian_mixture(centroids, ccov, mc, samples=500)  # validation set
plot(T[:, 0], T[:, 1], '.')

# Expectation-Maximization of Mixture of Gaussians
Krange = range(1, 20 + 1)
runs = 1
meanLogL_train = np.zeros((len(Krange), runs))
meanLogL_valid = np.zeros((len(Krange), runs))
for K in Krange:
    print("Clustering for K =", K)
    sys.stdout.flush()
    for r in range(runs):
        cen_lst, cov_lst, p_k, logL = gmm.em_gm(T, K=K, max_iter=100)
        meanLogL_train[K - 1, r] = logL
        meanLogL_valid[K - 1, r] = gmm.gm_log_likelihood(V, cen_lst, cov_lst, p_k)

fig1 = figure()
subplot(1, 2, 1)
for r in range(runs):
    plot(Krange, meanLogL_train[:, r], 'g:', label='Training')
    plot(Krange, meanLogL_valid[:, r], 'b-', label='Validation')
legend(loc='lower right')
xlabel('Number of clusters')
ylabel('log likelihood')

bic = np.zeros(len(Krange))
# We should train with ALL data here
X = np.concatenate((T, V), axis=0)
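# The BIC part of this snippet is cut off; a sketch of how it could continue,
# assuming the standard definition BIC = -2*logL + p*ln(N). The parameter
# count below (K full covariances, K means, K-1 free weights) is our
# assumption, not from the source.
D = X.shape[1]
for K in Krange:
    cen_lst, cov_lst, p_k, logL = gmm.em_gm(X, K=K, max_iter=100)
    n_params = K * (D + D * (D + 1) / 2) + (K - 1)
    bic[K - 1] = -2 * logL + n_params * np.log(X.shape[0])
subplot(1, 2, 2)
plot(Krange, bic)
xlabel('Number of clusters')
ylabel('BIC')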
def EM_wall_mixture_model(points, K=3, MAX_ITER=30, init_from_gmm=True,
                          force_connected=True, debug_output=None):
    """ initialization """
    N = points.shape[0]
    # init lines
    L = []
    if init_from_gmm:
        cen_lst, cov_lst, p_k, logL = gmm.em_gm(points, K=K)
        for cen, cov in zip(cen_lst, cov_lst):
            L.append(Line.fromEllipse(cen, cov))
    else:
        for k in range(K):
            L.append(Line.getRandom(points))
    # init pi
    pi_k = 1. / K * ones(K)

    """ run algorithm """
    oldgamma = zeros((N, K))
    for iter_num in range(MAX_ITER):
        """ E-step """
        pi_times_prob = zeros((N, K))
        gamma = zeros((N, K))
        for n in range(N):
            for k in range(K):
                pi_times_prob[n, k] = pi_k[k] * L[k].Prob(points[n])
            # normalized version of pi_times_prob is gamma
            gamma[n, :] = pi_times_prob[n, :] / pi_times_prob[n, :].sum()

        """ debug output """
        if debug_output:
            debug_output(L, gamma, iter_num)

        """ M-step (general) """
        N_k = gamma.sum(0)
        pi_k = N_k / N_k.sum()

        """ M-step (gaussian mixture with inf primary eigenval) """
        eOptimizer = EOptimizer(L, gamma, pi_k, N_k, points, pi_times_prob)
        for k in range(K):
            """ get mean """
            mu = 1 / N_k[k] * sum(gamma[n, k] * points[n] for n in range(N))
            """ get cov matrix """
            x = [array([points[n] - mu]).T for n in range(N)]
            cov = 1 / N_k[k] * sum(gamma[n, k] * x[n] * x[n].T for n in range(N))
            # re-initialize M, alpha and sigma from ellipse
            L[k].updateFromEllipse(mu, cov)
            """ get e """
            eOptimizer.setK(k)
            L[k].e = eOptimizer.optimize()

        """ force that all lines are connected """
        for l in L:
            l.connectToNearestLines(L)

        """ check sanity of lines """
        for k in range(K):
            if L[k].inBadCondition():
                # print("L[%d] in bad condition" % k)
                L[k] = Line.getRandom(points)
            for kk in range(k):
                if L[k].almostEqualTo(L[kk]):
                    # print("L[%d] almost equal to L[%d]" % (k, kk))
                    L[k] = Line.getRandom(points)
                    break

        """ remove 2 lines that are on the same real line """
        DTHETA = 20 * pi / 180  # margin
        for k, l in enumerate(L):
            for kk, line in ([kk, ll] for kk, ll in enumerate(L) if k != kk):
                if l.isConnectedTo(line) or l.isAlmostConnectedTo(line, L):
                    theta = l.getAngleWith(line)
                    if pi / 2 - abs(theta - pi / 2) < DTHETA:
                        # remove line with least responsibility
                        del_k = k if N_k[kk] > N_k[k] else kk
                        L[del_k] = Line.getRandom(points)

        """ remove crossing lines """
        for k, l in enumerate(L):
            for kk, line in ([kk, ll] for kk, ll in enumerate(L) if k != kk):
                # if l and line are connected, they can't cross
                if l.isConnectedTo(line):
                    continue
                X = l.getIntersectionWith(line)
                # if X lies on line and l, this is an intersection
                if line.hasPoint(X) and l.hasPoint(X):
                    # remove line with least responsibility
                    del_k = k if N_k[kk] > N_k[k] else kk
                    L[del_k] = Line.getRandom(points)

        """ check stop cond """
        if norm(oldgamma - gamma) < .05:
            break
        oldgamma = gamma

    """ debug output """
    if debug_output:
        debug_output(L, gamma)

    """ calc probability P[{Lk} | X] ~ P[X | {Lk}] * P[{Lk}] """
    totalLogProb = sum(log(pi_times_prob.sum(1)))
    logProbLk = 0
    anglecounter = 0

    ## add prob of theta ~ N(pi/2, sigma_theta)
    sigma_theta = .01 * pi / 2
    for k, l in enumerate(L):
        lines = l.getAllConnectedLinesFrom(L[k + 1:])
        for line in lines:
            theta = l.getAngleWith(line)
            dtheta = abs(theta - pi / 2)
            logProbLk += -dtheta**2 / (2 * sigma_theta**2) - log(sqrt(2 * pi) * sigma_theta)
            anglecounter += 1
    logProbLk /= anglecounter if anglecounter else 1

    ## in case of crossing: Prob = 0
    for k, l in enumerate(L):
        for line in (ll for kk, ll in enumerate(L) if k != kk):
            X = l.getIntersectionWith(line)
            # if l and line are connected, no extra prob is needed
            if l.isConnectedTo(line):
                continue
            # if X lies on line and l, this is an intersection
            if line.hasPoint(X) and l.hasPoint(X):
                logProbLk += float('-inf')
                break

    ## add prob for unattached but near line (extension has to cross)
    ## ~ N(THRESHOLD_E/d | 0, sigma_unatt) * N(theta | pi/2, sigma_theta)
    sigma_unatt = 0.05
    for k, l in enumerate(L):
        for line in (ll for kk, ll in enumerate(L) if k != kk):
            # if l and line are connected, no extra prob is needed
            if l.isConnectedTo(line):
                continue
            for E in line.E():
                # if E is near l, add probabilities as described above
                d = l.diff(E) + .001  # d should never be zero
                if d < THRESHOLD_E:
                    logProbLk += -(THRESHOLD_E / d)**2 / (2 * sigma_unatt**2)
                    theta = l.getAngleWith(line)
                    logProbLk += -(theta - pi / 2)**2 / (2 * sigma_theta**2)
                    break

    logProbLkX = totalLogProb + logProbLk
    return L, logProbLkX
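# Usage sketch (our addition): run a few random restarts of the model above
# and keep the most probable configuration. Assumes `points` is the (N, 2)
# array used elsewhere in this file, with Line/EOptimizer provided by the
# surrounding library.
best_L, best_logP = None, float('-inf')
for _ in range(5):
    L, logP = EM_wall_mixture_model(points, K=3, MAX_ITER=30)
    if logP > best_logP:
        best_L, best_logP = L, logP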
""" wall mixture model """ from points import points, N from wmmlib import * from pylab import * import pylab as pl from copy import copy import time import pypr.clustering.gmm as gmm # download from http://pypr.sourceforge.net/ import argparse """ settings """ K = 4 print N """ plot function """ gmm_cen_lst, gmm_cov_lst, p_k, logL = gmm.em_gm(points, K=K) def debug_output(L, gamma=None, iter_num=None): """ check final """ if gamma != None: final = iter_num == None if final: ioff() #else: return """ text output """ if gamma != None: print "iteration %d..." % iter_num if not final else "\nFINAL" """ init vars """ COLORS = "bykgrcm" if gamma == None: gamma = zeros((N, K)) """ plot init """ clf()
def plotData(X, c):
    plot(X[:, 0], X[:, 1], '.', c=c, alpha=0.2)

# create data
# seed(1)
fig1 = figure()
X = generateData(70)
plotData(X, 'b')
cen_lst_1, cov_lst_1, p_k_1, logL_1 = gmm.em_gm(X, K=3, max_iter=400,
                                                verbose=True, iter_call=None)
plotCenters(cen_lst_1, cov_lst_1, 'b', 0)

X = generateData(70)
plotData(X, 'y')
cen_lst_2, cov_lst_2, p_k_2, logL_2 = gmm.em_gm(X, K=3, max_iter=400,
                                                verbose=True, iter_call=None)
plotCenters(cen_lst_2, cov_lst_2, 'y', 3)

# get conditional distribution
# y = 0.3
# (con_cen, con_cov, new_p_k) = gmm.cond_dist(np.array([y, np.nan]), cen_lst, cov_lst, p_k)
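# Sketch (our addition): the commented-out cond_dist call above returns a
# 1-D mixture; its weighted mean gives the conditional expectation
# E[x2 | x1 = y] under the first fitted model.
y = 0.3
con_cen, con_cov, new_p_k = gmm.cond_dist(np.array([y, np.nan]),
                                          cen_lst_1, cov_lst_1, p_k_1)
cond_mean = sum(w * c[0] for w, c in zip(new_p_k, con_cen))
print("E[x2 | x1 = %.1f] = %.3f" % (y, cond_mean))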
from util.common import metric

df = pd.read_csv('/home/tadeze/projects/missingvalue/datasets/anomaly/yeast/fullsamples/yeast_1.csv')
train_data = df.iloc[:, 1:].values.astype(np.float64)
# train_lbl = df.iloc[:, 0]
train_lbl = list(map(int, df.iloc[:, 0] == "anomaly"))  # needed by metric() below

gmms = GaussianMixture(n_components=3)
gmms.fit(train_data)
score = -gmms.score_samples(train_data)
print(gmms.get_params(False))
print(len(score))
print(metric(train_lbl, score))

from pypr.clustering import gmm
cen_lst, cov_lst, p_k, logL = gmm.em_gm(train_data, max_iter=100, K=3)
score = [-gmm.gm_log_likelihood(train_data[i, :], center_list=cen_lst,
                                cov_list=cov_lst, p_k=p_k)
         for i in range(0, train_data.shape[0])]
# print(score)
print(metric(train_lbl, score))

# Marginalize over a subset of the features
# m_cen_lst, m_cov_lst, m_p_k
featur_inc = [0, 4]
marginalize = gmm.marg_dist(featur_inc, cen_lst, cov_lst, p_k)
# (the source snippet is cut off mid-call; completed following the pattern above)
score2 = score = [-gmm.gm_log_likelihood(train_data[i, featur_inc],
                                         center_list=marginalize[0],
                                         cov_list=marginalize[1],
                                         p_k=marginalize[2])
                  for i in range(train_data.shape[0])]
""" wall mixture model """ from points import points, N from wmmlib import * from pylab import * import pylab as pl from copy import copy import time import pypr.clustering.gmm as gmm # download from http://pypr.sourceforge.net/ import argparse """ settings """ K = 4 print N """ plot function """ gmm_cen_lst, gmm_cov_lst, p_k, logL = gmm.em_gm(points, K=K) def debug_output(L, gamma=None, iter_num=None): """ check final """ if gamma != None: final = iter_num == None if final: ioff() # else: return """ text output """ if gamma != None: print "iteration %d..." % iter_num if not final else "\nFINAL"
sigma_scl = 0.1
X = np.zeros((samples_pr_cluster * K_orig, D))
for k in range(K_orig):
    mu = np.random.randn(D)
    sigma = np.eye(D) * sigma_scl
    cen_lst.append(mu)
    cov_lst.append(sigma)
mc = np.ones(K_orig) / K_orig  # All clusters equally probable

# Sample from the mixture:
N = 1000
X = gmm.sample_gaussian_mixture(cen_lst, cov_lst, mc, samples=N)

K_range = list(range(2, 10))
runs = 10
bic_table = np.zeros((len(K_range), runs))
for K_idx, K in enumerate(K_range):
    print("Clustering for K=%d" % K)
    for i in range(runs):
        cluster_init_kw = {'cluster_init': 'sample', 'max_init_iter': 5,
                           'cov_init': 'var', 'verbose': True}
        cen_lst, cov_lst, p_k, logL = gmm.em_gm(X, K=K, max_iter=1000,
                                                delta_stop=1e-2,
                                                init_kw=cluster_init_kw,
                                                verbose=True, max_tries=10)
        bic = stattest.bic_gmm(logL, N, D, K)
        bic_table[K_idx, i] = bic

plot(K_range, bic_table)
xlabel('K')
ylabel('BIC score')
title('True K=%d, dim=%d' % (K_orig, D))  # fixed: % applied inside title()
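# Follow-up (our addition): pick K by the lowest BIC averaged over runs.
best_K = K_range[int(np.argmin(bic_table.mean(axis=1)))]
print("Best K by mean BIC: %d (true K=%d)" % (best_K, K_orig))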
for i in range(len(cen_lst)):
    x, y = gmm.gauss_ellipse_2d(cen_lst[i], cov_lst[i])
    plot(x, y, 'k', linewidth=0.5)

seed(1)
mc = [0.4, 0.4, 0.2]  # Mixing coefficients
centroids = [array([0, 0]), array([3, 3]), array([0, 4])]
ccov = [array([[1, 0.4], [0.4, 1]]), diag((1, 2)), diag((0.4, 0.1))]

# Generate samples from the gaussian mixture model
X = gmm.sample_gaussian_mixture(centroids, ccov, mc, samples=1000)
fig1 = figure()
plot(X[:, 0], X[:, 1], '.')

# Expectation-Maximization of Mixture of Gaussians
cen_lst, cov_lst, p_k, logL = gmm.em_gm(X, K=3, max_iter=400,
                                        verbose=True, iter_call=iter_plot)
print("Log likelihood (how well the data fits the model) =", logL)

# Plot the cluster ellipses
for i in range(len(cen_lst)):
    x1, x2 = gmm.gauss_ellipse_2d(cen_lst[i], cov_lst[i])
    plot(x1, x2, 'k', linewidth=2)
title("")
xlabel(r'$x_1$')
ylabel(r'$x_2$')

# Now we will find the conditional distribution of x given y
fig2 = figure()
ax1 = subplot(111)
plot(X[:, 0], X[:, 1], ',')
y = -1.0
axhline(y)
x1plt = np.linspace(axis()[0], axis()[1], 200)
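# The snippet stops here; a sketch (our addition) of evaluating the
# conditional density of x1 given y on the grid just created, mirroring the
# cond_dist usage quoted earlier in this collection.
con_cen, con_cov, new_p_k = gmm.cond_dist(np.array([np.nan, y]),
                                          cen_lst, cov_lst, p_k)
dens = [np.exp(gmm.gm_log_likelihood(np.array([[x1]]),
                                     con_cen, con_cov, new_p_k))
        for x1 in x1plt]
ax2 = ax1.twinx()
ax2.plot(x1plt, dens, 'r', linewidth=2)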