def estimate(self, data, Labels=None, maxiter=300, delta=0.001, ninit=1):
    """
    Estimation of the GMM parameters from the data, using an EM algorithm

    Parameters
    ----------
    data : array of shape (n, p)
        feature array, n = number of items, p = feature dimension
    Labels : array of shape (n) or None, optional
        prior labelling of the data (this may improve convergence)
    maxiter : int, optional
        maximum number of iterations of the EM algorithm
    delta : float, optional
        criterion on the log-likelihood increments to declare convergence
    ninit : int, optional
        number of random initializations of the GMM estimation

    Returns
    -------
    Labels : array of shape (n), dtype int
        discrete labelling of the data items into clusters
    LL : float
        average log-likelihood of the data
    bic : float
        associated BIC criterion
    """
    data = self.check_x(data)
    if Labels is None:
        # no prior labelling: initialize with a short k-means run
        Labels = np.zeros(data.shape[0], np.int_)
        nit = 10
        C, Labels, J = fc.kmeans(data, self.k, Labels, nit)
    if self.k > data.shape[0] - 1:
        print("too many clusters")
        self.k = data.shape[0] - 1
    if self.prec_type == 'full':
        prec_type = 0
    if self.prec_type == 'diag':
        prec_type = 1

    # first EM run, initialized from the current labelling
    C, P, W, Labels, bll = fc.gmm(data, self.k, Labels, prec_type,
                                  maxiter, delta)
    self.means = C
    if self.prec_type == 'diag':
        self.precisions = P
    if self.prec_type == 'full':
        self.precisions = np.reshape(P, (self.k, self.dim, self.dim))
    self.weights = W
    self.check()

    # further restarts: keep the parameters of the best-likelihood run
    for i in range(ninit - 1):
        Labels = np.zeros(data.shape[0], np.int_)
        C, P, W, labels, ll = fc.gmm(data, self.k, Labels,
                                     prec_type, maxiter, delta)
        if ll > bll:
            self.means = C
            if self.prec_type == 'diag':
                self.precisions = P
            if self.prec_type == 'full':
                self.precisions = np.reshape(P,
                                             (self.k, self.dim, self.dim))
            self.weights = W
            self.check()
            bll = ll
            Labels = labels

    return Labels, bll, self.bic_from_all(bll, data.shape[0])
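# Usage sketch (illustrative only, not part of the original module): how
# estimate() might be called on synthetic data. The GMM constructor arguments
# shown here (k, dim, prec_type) are assumptions inferred from the attributes
# used in estimate(); the real constructor signature may differ.
def _example_estimate_usage():
    import numpy as np
    # two 2-d Gaussian clusters, the second one shifted by 3 in each dimension
    data = np.random.randn(1000, 2)
    data[500:] += 3
    model = GMM(k=2, dim=2, prec_type='diag')  # hypothetical constructor call
    labels, ll, bic = model.estimate(data, maxiter=300, delta=1e-3, ninit=3)
    print(labels[:10], model.means, model.weights, ll, bic)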
def testgmm3(self):
    # two 2-d Gaussian clusters of 7000 and 3000 points, the first one
    # shifted by 3 in each dimension
    X = nr.randn(10000, 2)
    A = np.concatenate([np.ones((7000, 2)), np.zeros((3000, 2))])
    X = X + 3 * A
    # deliberately mismatched initial labelling (50/50 split)
    L = np.concatenate([np.ones(5000), np.zeros(5000)]).astype(np.int_)
    # run the EM with the three precision models (codes 2, 1 and 0)
    C, P, W, L, J = fc.gmm(X, 2, L, 2)
    C, P, W, L, J = fc.gmm(X, 2, L, 1)
    C, P, W, L, J = fc.gmm(X, 2, L, 0)
    # most of the first 7000 (shifted) points should share the same label
    l = L[:7000].astype('d')
    self.assert_(np.mean(l) > 0.9)
def testgmm2(self):
    # two 2-d Gaussian clusters of 7000 and 3000 points, the first one
    # shifted by 3 in each dimension
    X = nr.randn(10000, 2)
    A = np.concatenate([np.ones((7000, 2)), np.zeros((3000, 2))])
    X = X + 3 * A
    C, P, W, L, J = fc.gmm(X, 2)
    # the estimated mixture weights should clearly differ (about 0.7 vs 0.3)
    dW = W[0] - W[1]
    ndW = dW * dW
    self.assert_(ndW > 0.1)
def testgmm1_andCheckResult(self):
    np.random.seed(0)  # force the random sequence
    # two 2-d Gaussian clusters of 7000 and 3000 points, the first one
    # shifted by 3 in each dimension
    X = nr.randn(10000, 2)
    A = np.concatenate([np.ones((7000, 2)), np.zeros((3000, 2))])
    X = X + 3 * A
    L = np.concatenate([np.ones(5000), np.zeros(5000)]).astype(np.int_)
    C, P, W, L, J = fc.gmm(X, 2, L)
    np.random.seed(None)  # re-randomize the seed
    # expected results for random seed = 0
    expectC = np.concatenate([np.zeros((1, 2)), 3 * np.ones((1, 2))])
    expectP = np.ones((2, 2))
    expectW = np.array([0.3, 0.7])
    expectSL = 7000
    self.assert_(np.allclose(C, expectC, 0.01, 0.05))
    self.assert_(np.allclose(P, expectP, 0.03, 0.05))
    self.assert_(np.allclose(W, expectW, 0.03, 0.05))
    self.assert_(np.allclose(expectSL, L.sum(), 0.01))