import numpy as np
import pypr.clustering.gmm as pypr_gmm

def sample_cond_dist(self, Y, n_samples):
    """
    Return conditional samples from the Gaussian mixture model.

    Keyword arguments:
    Y -- A numpy vector of the same length as the model's input dimension,
         holding either values or np.nan. For example, a two-dimensional
         input could be np.array([3, np.nan]); the GMM would then be
         sampled with the first dimension fixed at 3.
    n_samples -- Number of requested samples
    """
    # Get the conditional distribution given the observed entries of Y
    (con_means, con_covariances, con_weights) = self.cond_dist(Y)
    # Sample from the conditional distribution
    samples = pypr_gmm.sample_gaussian_mixture(con_means, con_covariances,
                                               con_weights, n_samples)
    # Find the columns where the NaNs (the free dimensions) are
    nan_cols = np.where(np.isnan(Y))[0]
    # Extend the input to the length of the samples
    full_samples = np.tile(Y, (n_samples, 1))
    # Copy the sampled columns into the free dimensions
    full_samples[:, nan_cols] = samples
    return full_samples
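# A minimal, runnable usage sketch (assumed, not from the original source):
# CondSampler is a hypothetical wrapper that stores mixture parameters and
# delegates cond_dist to pypr, so the method above can be exercised directly.
import numpy as np
import pypr.clustering.gmm as pypr_gmm

class CondSampler(object):
    def __init__(self, means, covs, weights):
        self.means, self.covs, self.weights = means, covs, weights
    def cond_dist(self, Y):
        return pypr_gmm.cond_dist(Y, self.means, self.covs, self.weights)

CondSampler.sample_cond_dist = sample_cond_dist  # attach the method above

model = CondSampler([np.array([0., 0.]), np.array([3., 3.])],
                    [np.eye(2), np.eye(2)], [0.5, 0.5])
full = model.sample_cond_dist(np.array([3.0, np.nan]), 1000)
# full has shape (1000, 2); column 0 is fixed at 3.0, column 1 is sampled.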
from numpy import array, diag
import pypr.clustering.gmm as gmm

def generateData(n):
    mc = [0.4, 0.4, 0.2]  # Mixing coefficients
    centroids = [array([0, 0]), array([3, 3]), array([0, 4])]
    ccov = [array([[1, 0.4], [0.4, 1]]), diag((1, 2)), diag((0.4, 0.1))]
    # Generate samples from the Gaussian mixture model
    X = gmm.sample_gaussian_mixture(centroids, ccov, mc, samples=n)
    return X
import numpy as np
import pypr.clustering.gmm as gmm

def generate_data(n_samples):
    mc = [0.4, 0.4, 0.2]  # Mixing coefficients
    centroids = [np.array([0, 0]), np.array([3, 3]), np.array([0, 4])]
    ccov = [np.array([[1, 0.4], [0.4, 1]]), np.diag((1, 2)),
            np.diag((0.4, 0.1))]
    # Generate samples from the Gaussian mixture model
    samples = gmm.sample_gaussian_mixture(centroids, ccov, mc,
                                          samples=n_samples)
    xs, ys = samples[:, 0], samples[:, 1]
    probs = np.zeros([n_samples], dtype=np.float32)
    for it in range(n_samples):
        # Condition the mixture on the sampled x and evaluate p(y | x)
        input_ = np.array([xs[it], np.nan])
        con_cen, con_cov, new_p_k = gmm.cond_dist(input_, centroids, ccov, mc)
        probs[it] = gmm.gmm_pdf(ys[it], con_cen, con_cov, new_p_k)
    return xs, ys, probs
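# Quick visual check (assumed usage, not from the source): scatter the samples
# colored by the conditional density p(y | x) returned by generate_data.
import matplotlib.pyplot as plt

xs, ys, probs = generate_data(1000)
plt.scatter(xs, ys, c=probs, s=8)
plt.colorbar(label='p(y | x)')
plt.show()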
import numpy as np
from matplotlib.pylab import plot
import pypr.clustering.gmm as gmm
import pypr.stattest as stattest

# Assumed example values for the configuration the snippet leaves undefined:
D = 2                     # dimensionality of the data
K_orig = 3                # number of clusters used to generate the data
samples_pr_cluster = 100

cen_lst = []
cov_lst = []
# Generate cluster centers, covariance, and mixing coefficients:
sigma_scl = 0.1
X = np.zeros((samples_pr_cluster * K_orig, D))
for k in range(K_orig):
    mu = np.random.randn(D)
    sigma = np.eye(D) * sigma_scl
    cen_lst.append(mu)
    cov_lst.append(sigma)
mc = np.ones(K_orig) / K_orig  # All clusters equally probable

# Sample from the mixture:
N = 1000
X = gmm.sample_gaussian_mixture(cen_lst, cov_lst, mc, samples=N)

K_range = list(range(2, 10))
runs = 10
bic_table = np.zeros((len(K_range), runs))
for K_idx, K in enumerate(K_range):
    print("Clustering for K=%d" % K)
    for i in range(runs):
        cluster_init_kw = {'cluster_init': 'sample', 'max_init_iter': 5,
                           'cov_init': 'var', 'verbose': True}
        cen_lst, cov_lst, p_k, logL = gmm.em_gm(X, K=K, max_iter=1000,
                                                delta_stop=1e-2,
                                                init_kw=cluster_init_kw,
                                                verbose=True, max_tries=10)
        bic = stattest.bic_gmm(logL, N, D, K)
        bic_table[K_idx, i] = bic

plot(K_range, bic_table)
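# Follow-up sketch (assumed, not from the source): pick the number of
# components with the lowest mean BIC across the runs.
best_K = K_range[int(np.argmin(bic_table.mean(axis=1)))]
print("Best K by mean BIC: %d" % best_K)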
import pypr.clustering.gmm as gmm

def generate_samples(self):
    self.samples = gmm.sample_gaussian_mixture(self.mean, self.var,
                                               self.weight,
                                               samples=self.n_samples)
import pypr.clustering.gmm as gmm

def sample(self, nsamples=1):
    '''Produce samples from the mixture.'''
    return gmm.sample_gaussian_mixture(self.mu, self.sigma, self.pi, nsamples)
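# Hedged usage sketch (class name and attributes assumed, for illustration
# only): a thin container whose sample() delegates to pypr.
import numpy as np

class MixtureModel(object):
    def __init__(self, mu, sigma, pi):
        self.mu, self.sigma, self.pi = mu, sigma, pi

MixtureModel.sample = sample  # attach the method defined above

m = MixtureModel([np.zeros(2), np.ones(2) * 3],
                 [np.eye(2), np.eye(2)], [0.5, 0.5])
print(m.sample(nsamples=5))  # a (5, 2) array of draws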
# Drawing samples from a Gaussian Mixture Model
from numpy import *
from matplotlib.pylab import *
import pypr.clustering.gmm as gmm

mc = [0.4, 0.4, 0.2]  # Mixing coefficients
centroids = [array([0, 0]), array([3, 3]), array([0, 4])]
ccov = [array([[1, 0.4], [0.4, 1]]), diag((1, 2)),
        diag((0.4, 0.1))]  # Covariance matrices

X = gmm.sample_gaussian_mixture(centroids, ccov, mc, samples=1000)
plot(X[:, 0], X[:, 1], '.')
for i in range(len(mc)):
    x1, x2 = gmm.gauss_ellipse_2d(centroids[i], ccov[i])
    plot(x1, x2, 'k', linewidth=2)
xlabel('$x_1$'); ylabel('$x_2$')
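# Follow-up sketch continuing the script above (assumed; the cond_dist call
# mirrors the other snippets in this section): condition the mixture on
# x1 = 1 and sample x2 from the resulting one-dimensional mixture.
con_cen, con_cov, con_mc = gmm.cond_dist(array([1.0, nan]), centroids, ccov, mc)
x2_samples = gmm.sample_gaussian_mixture(con_cen, con_cov, con_mc, samples=1000)
hist(x2_samples[:, 0], bins=30)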
# Drawing samples from a Gaussian Mixture Model
import sys
import numpy as np
from numpy import *
from matplotlib.pylab import *
import pypr.clustering.gmm as gmm
import pypr.stattest as stattest

seed(10)
mc = [0.4, 0.4, 0.2]  # Mixing coefficients
centroids = [array([0, 0]), array([3, 3]), array([0, 4])]
ccov = [array([[1, 0.4], [0.4, 1]]), diag((1, 2)),
        diag((0.4, 0.1))]  # Covariance matrices

T = gmm.sample_gaussian_mixture(centroids, ccov, mc, samples=500)  # training
V = gmm.sample_gaussian_mixture(centroids, ccov, mc, samples=500)  # validation
plot(T[:, 0], T[:, 1], '.')

# Expectation-Maximization of Mixture of Gaussians
Krange = range(1, 20 + 1)
runs = 1
meanLogL_train = np.zeros((len(Krange), runs))
meanLogL_valid = np.zeros((len(Krange), runs))
for K in Krange:
    print("Clustering for K =", K); sys.stdout.flush()
    for r in range(runs):
        cen_lst, cov_lst, p_k, logL = gmm.em_gm(T, K=K, max_iter=100)
        meanLogL_train[K - 1, r] = logL
        meanLogL_valid[K - 1, r] = gmm.gm_log_likelihood(V, cen_lst, cov_lst, p_k)

fig1 = figure()
subplot(1, 2, 1)
for r in range(runs):
    plot(Krange, meanLogL_train[:, r])  # training log-likelihood per run
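# Follow-up sketch (assumed): the validation log-likelihood, not the training
# one, indicates which K generalizes best.
best_K = Krange[int(np.argmax(meanLogL_valid.mean(axis=1)))]
print("Best K by validation log-likelihood:", best_K)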
import numpy as np
import pypr.clustering.gmm as pygmm

if __name__ == '__main__':
    # Generate data from two gaussians
    alpha = 0.05
    epsilon = 0.0
    mu = 0.0
    mu1 = mu - epsilon
    mu2 = mu + epsilon
    sigma1 = 0.05
    sigma2 = 0.05
    N = 1000
    # Sample data from the two-component mixture
    X = pygmm.sample_gaussian_mixture([np.array([mu1]), np.array([mu2])],
                                      [[[sigma1]], [[sigma2]]],
                                      [alpha, 1. - alpha], samples=N)[:, 0]
    dx = 0.2
    deltaX = dx * np.random.randn(N)

    # Component densities and gradients with respect to epsilon
    f1 = lambda x, epsilon: (1. / (np.sqrt(2 * np.pi) * sigma1)
                             * np.exp(-0.5 * (x - mu + epsilon)**2. / sigma1**2.))
    f2 = lambda x, epsilon: (1. / (np.sqrt(2 * np.pi) * sigma2)
                             * np.exp(-0.5 * (x - mu - epsilon)**2. / sigma2**2.))
    h = lambda x, epsilon: alpha * f1(x, epsilon) + (1. - alpha) * f2(x, epsilon)
    g = lambda x, epsilon: ((1. - alpha) * f2(x, epsilon) * (x - mu - epsilon) / sigma2**2.
                            - alpha * f1(x, epsilon) * (x - mu + epsilon) / sigma1**2.)
    par_g_eps = lambda x, epsilon: (alpha * f1(x, epsilon) * ((x - mu + epsilon)**2. - sigma1**2.) / sigma1**4.
                                    + (1. - alpha) * f2(x, epsilon) * ((x - mu - epsilon)**2. - sigma2**2.) / sigma2**4.)
    par_g_x = lambda x, epsilon: (alpha * f1(x, epsilon) * (x - mu + epsilon)**2. / sigma1**4.
                                  - alpha * f1(x, epsilon) / sigma1**2.
                                  + (1. - alpha) * f2(x, epsilon) / sigma2**2.
                                  - (1. - alpha) * f2(x, epsilon) * (x - mu - epsilon)**2. / sigma2**4.)
    par_h_x = lambda x, epsilon: (-(1. - alpha) * f2(x, epsilon) * (x - mu - epsilon) / sigma2**2.
                                  - alpha * f1(x, epsilon) * (x - mu + epsilon) / sigma1**2.)
    par_ll_eps_fct = lambda epsilon, x: g(x, epsilon) / h(x, epsilon)
    par_ll_eps_sum_fct = lambda epsilon, x: np.sum(np.ma.masked_invalid(par_ll_eps_fct(epsilon, x)))
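    # Sanity-check sketch (assumed, not in the original): since g = dh/d(eps),
    # the analytic epsilon-gradient of the log-likelihood, sum(g/h), should
    # match a central finite difference of sum(log h).
    ll = lambda e: np.sum(np.log(h(X, e)))
    d = 1e-6
    fd = (ll(epsilon + d) - ll(epsilon - d)) / (2. * d)
    print("analytic:", par_ll_eps_sum_fct(epsilon, X), "finite diff:", fd)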
from numpy import *
from matplotlib.pylab import *
from mpl_toolkits.mplot3d import Axes3D
import pypr.clustering.gmm as gmm

seed(1)
mc = [0.4, 0.4, 0.2]  # Mixing coefficients
centroids = [array([0, 0, 0]), array([3, 3, 2]), array([0, 4, 3])]
ccov = [array([[1, 0.4, 0.4], [0.4, 1, 0.4], [0.4, 0.4, 1]]),
        diag((1, 2, 0.4)), diag((0.4, 0.1, 1))]

# Generate samples from the Gaussian mixture model
X = gmm.sample_gaussian_mixture(centroids, ccov, mc, samples=500)
fig = figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X[:, 0], X[:, 1], X[:, 2], alpha=0.4)

# Expectation-Maximization of Mixture of Gaussians
cen_lst, cov_lst, p_k, logL = gmm.em_gm(X, K=4, max_iter=400, verbose=True,
                                        iter_call=None)
print("Log likelihood (how well the data fits the model) =", logL)
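# Follow-up sketch (assumed; gm_assign_to_cluster appears in pypr's gmm
# module, but treat the call as an assumption): hard-assign each sample to
# its most probable fitted component and count cluster sizes.
import numpy as np

assignments = gmm.gm_assign_to_cluster(X, cen_lst, cov_lst, p_k)
for k in range(4):
    print("cluster %d: %d samples" % (k, np.sum(np.array(assignments) == k)))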