def sample(self):
    self._check_is_valid_density()
    sample = sample_vMF(
        mu=self.get_normalized_mean(),
        kappa=self.get_concentration(),
        num_samples=1)[0]
    return sample
Example #2
def test_maximization():
    num_points = 5000
    n_features = 500
    posterior = np.ones((1, num_points))

    kappas = [5000, 8000, 16400]
    for kappa in kappas:
        mu = np.random.randn(n_features)
        mu /= np.linalg.norm(mu)

        X = sample_vMF(mu, kappa, num_points)

        centers, weights, concentrations = (
            von_mises_fisher_mixture._maximization(X, posterior))

        print('center estimate error', np.linalg.norm(centers[0, :] - mu))
        print('kappa estimate',
              np.abs(kappa - concentrations[0]) / kappa, kappa,
              concentrations[0])

        assert_almost_equal(1., weights[0])
        assert_almost_equal(0.0,
                            np.abs(kappa - concentrations[0]) / kappa,
                            decimal=2)
        assert_almost_equal(0.0, np.linalg.norm(centers[0, :] - mu), decimal=2)
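For reference, a minimal self-contained check of sample_vMF itself (assuming spherecluster is installed; the dimension and kappa here are arbitrary):

import numpy as np
from spherecluster import sample_vMF

mu = np.random.randn(5)
mu /= np.linalg.norm(mu)          # the vMF mean direction must be unit length
X = sample_vMF(mu, kappa=50, num_samples=100)
print(X.shape)                                        # (100, 5)
print(np.allclose(np.linalg.norm(X, axis=1), 1.0))    # True: samples lie on the unit sphere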
Example #3
def sample_mix_vMF(center, kappa, weight, num_doc):
    distrib_idx = np.random.choice(range(len(center)), num_doc, p=weight)
    samples = []
    for idx in distrib_idx:
        samples.append(sample_vMF(center[idx], kappa[idx], 1))
    samples = np.array(samples)
    samples = np.reshape(samples, (num_doc, -1))
    return samples
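A minimal usage sketch for the mixture sampler above (the centers, kappas, and weights are illustrative; sample_vMF must be imported as in the other examples):

import numpy as np

centers = np.eye(3)[:2]            # two unit-norm component means in R^3
kappas = [50, 10]
weights = [0.7, 0.3]               # must sum to 1
docs = sample_mix_vMF(centers, kappas, weights, num_doc=100)
print(docs.shape)                  # (100, 3)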
Example #4
def generateDataset(mu_s, kappa_s, num_samples):
    num_clusters, dim = mu_s.shape

    X_s_numpy = np.zeros((num_clusters, num_samples, dim))
    Y_s_numpy = np.zeros((num_clusters, num_samples))
    for index in range(num_clusters):
        X_s_numpy[index] = sample_vMF(mu_s[index], kappa_s[index], num_samples)
        Y_s_numpy[index] = index

    return X_s_numpy, Y_s_numpy
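A quick illustrative call (the means and concentrations are made up; sample_vMF imported as above):

import numpy as np

mu_s = np.random.randn(2, 4)
mu_s /= np.linalg.norm(mu_s, axis=1, keepdims=True)   # unit-norm rows
kappa_s = np.array([20.0, 5.0])
X_s, Y_s = generateDataset(mu_s, kappa_s, num_samples=100)
print(X_s.shape, Y_s.shape)        # (2, 100, 4) (2, 100)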
Example #5
    def sample(self):
        self._check_is_valid_density()

        alpha = self.natural_parameters['alpha_minus_one'] + 1.0
        beta = self.natural_parameters['beta']
        sample_concentration = scipy.stats.gamma.rvs(
                a=alpha, scale=1.0/beta, size=1)[0]

        sample_mean = sample_vMF(
                mu=self.get_normalized_mean(),
                kappa=sample_concentration,
                num_samples=1)[0]

        return dict(
                mean=sample_mean,
                concentration=sample_concentration,
                )
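Note that scipy.stats.gamma is parameterized by a shape a and a scale, so with a rate parameter beta the scale is 1/beta and the expected concentration is alpha/beta. A quick sanity check (values are arbitrary):

import scipy.stats

alpha, beta = 3.0, 0.5
draws = scipy.stats.gamma.rvs(a=alpha, scale=1.0 / beta, size=100000)
print(draws.mean())                # close to alpha / beta = 6.0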
Example #6
def create_data(seed, vmeans, vkappas, num_per_class=500):
    np.random.seed(seed)
    data = [[], []]
    for view in range(2):
        for comp in range(len(vmeans[0])):
            comp_samples = sample_vMF(vmeans[view][comp], vkappas[view][comp],
                                      num_per_class)
            data[view].append(comp_samples)
    for view in range(2):
        data[view] = np.vstack(data[view])

    labels = list()
    for ind in range(len(vmeans[0])):
        labels.append(ind * np.ones(num_per_class, ))

    labels = np.concatenate(labels)

    return data, labels
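An illustrative two-view call (component means and kappas are made up; sample_vMF imported as above):

import numpy as np

vmeans = [np.eye(3)[:2], np.eye(3)[1:]]   # per-view unit-norm component means
vkappas = [[30, 30], [10, 10]]
data, labels = create_data(seed=0, vmeans=vmeans, vkappas=vkappas, num_per_class=50)
print(data[0].shape, data[1].shape, labels.shape)   # (100, 3) (100, 3) (100,)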
Example #7
def sample_observation(self, parameters):
    """ Sample observations
    Args:
        parameters (Map):
            normalized_mean (ndarray): length num_dim, l2 norm = 1
            concentration (double): positive
    Returns:
        obs (ndarray)
    """
    from spherecluster import sample_vMF
    if not np.isclose(np.linalg.norm(parameters.normalized_mean), 1.0):
        raise ValueError("normalized_mean must have l2 norm = 1.0")
    if parameters.concentration < 1e-16:
        raise ValueError("concentration must be positive")
    obs = sample_vMF(mu=parameters.normalized_mean,
                     kappa=parameters.concentration,
                     num_samples=1)[0]
    return obs
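The parameters argument only needs normalized_mean and concentration attributes; a hypothetical namedtuple container works, for example:

import numpy as np
from collections import namedtuple

# Hypothetical container matching the attribute access above
Params = namedtuple('Params', ['normalized_mean', 'concentration'])
params = Params(normalized_mean=np.array([0.6, 0.8]),   # unit l2 norm
                concentration=25.0)
# obs = model.sample_observation(params)  # `model` is an instance of the class above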
Example #8
def pseudodocs(word_sup_array, total_num, background_array, sequence_length,
               len_avg, len_std, num_doc, interp_weight, vocabulary_inv,
               embedding_mat, centers, kappa, model, save_dir=None):

	for i in range(len(embedding_mat)):
		embedding_mat[i] = embedding_mat[i] / np.linalg.norm(embedding_mat[i])

	# _, centers, kappas = \
	# label_expansion(word_sup_array, save_dir, vocabulary_inv, embedding_mat)

	print("Pseudo documents generation...")
	background_vec = interp_weight * background_array
	if model == 'cnn':
		docs = np.zeros((num_doc*len(word_sup_array), sequence_length), dtype='int32')
		label = np.zeros((num_doc*len(word_sup_array), len(word_sup_array)))
		for i in range(len(word_sup_array)):
			docs_len = len_avg*np.ones(num_doc)
			center = centers[i]
			# kappa = kappas[i]
			discourses = sample_vMF(center, kappa, num_doc)
			for j in range(num_doc):
				discourse = discourses[j]
				prob_vec = np.dot(embedding_mat, discourse)
				prob_vec = np.exp(prob_vec)
				sorted_idx = np.argsort(prob_vec)[::-1]
				delete_idx = sorted_idx[total_num:]
				prob_vec[delete_idx] = 0
				prob_vec /= np.sum(prob_vec)
				prob_vec *= 1 - interp_weight
				prob_vec += background_vec
				doc_len = int(docs_len[j])
				docs[i*num_doc+j][:doc_len] = np.random.choice(len(prob_vec), size=doc_len, p=prob_vec)
				label[i*num_doc+j] = interp_weight/len(word_sup_array)*np.ones(len(word_sup_array))
				label[i*num_doc+j][i] += 1 - interp_weight
	elif model == 'rnn':
		docs = np.zeros((num_doc*len(word_sup_array), sequence_length[0], sequence_length[1]), dtype='int32')
		label = np.zeros((num_doc*len(word_sup_array), len(word_sup_array)))
		doc_len = int(len_avg[0])
		sent_len = int(len_avg[1])
		# find the vocabulary index of the period token
		for period_idx in vocabulary_inv:
			if vocabulary_inv[period_idx] == '.':
				break
		for i in range(len(word_sup_array)):
			center = centers[i]
			# kappa = kappas[i]
			discourses = sample_vMF(center, kappa, num_doc)
			for j in range(num_doc):
				discourse = discourses[j]
				prob_vec = np.dot(embedding_mat, discourse)
				prob_vec = np.exp(prob_vec)
				sorted_idx = np.argsort(prob_vec)[::-1]
				delete_idx = sorted_idx[total_num:]
				prob_vec[delete_idx] = 0
				prob_vec /= np.sum(prob_vec)
				prob_vec *= 1 - interp_weight
				prob_vec += background_vec
				for k in range(doc_len):
					docs[i*num_doc+j][k][:sent_len] = np.random.choice(len(prob_vec), size=sent_len, p=prob_vec)
					docs[i*num_doc+j][k][sent_len] = period_idx
				label[i*num_doc+j] = interp_weight/len(word_sup_array)*np.ones(len(word_sup_array))
				label[i*num_doc+j][i] += 1 - interp_weight

	print("Finished Pseudo documents generation.")
	return docs, label
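The core of both branches is the same word distribution: exponentiated embedding/discourse dot products, truncated to the top total_num words, then interpolated with the background distribution. A standalone sketch of that step (all sizes and weights are illustrative):

import numpy as np

vocab_size, dim = 1000, 50
embedding_mat = np.random.randn(vocab_size, dim)
embedding_mat /= np.linalg.norm(embedding_mat, axis=1, keepdims=True)
discourse = np.random.randn(dim)
discourse /= np.linalg.norm(discourse)
background = np.full(vocab_size, 1.0 / vocab_size)
interp_weight, total_num = 0.2, 50

prob_vec = np.exp(embedding_mat @ discourse)       # unnormalized vMF-style word scores
prob_vec[np.argsort(-prob_vec)[total_num:]] = 0    # keep only the top `total_num` words
prob_vec /= prob_vec.sum()                         # renormalize over the kept words
prob_vec = (1 - interp_weight) * prob_vec + interp_weight * background
words = np.random.choice(vocab_size, size=30, p=prob_vec)   # one pseudo-document's tokens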
Example #9
def bow_pseudodocs(relevant_nodes,
                   expand_num,
                   background_array,
                   sequence_length,
                   len_avg,
                   len_std,
                   num_doc,
                   interp_weight,
                   vocabulary_inv,
                   embedding_mat,
                   class_emb,
                   kappa,
                   save_dir=None,
                   total_num=50):
    n_classes = len(relevant_nodes)

    for i in range(len(embedding_mat)):
        embedding_mat[i] = embedding_mat[i] / np.linalg.norm(embedding_mat[i])

    background_vec = interp_weight * background_array
    docs = np.zeros((num_doc * n_classes, sequence_length), dtype='int32')
    label = np.zeros((num_doc * n_classes, n_classes))

    for i in range(n_classes):
        docs_len = len_avg * np.ones(num_doc)

        relevant_node = relevant_nodes[i]
        if relevant_node.children:
            children_nodes = relevant_node.children
            child_doc = num_doc // len(children_nodes)
            rm_doc = num_doc % len(children_nodes)

            # draw the remainder from the first child, then child_doc more per child
            children_node = children_nodes[0]
            center = class_emb[children_node.name]
            discourses = sample_vMF(center, kappa, rm_doc)

            for children_node in children_nodes:
                center = class_emb[children_node.name]
                discourses_child = sample_vMF(center, kappa, child_doc)
                discourses = np.concatenate((discourses, discourses_child),
                                            axis=0)
        else:
            center = class_emb[relevant_node.name]
            discourses = sample_vMF(center, kappa, num_doc)

        for j in range(num_doc):
            discourse = discourses[j]
            prob_vec = np.dot(embedding_mat, discourse)
            prob_vec = np.exp(prob_vec)
            sorted_idx = np.argsort(-prob_vec)
            delete_idx = sorted_idx[total_num:]
            prob_vec[delete_idx] = 0
            prob_vec /= np.sum(prob_vec)
            prob_vec *= 1 - interp_weight
            prob_vec += background_vec
            doc_len = int(docs_len[j])
            docs[i * num_doc + j][:doc_len] = np.random.choice(len(prob_vec),
                                                               size=doc_len,
                                                               p=prob_vec)
            label[i * num_doc +
                  j] = interp_weight / n_classes * np.ones(n_classes)
            label[i * num_doc + j][i] += 1 - interp_weight

    return docs, label
Example #10
    return raw_input(val)


###############################################################################
# Generate small-mix dataset
mu_0 = np.array([-0.251, -0.968, -0.105])
mu_0 = mu_0 / np.linalg.norm(mu_0)
mu_1 = np.array([0.399, 0.917, 0.713])
mu_1 = mu_1 / np.linalg.norm(mu_1)
mus = [mu_0, mu_1]
kappa_0 = 8  # concentration parameter
kappa_1 = 2  # concentration parameter
kappas = [kappa_0, kappa_1]
num_points_per_class = 300

X_0 = sample_vMF(mu_0, kappa_0, num_points_per_class)
X_1 = sample_vMF(mu_1, kappa_1, num_points_per_class)
X = np.zeros((2 * num_points_per_class, 3))
X[:num_points_per_class, :] = X_0
X[num_points_per_class:, :] = X_1
labels = np.zeros((2 * num_points_per_class, ))
labels[num_points_per_class:] = 1

###############################################################################
# K-Means clustering
km = KMeans(n_clusters=2, init='k-means++', n_init=20)
km.fit(X)

cdists = []
for center in km.cluster_centers_:
    cdists.append(np.linalg.norm(mus[0] - center))
Example #11
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # NOQA: registers the 3d projection
import seaborn  # NOQA

from spherecluster import sample_vMF

plt.ion()

n_clusters = 3
mus = np.random.randn(3, n_clusters)
mus, r = np.linalg.qr(mus, mode='reduced')  # orthonormalize the cluster directions

kappas = [15, 15, 15]
num_points_per_class = 250

Xs = []
for nn in range(n_clusters):
    new_X = sample_vMF(mus[nn], kappas[nn], num_points_per_class)
    Xs.append(new_X.T)

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(1,
                     1,
                     1,
                     aspect='equal',
                     projection='3d',
                     adjustable='box-forced',
                     xlim=[-1.1, 1.1],
                     ylim=[-1.1, 1.1],
                     zlim=[-1.1, 1.1])

colors = ['b', 'r', 'g']
for nn in range(n_clusters):