Example #1
import numpy as np
from scipy.stats import multivariate_normal
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

# EM is assumed to be the project's own diagonal-covariance implementation,
# exposing mus, Ds and labels_ after fit (see its usage below).


def model_dict(X, Z, K):
    """Run all the models and store their features into one dictionary for plotting.
    Args:
        - X : (N, d) data
        - Z : (N) labels
        - K : Number of clusters
    Returns:
        - models : Dictionary containing the means, covariance matrices (when available) and cluster labels of each model.
    """
    # Models dictionary
    models = dict()

    # Ground truth
    models["ground truth"] = {
        "mean": np.array([X[Z == k].mean(0) for k in range(len(np.unique(Z)))]),
        "cov": None,
        "labels": Z,
    }

    # Run diagonal EM
    em_diag = EM(K)
    em_diag.fit(X)
    models["diagonal EM"] = {
        "mean": em_diag.mus,
        "cov": np.array([np.diag(em_diag.Ds[k]) for k in range(K)]),
        "labels": em_diag.labels_,
    }

    # Run general EM
    em = GaussianMixture(K)
    em.fit(X)

    # Compute responsibilities
    gaussians = np.array(
        [
            multivariate_normal.pdf(X, em.means_[k], em.covariances_[k])
            * em.weights_[k]
            for k in range(K)
        ]
    )
    r = gaussians / gaussians.sum(0)
    models["general EM"] = {
        "mean": em.means_,
        "cov": em.covariances_,
        "labels": r.argmax(0),
    }

    # Run K-means
    km = KMeans(K)
    km.fit(X)
    models["K-means"] = {"mean": km.cluster_centers_, "cov": None, "labels": km.labels_}

    return models
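The responsibilities computed above are the posterior class probabilities r_k(x_n) = w_k N(x_n | mu_k, Sigma_k) / sum_j w_j N(x_n | mu_j, Sigma_j). For the scikit-learn model this hand-rolled computation should match GaussianMixture.predict_proba and predict; the sketch below checks that on hypothetical synthetic 2D data (not part of the original example).

import numpy as np
from scipy.stats import multivariate_normal
from sklearn.mixture import GaussianMixture

# Hypothetical synthetic data: two well-separated 2D Gaussian blobs.
rng = np.random.default_rng(0)
X = np.vstack([
    rng.normal(loc=(-3, 0), scale=0.5, size=(100, 2)),
    rng.normal(loc=(3, 0), scale=0.5, size=(100, 2)),
])

K = 2
em = GaussianMixture(K, random_state=0).fit(X)

# Hand-rolled responsibilities, as in model_dict above.
gaussians = np.array([
    multivariate_normal.pdf(X, em.means_[k], em.covariances_[k]) * em.weights_[k]
    for k in range(K)
])
r = gaussians / gaussians.sum(0)

# These should agree with scikit-learn's own posteriors (up to numerical noise).
print(np.allclose(r.T, em.predict_proba(X)))
print(np.array_equal(r.argmax(0), em.predict(X)))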
Example #2
import numpy as np

# mnist is assumed to be the project's own loader/viewer module, and EM the
# project's EM implementation taking (data, num_classes, num_nuisances).


def run_mnist():
    # FIXME: running EM on MNIST has the problem that all data collapses to one class.
    # This is because the likelihood for that class is slightly higher than for all the others.
    # Probably has to do with the variance being lower for one class, from k-means,
    # and that being more important than closeness to the mean for such high-dimensional data.
    # Running it with 0 iterations (i.e. on the k-means initialization) works fine; it then
    # finds different orientations of the digits.
    data_per_class = 20

    training_data = list(mnist.read("training"))
    dim_x, dim_y = np.shape(training_data[0][1])
    ones = [d[1] for d in training_data if d[0] == 1]
    fours = [d[1] for d in training_data if d[0] == 4]
    fives = [d[1] for d in training_data if d[0] == 5]

    ones = ones[:data_per_class]
    fours = fours[:data_per_class]
    fives = fives[:data_per_class]

    data = np.array(ones + fours + fives).reshape((-1, dim_x * dim_y))
    solver = EM(data=data, num_classes=3, num_nuisances=3)
    split_data, thetas = solver.fit(max_iter=1)

    for c, class_thetas in enumerate(thetas):
        for n, theta in enumerate(class_thetas):
            print(f"Prior: {theta.prior}, Var: {theta.variance}")
            mnist.show(theta.mean.reshape(dim_x, dim_y))
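One common reason for the collapse described in the FIXME is that per-class likelihoods of 784-dimensional vectors underflow (or one class dominates numerically) when densities are multiplied out directly, so the normalized responsibilities degenerate. A standard remedy is to do the E-step in log space with logsumexp; the sketch below is not part of the original EM class and assumes access to per-class means, diagonal variances and priors.

import numpy as np
from scipy.special import logsumexp
from scipy.stats import multivariate_normal


def log_responsibilities(X, means, variances, priors):
    """E-step in log space for a diagonal-covariance mixture.

    X: (N, d) data; means: (K, d); variances: (K, d); priors: (K,).
    Returns (N, K) responsibilities that sum to 1 per row without underflow.
    """
    K = len(priors)
    # log N(x | mu_k, diag(var_k)) + log pi_k for every class.
    log_joint = np.stack([
        multivariate_normal.logpdf(X, means[k], np.diag(variances[k])) + np.log(priors[k])
        for k in range(K)
    ], axis=1)  # (N, K)
    # Normalize in log space: subtract log sum_j exp(log_joint[:, j]).
    return np.exp(log_joint - logsumexp(log_joint, axis=1, keepdims=True))


# Hypothetical check on random high-dimensional data (shapes mimic 28x28 MNIST vectors).
rng = np.random.default_rng(0)
X = rng.normal(size=(10, 784))
means = rng.normal(size=(3, 784))
variances = np.ones((3, 784))
priors = np.array([0.3, 0.3, 0.4])
r = log_responsibilities(X, means, variances, priors)
print(r.shape, np.allclose(r.sum(axis=1), 1.0))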