Example no. 1
    def create_prior(dim, mean_prior, mean_str, cov_prior, cov_str):
        """
        Creates a gaussian prior, if cov_prior is a scalar, then creates an isotropic prior scaled to that, if its a matrix
        uses it as covariance
        :param dim: data dimension
        :param mean_prior: if a scalar, will create a vector scaled to that, if its a vector then use it as the prior mean
        :param mean_str: prior mean psuedo count
        :param cov_prior: if a scalar, will create an isotropic covariance scaled to cov_prior, if a matrix will use it as
        the covariance.
        :param cov_str: prior covariance psuedo counts
        :return: DPMMSubClusters.niw_hyperparams prior
        """
        if isinstance(mean_prior, (int, float)):
            prior_mean = np.ones(dim) * mean_prior
        else:
            prior_mean = mean_prior

        if isinstance(cov_prior, (int, float)):
            prior_covariance = np.eye(dim) * cov_prior
        else:
            prior_covariance = cov_prior
        prior = niw(mean_str, prior_mean, dim + cov_str, prior_covariance)
        return prior
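
A minimal usage sketch (hedged: it assumes numpy is imported as np and that create_prior is reachable as a static method of its enclosing class, taken here to be DPMMPython as in the examples below):

import numpy as np

# Isotropic prior: scalar mean and covariance are broadcast to dim dimensions.
iso_prior = DPMMPython.create_prior(dim=2, mean_prior=0.0, mean_str=1.0,
                                    cov_prior=0.5, cov_str=2.0)

# Explicit prior: a mean vector and a full covariance matrix are used as-is.
full_prior = DPMMPython.create_prior(dim=2,
                                     mean_prior=np.array([1.0, -1.0]),
                                     mean_str=1.0,
                                     cov_prior=np.array([[1.0, 0.2],
                                                         [0.2, 1.0]]),
                                     cov_str=2.0)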
Example no. 2
def run_synthetic_data_comparisons(
    D: int,
    K: int,
    N: int,
    var_scale: float,
    alpha: float,
    iters: int,
    burnout: int,
    repeats: int,
):

    results = {
        "method": [],
        "k_mae": [],
        "NMI": [],
        "ARI": [],
        "Time": [],
    }

    i = 0
    # Use a while loop so failed repeats (see below) can be retried.
    while i < repeats:

        # generate dataset
        data, labels = DPMMPython.generate_gaussian_data(N, D, K, var_scale)

        # NIW prior (defined for reference; the fit calls below use the package default)
        prior = niw(1, np.zeros(D), 100, np.eye(D) * 0.5)
        # run DPGMM

        if D == 2:
            start = timer()
            dpmm_splitnet_results = DPMMPython.fit(
                data,
                alpha,
                iterations=iters,
                burnout=burnout,
                verbose=False,
                init_type="splitnet_2d",
            )[0]
            dpmm_net_time = timer() - start

        elif D <= 10:
            start = timer()
            dpmm_splitnet_results = DPMMPython.fit(
                data,
                alpha,
                iterations=iters,
                burnout=burnout,
                verbose=False,
                init_type="splitnet_10d",
            )[0]
            dpmm_net_time = timer() - start

        else:
            start = timer()
            dpmm_splitnet_results = DPMMPython.fit(
                data,
                alpha,
                iterations=iters,
                burnout=burnout,
                verbose=False,
                init_type="splitnet_128d",
            )[0]
            dpmm_net_time = timer() - start

        if len(np.unique(dpmm_splitnet_results)) < K // 2:
            # SplitNet collapsed to too few clusters; discard this repeat and retry.
            print("SplitNet run failed (fewer than K/2 clusters found); retrying.")
        else:
            start = timer()
            dpmm_rand_results = DPMMPython.fit(
                data,
                alpha,
                iterations=iters,
                burnout=burnout,
                verbose=False,
                init_type="none",
            )[0]
            dpmm_rand_time = timer() - start

            start = timer()
            dpmm_kmeans_results = DPMMPython.fit(
                data,
                alpha,
                iterations=iters,
                burnout=burnout,
                verbose=False,
                init_type="kmeans",
            )[0]
            dpmm_kmeans_time = timer() - start

            # run kmeans
            start = timer()
            kmeans = KMeans(n_clusters=K).fit(data.T)
            kmeans_time = timer() - start
            kmeans_labels = kmeans.labels_

            # run GMM
            start = timer()
            gmm = GaussianMixture(n_components=K,
                                  covariance_type="full").fit(data.T)
            gmm_labels = gmm.predict(data.T)
            gmm_time = timer() - start

            # sklearn DPGMM
            start = timer()
            dpgmm = BayesianGaussianMixture(
                n_components=2 * K,
                covariance_type="full",
                weight_concentration_prior=alpha,
                weight_concentration_prior_type="dirichlet_process",
                mean_precision_prior=1e2,
                covariance_prior=1e0 * np.eye(D),
                init_params="kmeans",
                max_iter=iters,
                verbose=0,
            ).fit(data.T)
            dpgmm_labels = dpgmm.predict(data.T)
            dpgmmsk_time = timer() - start

            # moVB

            # pass data NxD
            data_bnpy = bnpy.data.XData(data.T)

            start = timer()
            model, run_info = bnpy.run(
                data_bnpy,
                "DPMixtureModel",
                "Gauss",
                "memoVB",
                nTask=1,
                nBatch=1,
                K=1,
                nLap=iters,
                moves="birth,merge,shuffle",
                gt=labels,
                gamma0=alpha,
            )

            moVB_time = timer() - start
            LP = model.calc_local_params(data_bnpy)
            moVB_labels = LP["resp"].argmax(axis=1)

            # calc metrics and aggregate
            results = add_results(results, "k-means", labels, kmeans_labels,
                                  kmeans_time)
            results = add_results(results, "EM-GMM", labels, gmm_labels,
                                  gmm_time)
            results = add_results(results, "DPGMM (SKlearn's)", labels,
                                  dpgmm_labels, dpgmmsk_time)
            results = add_results(results, "DPGMM-Random", labels,
                                  dpmm_rand_results, dpmm_rand_time)
            results = add_results(results, "DPGMM-k-means", labels,
                                  dpmm_kmeans_results, dpmm_kmeans_time)
            results = add_results(results, "DPGMM-SplitNet", labels,
                                  dpmm_splitnet_results, dpmm_net_time)
            results = add_results(results, "moVB", labels, moVB_labels,
                                  moVB_time)

            i += 1
            print(f"Finished iteration {i}")

    return results
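
A hedged usage sketch for the function above (parameter values are illustrative; DPMMPython, add_results, and the other names are assumed to be defined as elsewhere in this file):

# 10 repeats of a 2-D benchmark with 10 ground-truth components.
results = run_synthetic_data_comparisons(
    D=2, K=10, N=10000, var_scale=100.0,
    alpha=10.0, iters=100, burnout=5, repeats=10,
)

import pandas as pd  # assumption: pandas is available for summarizing

# Average each metric per method across the repeats.
print(pd.DataFrame(results).groupby("method").mean())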
Example no. 3
def run_datasets_comparisons(data, labels, alpha: float, iters: int,
                             burnout: int, repeats: int):

    results = {
        "method": [],
        "k_mae": [],
        "NMI": [],
        "ARI": [],
        #         "Acc": [],
    }

    N, D = data.shape
    K = len(np.unique(labels))

    for i in range(repeats):

        # run kmeans
        kmeans = KMeans(n_clusters=K).fit(data)
        kmeans_labels = kmeans.labels_

        # run GMM
        gmm = GaussianMixture(n_components=K, covariance_type="full").fit(data)
        gmm_labels = gmm.predict(data)

        # sklearn DPGMM
        dpgmm = BayesianGaussianMixture(
            n_components=2 * K,
            covariance_type="full",
            weight_concentration_prior=1e2,
            weight_concentration_prior_type="dirichlet_process",
            mean_precision_prior=1e2,
            covariance_prior=1e0 * np.eye(D),
            init_params="kmeans",
            max_iter=iters,
            verbose=0,
        ).fit(data)
        dpgmm_labels = dpgmm.predict(data)

        # NIW prior; the commented variants below pass it to fit explicitly,
        # while the calls actually used rely on the package default.
        prior = niw(1, np.zeros(D), D + 2, np.eye(D) * 0.5)
        # run DPGMM
        # dpmm_rand_results = DPMMPython.fit(data.T, alpha, prior=prior, iterations=iters, burnout=burnout, verbose=False, init_type="none")[0]
        # dpmm_kmeans_results = DPMMPython.fit(data.T, alpha, prior=prior, iterations=iters, burnout=burnout, verbose=False, init_type="kmeans")[0]
        dpmm_rand_results = DPMMPython.fit(
            data.T,
            alpha,
            iterations=iters,
            burnout=burnout,
            verbose=False,
            init_type="none",
        )[0]
        dpmm_kmeans_results = DPMMPython.fit(
            data.T,
            alpha,
            iterations=iters,
            burnout=burnout,
            verbose=False,
            init_type="kmeans",
        )[0]
        if D == 2:
            dpmm_splitnet_results = DPMMPython.fit(
                data.T,
                alpha,
                iterations=iters,
                burnout=burnout,
                verbose=False,
                init_type="splitnet_2d",
            )[0]
        elif D <= 10:
            dpmm_splitnet_results = DPMMPython.fit(
                data.T,
                alpha,
                iterations=iters,
                burnout=burnout,
                verbose=False,
                init_type="splitnet_10d",
            )[0]
        else:
            dpmm_splitnet_results = DPMMPython.fit(
                data.T,
                alpha,
                iterations=iters,
                burnout=burnout,
                verbose=False,
                init_type="splitnet_128d",
            )[0]

        # calc metrics and aggregate

        results = add_results(results, "k-means", labels, kmeans_labels)
        results = add_results(results, "EM-GMM", labels, gmm_labels)
        results = add_results(results, "DPGMM (SKlearn's)", labels,
                              dpgmm_labels)
        results = add_results(results, "DPGMM-Random", labels,
                              dpmm_rand_results)
        results = add_results(results, "DPGMM-k-means", labels,
                              dpmm_kmeans_results)
        results = add_results(results, "DPGMM-SplitNet", labels,
                              dpmm_splitnet_results)

        print(f"Finished iteration {i}")

    return results
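
A minimal usage sketch on a real dataset (hedged: the alpha/iters/burnout values are illustrative and scikit-learn is assumed available; note the function expects NxD data and transposes internally for DPMMPython):

from sklearn.datasets import load_iris

iris = load_iris()  # 150 x 4 data matrix with integer labels
results = run_datasets_comparisons(iris.data, iris.target,
                                   alpha=10.0, iters=100,
                                   burnout=5, repeats=5)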
Example no. 4
# Plotting the data
#
# Note that as Julia is a column-major language, the generated data is $D \times N$.
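
# The plotting snippet below assumes `data` and `labels` already exist; a
# hedged setup mirroring the earlier examples (the values are illustrative):

import numpy as np
import matplotlib.pyplot as plt

N, D, K = 10000, 2, 10
data, labels = DPMMPython.generate_gaussian_data(N, D, K, 100.0)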

fig = plt.figure(figsize=(16, 10))
plt.scatter(data[0, :], data[1, :], c=labels, s=2, alpha=0.5, cmap="tab20")
# plt.colorbar()
# fig.colorbar(mpl.cm.ScalarMappable(norm=norm, cmap=cmap))
plt.show()

# ### Fitting a DPGMM model to the data
#
# Start by defining an NIW prior and $\alpha$

prior = niw(1, np.zeros(D), 1000, np.eye(D))
alpha = 1000.0

# Fit the model and store the results in `results`.
# When working from Jupyter Notebook/Lab you will not see the Julia prints; when running Python from a terminal, all prints appear (as they do in the Julia packages).

# results = DPMMPython.fit(data,alpha,prior=prior, iterations=300, burnout=5, verbose=False, init_type="kmeans")
# results = DPMMPython.fit(data,alpha, iterations=200, burnout=3, verbose=False, init_type="kmeans")
#
results = DPMMPython.fit(data,
                         alpha,
                         iterations=100,
                         burnout=5,
                         verbose=False,
                         init_type="splitnet_2d")
# results = DPMMPython.fit(data, alpha, prior=prior, iterations=300, burnout=3, verbose=False, init_type="splitnet_2d")
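
# A hedged follow-up sketch: as in the earlier examples, the point labels are
# the first element of fit's return value.

inferred_labels = results[0]
print("Number of inferred clusters:", len(np.unique(inferred_labels)))

plt.figure(figsize=(16, 10))
plt.scatter(data[0, :], data[1, :], c=inferred_labels, s=2, alpha=0.5,
            cmap="tab20")
plt.show()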
Example no. 5
        data = DPMMSubClusters.generate_gaussian_data(sample_count, dim,
                                                      components, var)
        gt = data[1]
        data = data[0]
        return data, gt

    @staticmethod
    def predict(model, data):
        '''
        Given a DPMM model (located in fit(...)[2][-1] for backwards compatibility),
        predict the cluster assignments for data. Prediction uses each cluster's
        predictive posterior, in contrast to training, where the model samples
        from the posterior.
        :param model: a DPMM model (Julia object), as returned from fit
        :param data: the data to predict on, DxN (same layout as the fit argument)
        :return: labels
        '''
        return DPMMSubClusters.predict(model, data)


if __name__ == "__main__":
    j = julia.Julia()
    data, gt = DPMMPython.generate_gaussian_data(10000, 2, 10, 100.0)
    prior = niw(kappa=1, mu=np.ones(2) * 0, nu=3, psi=np.eye(2))
    # labels_j,_,sub_labels= DPMMPython.fit(data, 100, prior = prior, verbose = True, gt = gt, gpu = False)
    labels_j, _, sub_labels = DPMMPython.fit(data,
                                             100,
                                             prior=prior,
                                             verbose=True,
                                             gt=gt,
                                             gpu=True)
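
    # Hedged usage sketch for predict: the docstring above states the model
    # object lives at fit(...)[2][-1]; with the unpacking used here that would
    # be sub_labels[-1]. Treat this indexing as an assumption that may differ
    # across package versions.
    model = sub_labels[-1]
    new_labels = DPMMPython.predict(model, data)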