def create_prior(dim, mean_prior, mean_str, cov_prior, cov_str):
    """
    Create a Gaussian NIW prior.

    If ``cov_prior`` is a scalar, an isotropic covariance scaled to it is
    created; if it is a matrix, it is used as the covariance directly.
    The same scalar-or-array convention applies to ``mean_prior``.

    :param dim: data dimension
    :param mean_prior: scalar -> prior mean is a length-``dim`` vector filled
        with that value; vector -> used directly as the prior mean
    :param mean_str: prior mean pseudo-count (kappa)
    :param cov_prior: scalar -> isotropic covariance ``cov_prior * I``;
        matrix -> used directly as the prior covariance
    :param cov_str: prior covariance pseudo-counts (added to ``dim`` for nu)
    :return: DPMMSubClusters.niw_hyperparams prior
    """
    # np.isscalar also recognizes numpy scalar types (e.g. np.float64),
    # which `isinstance(x, (int, float))` would miss — those used to fall
    # through to the "use directly" branch and yield a malformed prior.
    if np.isscalar(mean_prior):
        prior_mean = np.ones(dim) * mean_prior
    else:
        prior_mean = mean_prior

    if np.isscalar(cov_prior):
        prior_covariance = np.eye(dim) * cov_prior
    else:
        prior_covariance = cov_prior

    return niw(mean_str, prior_mean, dim + cov_str, prior_covariance)
def run_synthetic_data_comparisons(
    D: int,
    K: int,
    N: int,
    var_scale: int,
    alpha: int,
    iters: int,
    burnout: int,
    repeats: int,
):
    """
    Benchmark several clustering methods on freshly generated synthetic
    Gaussian mixture data, aggregating metrics over ``repeats`` successful runs.

    :param D: data dimension
    :param K: number of ground-truth components
    :param N: number of samples per dataset
    :param var_scale: variance scale passed to the data generator
    :param alpha: DP concentration parameter
    :param iters: number of sampler / EM iterations
    :param burnout: DPMM burn-in iterations
    :param repeats: number of successful repetitions to aggregate
    :return: dict of lists with keys "method", "k_mae", "NMI", "ARI", "Time"
    """
    results = {
        "method": [],
        "k_mae": [],
        "NMI": [],
        "ARI": [],
        "Time": [],
    }

    # Pick the SplitNet checkpoint matching the data dimension once, instead
    # of duplicating the whole timed fit call in three branches.  This also
    # fixes the D > 2 branches, which used the module globals ITERS/BURNOUT
    # instead of the `iters`/`burnout` parameters.
    if D == 2:
        splitnet_init = "splitnet_2d"
    elif D <= 10:
        splitnet_init = "splitnet_10d"
    else:
        splitnet_init = "splitnet_128d"

    i = 0
    while i < repeats:
        # generate dataset (DPMMPython data is DxN, column-major convention)
        data, labels = DPMMPython.generate_gaussian_data(N, D, K, var_scale)

        # run DPGMM with SplitNet initialization first; a degenerate run
        # (fewer than K//2 clusters found) is discarded and retried.
        start = timer()
        dpmm_splitnet_results = DPMMPython.fit(
            data,
            alpha,
            iterations=iters,
            burnout=burnout,
            verbose=False,
            init_type=splitnet_init,
        )[0]
        dpmm_net_time = timer() - start

        if len(np.unique(dpmm_splitnet_results)) < K // 2:
            # failed run: do not count this repeat, regenerate data and retry
            print("failed.")
            continue

        start = timer()
        dpmm_rand_results = DPMMPython.fit(
            data,
            alpha,
            iterations=iters,
            burnout=burnout,
            verbose=False,
            init_type="none",
        )[0]
        dpmm_rand_time = timer() - start

        start = timer()
        dpmm_kmeans_results = DPMMPython.fit(
            data,
            alpha,
            iterations=iters,
            burnout=burnout,
            verbose=False,
            init_type="kmeans",
        )[0]
        dpmm_kmeans_time = timer() - start

        # run kmeans (sklearn expects NxD, hence the transpose)
        start = timer()
        kmeans = KMeans(n_clusters=K).fit(data.T)
        kmeans_time = timer() - start
        kmeans_labels = kmeans.labels_

        # run GMM
        start = timer()
        gmm = GaussianMixture(n_components=K, covariance_type="full").fit(data.T)
        gmm_labels = gmm.predict(data.T)
        gmm_time = timer() - start

        # sklearn DPGMM (truncated variational Dirichlet process mixture)
        start = timer()
        dpgmm = BayesianGaussianMixture(
            n_components=2 * K,
            covariance_type="full",
            weight_concentration_prior=alpha,
            weight_concentration_prior_type="dirichlet_process",
            mean_precision_prior=1e2,
            covariance_prior=1e0 * np.eye(D),
            init_params="kmeans",
            max_iter=iters,
            verbose=0,
        ).fit(data.T)
        dpgmm_labels = dpgmm.predict(data.T)
        dpgmmsk_time = timer() - start

        # moVB (bnpy memoized variational Bayes); bnpy takes NxD data
        data_bnpy = bnpy.data.XData(data.T)
        start = timer()
        model, run_info = bnpy.run(
            data_bnpy,
            "DPMixtureModel",
            "Gauss",
            "memoVB",
            nTask=1,
            nBatch=1,
            K=1,
            nLap=iters,
            moves="birth,merge,shuffle",
            gt=labels,
            gamma0=alpha,
        )
        moVB_time = timer() - start
        LP = model.calc_local_params(data_bnpy)
        moVB_labels = LP["resp"].argmax(axis=1)

        # calc metrics and aggregate
        results = add_results(results, "k-means", labels, kmeans_labels, kmeans_time)
        results = add_results(results, "EM-GMM", labels, gmm_labels, gmm_time)
        results = add_results(results, "DPGMM (SKlearn's)", labels, dpgmm_labels, dpgmmsk_time)
        results = add_results(results, "DPGMM-Random", labels, dpmm_rand_results, dpmm_rand_time)
        results = add_results(results, "DPGMM-k-means", labels, dpmm_kmeans_results, dpmm_kmeans_time)
        results = add_results(results, "DPGMM-SplitNet", labels, dpmm_splitnet_results, dpmm_net_time)
        results = add_results(results, "moVB", labels, moVB_labels, moVB_time)

        i += 1
        print(f"Finished iteration {i}")

    return results
def run_datasets_comparisons(data, labels, alpha: int, iters: int, burnout: int, repeats: int):
    """
    Benchmark several clustering methods on a fixed real dataset,
    aggregating metrics over ``repeats`` repetitions.

    :param data: NxD data matrix (transposed to DxN for DPMMPython calls)
    :param labels: ground-truth labels, length N
    :param alpha: DP concentration parameter
    :param iters: number of sampler / EM iterations
    :param burnout: DPMM burn-in iterations
    :param repeats: number of repetitions to aggregate
    :return: dict of lists with keys "method", "k_mae", "NMI", "ARI"
    """
    results = {
        "method": [],
        "k_mae": [],
        "NMI": [],
        "ARI": [],
    }
    N, D = data.shape
    K = len(np.unique(labels))

    # Pick the SplitNet checkpoint matching the data dimension once; this
    # also fixes the D > 2 branches, which used the module globals
    # ITERS/BURNOUT instead of the `iters`/`burnout` parameters.
    if D == 2:
        splitnet_init = "splitnet_2d"
    elif D <= 10:
        splitnet_init = "splitnet_10d"
    else:
        splitnet_init = "splitnet_128d"

    for i in range(repeats):
        # run kmeans
        kmeans = KMeans(n_clusters=K).fit(data)
        kmeans_labels = kmeans.labels_

        # run GMM
        gmm = GaussianMixture(n_components=K, covariance_type="full").fit(data)
        gmm_labels = gmm.predict(data)

        # sklearn DPGMM (was hard-coded to the ITERS global; now honors `iters`)
        dpgmm = BayesianGaussianMixture(
            n_components=2 * K,
            covariance_type="full",
            weight_concentration_prior=1e2,
            weight_concentration_prior_type="dirichlet_process",
            mean_precision_prior=1e2,
            covariance_prior=1e0 * np.eye(D),
            init_params="kmeans",
            max_iter=iters,
            verbose=0,
        ).fit(data)
        dpgmm_labels = dpgmm.predict(data)

        # run DPGMM variants (DPMMPython expects DxN data)
        dpmm_rand_results = DPMMPython.fit(
            data.T,
            alpha,
            iterations=iters,
            burnout=burnout,
            verbose=False,
            init_type="none",
        )[0]
        dpmm_kmeans_results = DPMMPython.fit(
            data.T,
            alpha,
            iterations=iters,
            burnout=burnout,
            verbose=False,
            init_type="kmeans",
        )[0]
        dpmm_splitnet_results = DPMMPython.fit(
            data.T,
            alpha,
            iterations=iters,
            burnout=burnout,
            verbose=False,
            init_type=splitnet_init,
        )[0]

        # calc metrics and aggregate
        results = add_results(results, "k-means", labels, kmeans_labels)
        results = add_results(results, "EM-GMM", labels, gmm_labels)
        results = add_results(results, "DPGMM (SKlearn's)", labels, dpgmm_labels)
        results = add_results(results, "DPGMM-Random", labels, dpmm_rand_results)
        results = add_results(results, "DPGMM-k-means", labels, dpmm_kmeans_results)
        results = add_results(results, "DPGMM-SplitNet", labels, dpmm_splitnet_results)

        print(f"Finished iteration {i}")

    return results
# Plotting the data # # Note that as Julia is a column first language, the data generated is $DxN$ fig = plt.figure(figsize=(16, 10)) plt.scatter(data[0, :], data[1, :], c=labels, s=2, alpha=0.5, cmap="tab20") # plt.colorbar() # fig.colorbar(mpl.cm.ScalarMappable(norm=norm, cmap=cmap)) plt.show() # ### Fiting DPGMM Model to the data # # Start by defining a niw prior and $\alpha$ prior = niw(1, np.zeros(D), 1000, np.eye(D)) alpha = 1000.0 # Fit the model and store the results in `results`. # When working from Jupyter Notebook/Lab you will not see Julia prints. However when running python from terminal you will see all the prints (as in the Julia packages) # results = DPMMPython.fit(data,alpha,prior=prior, iterations=300, burnout=5, verbose=False, init_type="kmeans") # results = DPMMPython.fit(data,alpha, iterations=200, burnout=3, verbose=False, init_type="kmeans") # results = DPMMPython.fit(data, alpha, iterations=100, burnout=5, verbose=False, init_type="splitnet_2d") # results = DPMMPython.fit(data, alpha, prior=prior, iterations=300, burnout=3, verbose=False, init_type="splitnet_2d")
    # NOTE(review): these lines are the tail of a data-generation method whose
    # `def` line is above this chunk — presumably a DPMMPython staticmethod
    # wrapping the Julia generator; confirm against the full file.
    # DPMMSubClusters returns a tuple: index 0 is the DxN data, index 1 the labels.
    data = DPMMSubClusters.generate_gaussian_data(sample_count, dim, components, var)
    gt = data[1]
    data = data[0]
    return data, gt

    @staticmethod
    def predict(model, data):
        '''
        Given a DPMM model (which is located in fit(...)[2][-1] for backwards
        compatibility), predict the clusters for the given data.

        Prediction uses each cluster's predictive posterior, in contrast to
        the model itself during training, which samples from the posterior.

        :params model: a DPMM (Julia object) model, returned from fit
        :data: the data to predict on, DxN (same layout as the fit argument)
        :return: labels
        '''
        # Delegates directly to the Julia package's predict.
        return DPMMSubClusters.predict(model, data)


if __name__ == "__main__":
    # Initialize the embedded Julia runtime before any DPMM calls.
    j = julia.Julia()
    # 10000 points in 2-D from 10 components with variance scale 100.
    data, gt = DPMMPython.generate_gaussian_data(10000, 2, 10, 100.0)
    prior = niw(kappa=1, mu=np.ones(2) * 0, nu=3, psi=np.eye(2))
    # CPU variant kept for reference:
    # labels_j,_,sub_labels= DPMMPython.fit(data, 100, prior = prior, verbose = True, gt = gt, gpu = False)
    labels_j, _, sub_labels = DPMMPython.fit(data, 100, prior=prior, verbose=True, gt=gt, gpu=True)