def _cluster_data(self, ps: int, data: List[List],
                  ground_truth: List, desired_k: int):
    clu_lss = Clusterer(dtm=data,
                        true_labels=ground_truth,
                        max_nbr_clusters=len(data)-1,
                        min_nbr_clusters=1,
                        min_cluster_size=2,
                        metric="cosine",
                        desired_n_clusters=desired_k)

    # Run SPKMeans 10 times to get mean performance.
    # This run also supplies the estimated k for the Clusterer.
    # TODO: decouple k estimation from the evaluation
    norm_spk_pred, norm_spk_evals = clu_lss.evaluate(
        alg_option=Clusterer.alg_spherical_k_means,
        param_init="k-means++")

    cop_kmeans_pred, cop_kmeans_evals = clu_lss.evaluate(
        alg_option=Clusterer.alg_cop_kmeans,
        param_constraints_size=constraints_fraction,
        param_copkmeans_init="random")

    # NOTE: the result dictionary built at the end assumes these evaluations
    # ran, i.e. include_older_algorithms is expected to be True.
    if include_older_algorithms:
        norm_hdbscan_pred, norm_hdbscan_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_h_dbscan)

        norm_ms_pred, norm_ms_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_mean_shift)

        # norm_xm_pred, norm_xm_evals = clu_lss.evaluate(
        #     alg_option=Clusterer.alg_x_means)

        nhac_complete_pred, nhac_complete_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_hac,
            param_linkage="complete")

        nhac_s_pred, nhac_s_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_hac,
            param_linkage="single")

        nhac_a_pred, nhac_a_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_hac,
            param_linkage="average")

        n_optics_pred, n_optics_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_optics)

    # Baselines
    bl_rand_pred, bl_rand_evals = clu_lss.evaluate(
        alg_option=Clusterer.bl_random)
    bl_singleton_pred, bl_singleton_evals = clu_lss.evaluate(
        alg_option=Clusterer.bl_singleton)

    nhdp_pred, nhdp_evals = clu_lss.eval_cluster_hdp()
    ntrue_pred, ntrue_evals = clu_lss.eval_true_clustering()

    # SOTA - Gomez et al.: HAC and Log-Entropy with 20k features.
    # Not applicable to training data.
    if not train_phase:
        sota_pred_path_le = (r"D:\College\DKEM\Thesis\AuthorshipClustering"
                             r"\Code\clusterPAN2017-master\output_LogEnt"
                             f"\\problem{ps:03d}\\clustering.json")
        sota_predicted_le = Tools.load_true_clusters_into_vector(
            sota_pred_path_le)
        sota_pred_le, sota_evals_le = clu_lss.eval_sota(
            sota_predicted=sota_predicted_le)

        sota_pred_path_tf = (r"D:\College\DKEM\Thesis\AuthorshipClustering"
                             r"\Code\clusterPAN2017-master\output_Tf"
                             f"\\problem{ps:03d}\\clustering.json")
        sota_predicted_tf = Tools.load_true_clusters_into_vector(
            sota_pred_path_tf)
        sota_pred_tf, sota_evals_tf = clu_lss.eval_sota(
            sota_predicted=sota_predicted_tf)

        sota_pred_path_tfidf = (
            r"D:\College\DKEM\Thesis\AuthorshipClustering"
            r"\Code\clusterPAN2017-master\output_TfIdf"
            f"\\problem{ps:03d}\\clustering.json")
        sota_predicted_tfidf = Tools.load_true_clusters_into_vector(
            sota_pred_path_tfidf)
        sota_pred_tfidf, sota_evals_tfidf = clu_lss.eval_sota(
            sota_predicted=sota_predicted_tfidf)
    else:
        # Build placeholders only, since SOTA isn't required for training
        # sota_pred_le = [0] * len(data)
        # sota_pred_tf = [0] * len(data)
        # sota_pred_tfidf = [0] * len(data)
        placebo_ret = {}
        placebo_ret.update({"nmi": None,
                            "ami": None,
                            "ari": None,
                            "fms": None,
                            "v_measure": None,
                            "bcubed_precision": None,
                            "bcubed_recall": None,
                            "bcubed_fscore": None,
                            "Silhouette": None,
                            "Calinski_harabasz": None,
                            "Davies_Bouldin": None
                            # Unsupervised indices go here
                            })
        sota_evals_le = placebo_ret
        sota_evals_tf = placebo_ret
        sota_evals_tfidf = placebo_ret

    # Control whether k is estimated or the true k is replicated:
    if desired_k != 0:
        k_trend = clu_lss.cand_k
        k_trend.append(1 + max(clu_lss.true_labels))
    else:
        k_trend = [1 + max(clu_lss.true_labels)
                   ] * (nbr_competing_methods + 1)

    result = Tools.form_problemset_result_dictionary(
        dictionaries=[
            # ispk_evals, norm_spk_evals, norm_hdbscan_evals,
            norm_spk_evals, norm_hdbscan_evals, norm_ms_evals,
            # norm_xm_evals,
            nhac_complete_evals, nhac_s_evals, nhac_a_evals,
            n_optics_evals, cop_kmeans_evals,
            bl_rand_evals, bl_singleton_evals, nhdp_evals,
            sota_evals_tf, sota_evals_tfidf, sota_evals_le,
            ntrue_evals
            ],
        identifiers=[  # "iSpKmeans",
            "E_SPKMeans", "E_HDBSCAN", "E_Mean_Shift",
            # "XMeans",
            "E_HAC_C", "E_HAC_Single", "E_HAC_Average",
            "E_OPTICS", "E_COP_KMeans",
            "BL_r", "BL_s", "S_HDP",
            "BL_SOTA_tf", "BL_SOTA_tfidf", "BL_SOTA_le",
            "Labels"],
        problem_set=ps)

    return result, k_trend
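

# The sketch below is a hypothetical usage example, not part of the original
# pipeline: it shows how ``_cluster_data`` might be driven with a tiny,
# made-up document-term matrix. It assumes the method lives on an
# experiment-driver object (``experiment``) and that the module-level
# settings it reads (``constraints_fraction``, ``include_older_algorithms``,
# ``train_phase``, ``nbr_competing_methods``) are already configured.
def _demo_cluster_data(experiment) -> None:
    # Three toy documents in a 3-dimensional, already-normalised space;
    # the first two share an author (label 0), the third does not (label 1).
    toy_dtm = [[0.9, 0.1, 0.0],
               [0.8, 0.2, 0.0],
               [0.1, 0.0, 0.9]]
    toy_truth = [0, 0, 1]
    # desired_k=0 takes the second branch at the end of _cluster_data,
    # i.e. the true k is replicated instead of estimated.
    result, k_trend = experiment._cluster_data(
        ps=1, data=toy_dtm, ground_truth=toy_truth, desired_k=0)
    print(result, k_trend)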
def problem_set_run(problem_set_id: int, n_clusters: int, seed: int,
                    configuration: str, drop_uncommon: bool,
                    verbose: bool, infer_lss: bool = False):
    problem_nbr = f"{problem_set_id:03d}"
    # Define an LSS modeller to represent documents in a non-sparse LSS space.
    # HDP with a Gibbs sampler is used as-is from:
    # https://github.com/blei-lab/hdp
    # Adjust the hyperparameters according to the chosen configuration
    if configuration == config_sparse:
        eta = 0.3
        gamma = 0.1
        alpha = 0.1
    elif configuration == config_dense:
        eta = 0.8
        gamma = 1.5
        alpha = 1.5
    else:
        eta = 0.5
        gamma = 1.0
        alpha = 1.0

    Modeller = LssHdpModeller(
        hdp_path=r"..\hdps\hdp",
        input_docs_path=r"..\..\Datasets\pan17_train\problem{}".format(
            problem_nbr),
        ldac_filename=r"ldac_corpus",
        hdp_output_dir=r"hdp_lss",
        hdp_iters=10000,
        hdp_seed=seed,
        hdp_sample_hyper=False,
        hdp_eta=eta,
        hdp_gamma_s=gamma,
        hdp_alpha_s=alpha,
        word_grams=1,
        drop_uncommon=drop_uncommon,
        freq_threshold=1,
        verbose=verbose)

    # Infer the BoW and LSS representations of the documents
    try:
        # Load, project and visualise the data
        plain_docs, bow_rep_docs, lss_rep_docs = Modeller.get_corpus_lss(
            infer_lss, bim=False)

        # Begin clustering attempts
        true_labels_path = (r"..\..\Datasets\pan17_train\truth"
                            r"\problem{}\clustering.json"
                            ).format(problem_nbr)
        ground_truth = Tools.load_true_clusters_into_vector(true_labels_path)

        # Normalise the data unless BIM is used
        clu_lss = Clusterer(dtm=Tools.normalise_data(data=lss_rep_docs),
                            true_labels=ground_truth,
                            max_nbr_clusters=len(lss_rep_docs)-1,
                            min_nbr_clusters=1,
                            min_cluster_size=2,
                            metric="cosine",
                            desired_n_clusters=n_clusters)

        norm_spk_pred, norm_spk_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_spherical_k_means,
            param_init="k-means++")

        # ispk_pred, ispk_evals = clu_lss.evaluate(
        #     alg_option=Clusterer.alg_iterative_spherical_k_means,
        #     param_init="k-means++")

        norm_hdbscan_pred, norm_hdbscan_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_h_dbscan)

        norm_ms_pred, norm_ms_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_mean_shift)

        # norm_xm_pred, norm_xm_evals = clu_lss.evaluate(
        #     alg_option=Clusterer.alg_x_means)

        nhac_complete_pred, nhac_complete_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_hac,
            param_linkage="complete")

        nhac_s_pred, nhac_s_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_hac,
            param_linkage="single")

        nhac_a_pred, nhac_a_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_hac,
            param_linkage="average")

        n_optics_pred, n_optics_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_optics)

        # Baselines
        bl_rand_pred, bl_rand_evals = clu_lss.evaluate(
            alg_option=Clusterer.bl_random)
        bl_singleton_pred, bl_singleton_evals = clu_lss.evaluate(
            alg_option=Clusterer.bl_singleton)

        nhdp_pred, nhdp_evals = clu_lss.eval_cluster_hdp()
        ntrue_pred, ntrue_evals = clu_lss.eval_true_clustering()

        # SOTA - Gomez et al.: HAC and Log-Entropy with 20k features
        sota_pred_path = (r"D:\College\DKEM\Thesis\AuthorshipClustering\Code"
                          r"\clusterPAN2017-master\train_out_LogEnt"
                          f"\\problem{problem_nbr}\\clustering.json")
        sota_predicted = Tools.load_true_clusters_into_vector(sota_pred_path)
        sota_pred, sota_evals = clu_lss.eval_sota(
            sota_predicted=sota_predicted)

        # Return the results:
        return (Tools.form_problemset_result_dictionary(
            dictionaries=[
                # ispk_evals, norm_spk_evals, norm_hdbscan_evals,
                norm_spk_evals, norm_hdbscan_evals, norm_ms_evals,
                # norm_xm_evals,
                nhac_complete_evals, nhac_s_evals, nhac_a_evals,
                n_optics_evals,
                bl_rand_evals, bl_singleton_evals, nhdp_evals,
                sota_evals, ntrue_evals
                ],
            identifiers=[  # "iSpKmeans",
                "E_SPKMeans", "E_HDBSCAN", "E_Mean_Shift",
                # "XMeans",
                "E_HAC_C", "E_HAC_Single", "E_HAC_Average",
                "E_OPTICS", "BL_r", "BL_s", "S_HDP",
                "BL_SOTA", "Labels"],
            problem_set=problem_set_id),
            ground_truth,
            lss_rep_docs,
            plain_docs,
            clu_lss)

    except FileNotFoundError:
        print("Please run HDP on these data first.")
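

# Hypothetical driver sketch, not part of the original code: it loops over a
# few PAN-17 training problem sets and collects the per-problem evaluation
# dictionaries returned by ``problem_set_run``. The range of problem-set ids,
# the seed and the flag values below are illustrative assumptions only.
# ``problem_set_run`` implicitly returns None when the HDP output for a
# problem set has not been generated yet, so such sets are skipped here.
def _demo_run_training_problem_sets() -> list:
    collected = []
    for ps_id in range(1, 4):  # e.g. problem001..problem003
        ret = problem_set_run(problem_set_id=ps_id,
                              n_clusters=0,
                              seed=33,
                              configuration=config_sparse,
                              drop_uncommon=True,
                              verbose=False,
                              infer_lss=False)
        if ret is None:
            continue  # HDP output missing for this problem set
        result_dict, ground_truth, lss_docs, plain_docs, clusterer = ret
        collected.append(result_dict)
    return collected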