    def _cluster_data(self, ps: int,
                      data: List[List],
                      ground_truth: List,
                      desired_k: int):
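        # Configure the clusterer over the supplied document vectors, using the
        # cosine metric and bounding the number of clusters by the corpus size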
        clu_lss = Clusterer(dtm=data,
                            true_labels=ground_truth,
                            max_nbr_clusters=len(data)-1,
                            min_nbr_clusters=1,
                            min_cluster_size=2,
                            metric="cosine",
                            desired_n_clusters=desired_k)

        # Run SPKMeans 10 times to obtain its mean performance.
        # This run also supplies the estimated k used by the Clusterer.
        # TODO: decouple k estimation from the evaluation.
        norm_spk_pred, norm_spk_evals = clu_lss.evaluate(
                alg_option=Clusterer.alg_spherical_k_means,
                param_init="k-means++")

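        # Constrained COP-KMeans; `constraints_fraction` is assumed to be
        # defined at module level (it is not a parameter of this method).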
        cop_kmeans_pred, cop_kmeans_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_cop_kmeans,
            param_constraints_size=constraints_fraction,
            param_copkmeans_init="random")

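        # `include_older_algorithms` is assumed to be a module-level flag that
        # toggles the remaining, older algorithms below.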
        if include_older_algorithms:
            norm_hdbscan_pred, norm_hdbscan_evals = clu_lss.evaluate(
                    alg_option=Clusterer.alg_h_dbscan)

            norm_ms_pred, norm_ms_evals = clu_lss.evaluate(
                    alg_option=Clusterer.alg_mean_shift)

            # norm_xm_pred, norm_xm_evals = clu_lss.evaluate(
            #         alg_option=Clusterer.alg_x_means)

            nhac_complete_pred, nhac_complete_evals = clu_lss.evaluate(
                    alg_option=Clusterer.alg_hac,
                    param_linkage="complete")

            nhac_s_pred, nhac_s_evals = clu_lss.evaluate(
                    alg_option=Clusterer.alg_hac,
                    param_linkage="single")

            nhac_a_pred, nhac_a_evals = clu_lss.evaluate(
                    alg_option=Clusterer.alg_hac,
                    param_linkage="average")

            n_optics_pred, n_optics_evals = clu_lss.evaluate(
                    alg_option=Clusterer.alg_optics)

        # Baselines
        bl_rand_pred, bl_rand_evals = clu_lss.evaluate(
                alg_option=Clusterer.bl_random)
        bl_singleton_pred, bl_singleton_evals = clu_lss.evaluate(
                alg_option=Clusterer.bl_singleton)

        nhdp_pred, nhdp_evals = clu_lss.eval_cluster_hdp()
        ntrue_pred, ntrue_evals = clu_lss.eval_true_clustering()

        # SOTA: Gomez et al., HAC with Log-Entropy weighting and 20k features
        # Not applicable to the training data
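        # NOTE: `train_phase` is assumed to be a module-level flag, and the
        # SOTA predictions below are loaded from hard-coded local paths.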
        if not train_phase:
            sota_pred_path_le = (r"D:\College\DKEM\Thesis\AuthorshipClustering"
                                 r"\Code\clusterPAN2017-master\output_LogEnt"
                                 f"\\problem{ps:03d}\\clustering.json")
            sota_predicted_le = Tools.load_true_clusters_into_vector(
                    sota_pred_path_le)
            sota_pred_le, sota_evals_le = clu_lss.eval_sota(
                    sota_predicted=sota_predicted_le)

            sota_pred_path_tf = (r"D:\College\DKEM\Thesis\AuthorshipClustering"
                                 r"\Code\clusterPAN2017-master\output_Tf"
                                 f"\\problem{ps:03d}\\clustering.json")
            sota_predicted_tf = Tools.load_true_clusters_into_vector(
                    sota_pred_path_tf)
            sota_pred_tf, sota_evals_tf = clu_lss.eval_sota(
                    sota_predicted=sota_predicted_tf)

            sota_pred_path_tfidf = (
                r"D:\College\DKEM\Thesis\AuthorshipClustering"
                r"\Code\clusterPAN2017-master\output_TfIdf"
                f"\\problem{ps:03d}\\clustering.json")
            sota_predicted_tfidf = Tools.load_true_clusters_into_vector(
                    sota_pred_path_tfidf)
            sota_pred_tfidf, sota_evals_tfidf = clu_lss.eval_sota(
                    sota_predicted=sota_predicted_tfidf)
        else:
            # Build placeholders only, since SOTA isn't required for training
            # sota_pred_le = [0] * len(data)
            # sota_pred_tf = [0] * len(data)
            # sota_pred_tfidf = [0] * len(data)
            placebo_ret = {
                "nmi": None,
                "ami": None,
                "ari": None,
                "fms": None,
                "v_measure": None,
                "bcubed_precision": None,
                "bcubed_recall": None,
                "bcubed_fscore": None,
                "Silhouette": None,
                "Calinski_harabasz": None,
                "Davies_Bouldin": None
                # Here go the unsupervised indices
                }
            # Use copies so the three placeholder dicts stay independent
            sota_evals_le = dict(placebo_ret)
            sota_evals_tf = dict(placebo_ret)
            sota_evals_tfidf = dict(placebo_ret)

        # Control whether k is estimated or the true k is replicated:
        if desired_k != 0:
            # Copy the candidate ks so appending the true k below does not
            # mutate clu_lss.cand_k in place
            k_trend = list(clu_lss.cand_k)
            k_trend.append(1 + max(clu_lss.true_labels))
        else:
            # `nbr_competing_methods` is assumed to be a module-level constant
            k_trend = [1 + max(clu_lss.true_labels)
                       ] * (nbr_competing_methods + 1)

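        # NOTE: the result lists below assume `include_older_algorithms` was
        # True; otherwise several *_evals names would be undefined here.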
        result = Tools.form_problemset_result_dictionary(
                dictionaries=[
                        # ispk_evals, norm_spk_evals, norm_hdbscan_evals,
                        norm_spk_evals, norm_hdbscan_evals,
                        norm_ms_evals,  # norm_xm_evals,
                        nhac_complete_evals, nhac_s_evals, nhac_a_evals,
                        n_optics_evals, cop_kmeans_evals,
                        bl_rand_evals, bl_singleton_evals,
                        nhdp_evals,
                        sota_evals_tf, sota_evals_tfidf, sota_evals_le,
                        ntrue_evals
                        ],
                identifiers=[  # "iSpKmeans",
                             "E_SPKMeans", "E_HDBSCAN",
                             "E_Mean_Shift",  # "XMeans",
                             "E_HAC_C", "E_HAC_Single", "E_HAC_Average",
                             "E_OPTICS", "E_COP_KMeans",
                             "BL_r", "BL_s", "S_HDP",
                             "BL_SOTA_tf", "BL_SOTA_tfidf", "BL_SOTA_le",
                             "Labels"],
                problem_set=ps)

        return result, k_trend


def problem_set_run(problem_set_id: int,
                    n_clusters: int,
                    seed: int,
                    configuration: str,
                    drop_uncommon: bool,
                    verbose: bool,
                    infer_lss: bool = False):
    problem_nbr = f"{problem_set_id:03d}"
    # Define an LSS modeller to represent documents in a non-sparse LSS space
    # HDP with a Gibbs sampler is used as-is from:
    #   https://github.com/blei-lab/hdp

    # Adjust the HDP hyperparameters according to the requested configuration
    # (`config_sparse` / `config_dense` are assumed module-level constants)
    if configuration == config_sparse:
        eta = 0.3
        gamma = 0.1
        alpha = 0.1
    elif configuration == config_dense:
        eta = 0.8
        gamma = 1.5
        alpha = 1.5
    else:
        eta = 0.5
        gamma = 1.0
        alpha = 1.0

    Modeller = LssHdpModeller(
            hdp_path=r"..\hdps\hdp",
            input_docs_path=r"..\..\Datasets\pan17_train\problem{}".format(
                    problem_nbr),
            ldac_filename=r"ldac_corpus",
            hdp_output_dir=r"hdp_lss",
            hdp_iters=10000,
            hdp_seed=seed,
            hdp_sample_hyper=False,
            hdp_eta=eta,
            hdp_gamma_s=gamma,
            hdp_alpha_s=alpha,
            word_grams=1,
            drop_uncommon=drop_uncommon,
            freq_threshold=1,
            verbose=verbose)

    # Infer the BoW and LSS representations of the documents
    try:
        # Load, project and visualise the data
        plain_docs, bow_rep_docs, lss_rep_docs = Modeller.get_corpus_lss(
                infer_lss,
                bim=False)

        # Begin Clustering Attempts
        true_labels_path = (r"..\..\Datasets\pan17_train\truth"
                            r"\problem{}\clustering.json"
                            ).format(problem_nbr)

        ground_truth = Tools.load_true_clusters_into_vector(true_labels_path)

        # Normalise the data (only needed when BIM is not used)
        clu_lss = Clusterer(dtm=Tools.normalise_data(data=lss_rep_docs),
                            true_labels=ground_truth,
                            max_nbr_clusters=len(lss_rep_docs)-1,
                            min_nbr_clusters=1,
                            min_cluster_size=2,
                            metric="cosine",
                            desired_n_clusters=n_clusters)

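        # Evaluate the clustering algorithms and baselines on the LSS vectors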
        norm_spk_pred, norm_spk_evals = clu_lss.evaluate(
                alg_option=Clusterer.alg_spherical_k_means,
                param_init="k-means++")

#        ispk_pred, ispk_evals = clu_lss.evaluate(
#                alg_option=Clusterer.alg_iterative_spherical_k_means,
#                param_init="k-means++")

        norm_hdbscan_pred, norm_hdbscan_evals = clu_lss.evaluate(
                alg_option=Clusterer.alg_h_dbscan)

        norm_ms_pred, norm_ms_evals = clu_lss.evaluate(
                alg_option=Clusterer.alg_mean_shift)

#        norm_xm_pred, norm_xm_evals = clu_lss.evaluate(
#                alg_option=Clusterer.alg_x_means)

        nhac_complete_pred, nhac_complete_evals = clu_lss.evaluate(
                alg_option=Clusterer.alg_hac,
                param_linkage="complete")

        nhac_s_pred, nhac_s_evals = clu_lss.evaluate(
                alg_option=Clusterer.alg_hac,
                param_linkage="single")

        nhac_a_pred, nhac_a_evals = clu_lss.evaluate(
                alg_option=Clusterer.alg_hac,
                param_linkage="average")

        n_optics_pred, n_optics_evals = clu_lss.evaluate(
                alg_option=Clusterer.alg_optics)

        # Baselines
        bl_rand_pred, bl_rand_evals = clu_lss.evaluate(
                alg_option=Clusterer.bl_random)
        bl_singleton_pred, bl_singleton_evals = clu_lss.evaluate(
                alg_option=Clusterer.bl_singleton)

        nhdp_pred, nhdp_evals = clu_lss.eval_cluster_hdp()
        ntrue_pred, ntrue_evals = clu_lss.eval_true_clustering()

        # SOTA: Gomez et al., HAC with Log-Entropy weighting and 20k features
        # (predictions are loaded from a hard-coded local path)
        sota_pred_path = (r"D:\College\DKEM\Thesis\AuthorshipClustering\Code"
                          r"\clusterPAN2017-master\train_out_LogEnt"
                          f"\\problem{problem_nbr}\\clustering.json")
        sota_predicted = Tools.load_true_clusters_into_vector(sota_pred_path)
        sota_pred, sota_evals = clu_lss.eval_sota(
                sota_predicted=sota_predicted)

        # Return the results:
        return (Tools.form_problemset_result_dictionary(
                dictionaries=[
                        # ispk_evals, norm_spk_evals, norm_hdbscan_evals,
                        norm_spk_evals, norm_hdbscan_evals,
                        norm_ms_evals,  # norm_xm_evals,
                        nhac_complete_evals, nhac_s_evals, nhac_a_evals,
                        n_optics_evals, bl_rand_evals, bl_singleton_evals,
                        nhdp_evals, sota_evals, ntrue_evals
                        ],
                identifiers=[  # "iSpKmeans",
                             "E_SPKMeans", "E_HDBSCAN",
                             "E_Mean_Shift",  # "XMeans",
                             "E_HAC_C", "E_HAC_Single", "E_HAC_Average",
                             "E_OPTICS", "BL_r", "BL_s",
                             "S_HDP", "BL_SOTA", "Labels"],
                problem_set=problem_set_id),
                ground_truth,
                lss_rep_docs,
                plain_docs,
                clu_lss)

    except FileNotFoundError as err:
        print("Please run HDP on this data first "
              f"(missing file: {err.filename}).")
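
# A minimal usage sketch (assumed driver code, not part of the original script;
# the argument values are illustrative only):
#
# results, ground_truth, lss_docs, plain_docs, clusterer = problem_set_run(
#         problem_set_id=1,
#         n_clusters=0,
#         seed=42,
#         configuration=config_sparse,
#         drop_uncommon=True,
#         verbose=True,
#         infer_lss=True)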