import json
import os
import subprocess
import sys
import time

# Project-local modules (Document, Clusterer, distances) are assumed to be
# importable from this script's package.


def main():
    # Start the Stanford NER server in the background.
    p = subprocess.Popen(
        "java -mx1000m -cp stanford-ner/stanford-ner.jar "
        "edu.stanford.nlp.ie.NERServer "
        "-loadClassifier stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz "
        "-port 1239",
        shell=True)

    # Wait ten seconds to make sure NER is up and running.
    time.sleep(10)

    # Load the articles and wrap each one in a Document.
    with open('data/data.json', 'r') as datafile:
        data = json.load(datafile)
    documents = []
    for article in data['articles']:
        document = Document(article)
        documents.append(document)

    # Cluster the documents with cosine distance; the number of topics is
    # taken from the command line.
    distances.register_all_distances()
    d = distances.get_distance("Cosine")
    clus = Clusterer(documents, d, int(sys.argv[1]))
    clus.process_documents()

    print("Clustering finished ======================================================= \n")
    clus.print_all_topics()

    # Kill the NER server. With shell=True, p.pid is the shell's PID, so the
    # java process is assumed to be the very next PID.
    os.kill(p.pid + 1, 9)
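# Sketch (not from the original script): a less fragile way to stop the NER
# server. os.kill(p.pid + 1, 9) assumes the java process received the PID
# right after the shell's, which is not guaranteed. Launching without a shell
# makes the Popen handle own the java process directly, so it can be
# terminated through the handle. The command string and port mirror main()
# above; the function name and structure are illustrative only.
def run_with_managed_ner():
    import shlex

    cmd = ("java -mx1000m -cp stanford-ner/stanford-ner.jar "
           "edu.stanford.nlp.ie.NERServer "
           "-loadClassifier stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz "
           "-port 1239")
    ner = subprocess.Popen(shlex.split(cmd))  # no shell: ner.pid is the java PID
    try:
        time.sleep(10)
        # ... clustering work would go here ...
    finally:
        ner.terminate()  # ask the server to exit
        ner.wait()       # reap the child process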
def _cluster_data(self, ps: int, data: List[List], ground_truth: List,
                  desired_k: int):
    # constraints_fraction, include_older_algorithms, train_phase and
    # nbr_competing_methods are assumed to be module-level configuration.
    clu_lss = Clusterer(dtm=data,
                        true_labels=ground_truth,
                        max_nbr_clusters=len(data) - 1,
                        min_nbr_clusters=1,
                        min_cluster_size=2,
                        metric="cosine",
                        desired_n_clusters=desired_k)

    # Run SPKMeans 10 times to get mean performance.
    # This is also what supplies the estimated k for the Clusterer.
    # TODO: decouple k estimation from the evaluation
    norm_spk_pred, norm_spk_evals = clu_lss.evaluate(
        alg_option=Clusterer.alg_spherical_k_means,
        param_init="k-means++")

    cop_kmeans_pred, cop_kmeans_evals = clu_lss.evaluate(
        alg_option=Clusterer.alg_cop_kmeans,
        param_constraints_size=constraints_fraction,
        param_copkmeans_init="random")

    if include_older_algorithms:
        norm_hdbscan_pred, norm_hdbscan_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_h_dbscan)

        norm_ms_pred, norm_ms_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_mean_shift)

        # norm_xm_pred, norm_xm_evals = clu_lss.evaluate(
        #     alg_option=Clusterer.alg_x_means)

        nhac_complete_pred, nhac_complete_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_hac,
            param_linkage="complete")

        nhac_s_pred, nhac_s_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_hac,
            param_linkage="single")

        nhac_a_pred, nhac_a_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_hac,
            param_linkage="average")

        n_optics_pred, n_optics_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_optics)

    # Baselines
    bl_rand_pred, bl_rand_evals = clu_lss.evaluate(
        alg_option=Clusterer.bl_random)
    bl_singleton_pred, bl_singleton_evals = clu_lss.evaluate(
        alg_option=Clusterer.bl_singleton)

    nhdp_pred, nhdp_evals = clu_lss.eval_cluster_hdp()
    ntrue_pred, ntrue_evals = clu_lss.eval_true_clustering()

    # SOTA - Gomez et al.: HAC and Log-Entropy with 20k features.
    # Not applicable for training data.
    if not train_phase:
        sota_pred_path_le = (r"D:\College\DKEM\Thesis\AuthorshipClustering"
                             r"\Code\clusterPAN2017-master\output_LogEnt"
                             f"\\problem{ps:03d}\\clustering.json")
        sota_predicted_le = Tools.load_true_clusters_into_vector(
            sota_pred_path_le)
        sota_pred_le, sota_evals_le = clu_lss.eval_sota(
            sota_predicted=sota_predicted_le)

        sota_pred_path_tf = (r"D:\College\DKEM\Thesis\AuthorshipClustering"
                             r"\Code\clusterPAN2017-master\output_Tf"
                             f"\\problem{ps:03d}\\clustering.json")
        sota_predicted_tf = Tools.load_true_clusters_into_vector(
            sota_pred_path_tf)
        sota_pred_tf, sota_evals_tf = clu_lss.eval_sota(
            sota_predicted=sota_predicted_tf)

        sota_pred_path_tfidf = (
            r"D:\College\DKEM\Thesis\AuthorshipClustering"
            r"\Code\clusterPAN2017-master\output_TfIdf"
            f"\\problem{ps:03d}\\clustering.json")
        sota_predicted_tfidf = Tools.load_true_clusters_into_vector(
            sota_pred_path_tfidf)
        sota_pred_tfidf, sota_evals_tfidf = clu_lss.eval_sota(
            sota_predicted=sota_predicted_tfidf)
    else:
        # Build some placeholders only, as SOTA isn't required for training.
        # sota_pred_le = [0] * len(data)
        # sota_pred_tf = [0] * len(data)
        # sota_pred_tfidf = [0] * len(data)
        placebo_ret = {}
        placebo_ret.update({"nmi": None,
                            "ami": None,
                            "ari": None,
                            "fms": None,
                            "v_measure": None,
                            "bcubed_precision": None,
                            "bcubed_recall": None,
                            "bcubed_fscore": None,
                            "Silhouette": None,
                            "Calinski_harabasz": None,
                            "Davies_Bouldin": None
                            # Here go the unsupervised indices
                            })
        sota_evals_le = placebo_ret
        sota_evals_tf = placebo_ret
        sota_evals_tfidf = placebo_ret

    # Control whether k is estimated or the true k is replicated:
    if desired_k != 0:
        k_trend = clu_lss.cand_k
        k_trend.append(1 + max(clu_lss.true_labels))
    else:
        k_trend = [1 + max(clu_lss.true_labels)
                   ] * (nbr_competing_methods + 1)

    result = Tools.form_problemset_result_dictionary(
        dictionaries=[
            # ispk_evals, norm_spk_evals, norm_hdbscan_evals,
            norm_spk_evals, norm_hdbscan_evals, norm_ms_evals,
            # norm_xm_evals,
            nhac_complete_evals, nhac_s_evals, nhac_a_evals,
            n_optics_evals, cop_kmeans_evals,
            bl_rand_evals, bl_singleton_evals, nhdp_evals,
            sota_evals_tf, sota_evals_tfidf, sota_evals_le,
            ntrue_evals],
        identifiers=[
            # "iSpKmeans",
            "E_SPKMeans", "E_HDBSCAN", "E_Mean_Shift",
            # "XMeans",
            "E_HAC_C", "E_HAC_Single", "E_HAC_Average",
            "E_OPTICS", "E_COP_KMeans",
            "BL_r", "BL_s", "S_HDP",
            "BL_SOTA_tf", "BL_SOTA_tfidf", "BL_SOTA_le",
            "Labels"],
        problem_set=ps)

    return result, k_trend
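# Sketch (hypothetical, not part of the original class): one way a caller
# could drive _cluster_data over several problem sets and collect the
# per-problem result dictionaries and k trends. Only _cluster_data and the
# desired_k convention (0 replicates the true k, anything else records the
# Clusterer's candidate k estimates) come from the code above; every other
# name here is illustrative.
def run_all_problem_sets(self, problem_sets, corpora, truths, desired_k=0):
    all_results = []
    k_trends = {}
    for ps in problem_sets:
        result, k_trend = self._cluster_data(ps=ps,
                                             data=corpora[ps],
                                             ground_truth=truths[ps],
                                             desired_k=desired_k)
        all_results.append(result)
        k_trends[ps] = k_trend
    return all_results, k_trends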
from clustering import Clusterer
from confo.home.models import *
from dblogging import LogOrCache
import sys

TOPN = 10
VECTOR_SIZE = 20
UNIQUE_WORDS_MAX = 2500


def get_table(mode):
    return LogOrCache(["fromauth", "toauth", "similarity"],
                      "similar_authors.txt", mode, "similar_authors")


def clear_table():
    SimilarAuthors.objects.all().delete()


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print("argument: path_to_clustering_python_bin")
        sys.exit(-1)

    # Word counts per author, most frequent words first.
    q = """SELECT pa.author_id, w.word, count(*) AS total
           from words as w, papers_authors as pa
           where pa.paper_id = w.pid
           group by pa.author_id, w.word
           order by pa.author_id, total DESC;"""

    c = Clusterer(get_table, clear_table, 'fromauth', 'toauth', q,
                  TOPN, VECTOR_SIZE, UNIQUE_WORDS_MAX, sys.argv[1])
    c.build_similar()
def problem_set_run(problem_set_id: int, n_clusters: int, seed: int,
                    configuration: str, drop_uncommon: bool,
                    verbose: bool, infer_lss: bool = False):
    problem_nbr = f"{problem_set_id:03d}"
    # Define an LSS modeller to represent documents in LSS non-sparse space.
    # HDP with a Gibbs sampler is used as-is from:
    # https://github.com/blei-lab/hdp
    # Adjust the parameters according to the preferred configuration.
    if configuration == config_sparse:
        eta = 0.3
        gamma = 0.1
        alpha = 0.1
    elif configuration == config_dense:
        eta = 0.8
        gamma = 1.5
        alpha = 1.5
    else:
        eta = 0.5
        gamma = 1.0
        alpha = 1.0

    Modeller = LssHdpModeller(
        hdp_path=r"..\hdps\hdp",
        input_docs_path=r"..\..\Datasets\pan17_train\problem{}".format(
            problem_nbr),
        ldac_filename=r"ldac_corpus",
        hdp_output_dir=r"hdp_lss",
        hdp_iters=10000,
        hdp_seed=seed,
        hdp_sample_hyper=False,
        hdp_eta=eta,
        hdp_gamma_s=gamma,
        hdp_alpha_s=alpha,
        word_grams=1,
        drop_uncommon=drop_uncommon,
        freq_threshold=1,
        verbose=verbose)

    # Infer the BoW and LSS representations of the documents
    try:
        # Load, project and visualise the data
        plain_docs, bow_rep_docs, lss_rep_docs = Modeller.get_corpus_lss(
            infer_lss, bim=False)

        # Begin clustering attempts
        true_labels_path = (r"..\..\Datasets\pan17_train\truth"
                            r"\problem{}\clustering.json"
                            ).format(problem_nbr)
        ground_truth = Tools.load_true_clusters_into_vector(true_labels_path)

        # Normalise the data unless BIM is used!
        clu_lss = Clusterer(dtm=Tools.normalise_data(data=lss_rep_docs),
                            true_labels=ground_truth,
                            max_nbr_clusters=len(lss_rep_docs) - 1,
                            min_nbr_clusters=1,
                            min_cluster_size=2,
                            metric="cosine",
                            desired_n_clusters=n_clusters)

        norm_spk_pred, norm_spk_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_spherical_k_means,
            param_init="k-means++")

        # ispk_pred, ispk_evals = clu_lss.evaluate(
        #     alg_option=Clusterer.alg_iterative_spherical_k_means,
        #     param_init="k-means++")

        norm_hdbscan_pred, norm_hdbscan_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_h_dbscan)

        norm_ms_pred, norm_ms_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_mean_shift)

        # norm_xm_pred, norm_xm_evals = clu_lss.evaluate(
        #     alg_option=Clusterer.alg_x_means)

        nhac_complete_pred, nhac_complete_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_hac,
            param_linkage="complete")

        nhac_s_pred, nhac_s_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_hac,
            param_linkage="single")

        nhac_a_pred, nhac_a_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_hac,
            param_linkage="average")

        n_optics_pred, n_optics_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_optics)

        # Baselines
        bl_rand_pred, bl_rand_evals = clu_lss.evaluate(
            alg_option=Clusterer.bl_random)
        bl_singleton_pred, bl_singleton_evals = clu_lss.evaluate(
            alg_option=Clusterer.bl_singleton)

        nhdp_pred, nhdp_evals = clu_lss.eval_cluster_hdp()
        ntrue_pred, ntrue_evals = clu_lss.eval_true_clustering()

        # SOTA - Gomez et al.: HAC and Log-Entropy with 20k features.
        sota_pred_path = (r"D:\College\DKEM\Thesis\AuthorshipClustering\Code"
                          r"\clusterPAN2017-master\train_out_LogEnt"
                          f"\\problem{problem_nbr}\\clustering.json")
        sota_predicted = Tools.load_true_clusters_into_vector(sota_pred_path)
        sota_pred, sota_evals = clu_lss.eval_sota(
            sota_predicted=sota_predicted)

        # Return the results:
        return (Tools.form_problemset_result_dictionary(
            dictionaries=[
                # ispk_evals, norm_spk_evals, norm_hdbscan_evals,
                norm_spk_evals, norm_hdbscan_evals, norm_ms_evals,
                # norm_xm_evals,
                nhac_complete_evals, nhac_s_evals, nhac_a_evals,
                n_optics_evals,
                bl_rand_evals, bl_singleton_evals, nhdp_evals,
                sota_evals, ntrue_evals],
            identifiers=[
                # "iSpKmeans",
                "E_SPKMeans", "E_HDBSCAN", "E_Mean_Shift",
                # "XMeans",
                "E_HAC_C", "E_HAC_Single", "E_HAC_Average",
                "E_OPTICS",
                "BL_r", "BL_s", "S_HDP",
                "BL_SOTA", "Labels"],
            problem_set=problem_set_id),
            ground_truth, lss_rep_docs, plain_docs, clu_lss)

    except FileNotFoundError:
        print("Please run HDP on these data first.")
        return None  # signal the caller that this problem set was skipped
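# Sketch (hypothetical driver, not from the original module): how
# problem_set_run could be invoked over the PAN17 training problem sets and
# its five-element return value unpacked. The number of problem sets, seed,
# n_clusters value and configuration string are illustrative only; the
# config_* constants are the ones referenced inside problem_set_run.
def run_training_problem_sets(n_problem_sets=60, seed=33, configuration=""):
    collected = []
    for ps_id in range(1, n_problem_sets + 1):
        ret = problem_set_run(problem_set_id=ps_id,
                              n_clusters=0,
                              seed=seed,
                              configuration=configuration,
                              drop_uncommon=True,
                              verbose=False,
                              infer_lss=False)
        if ret is None:
            # HDP output was missing for this problem set; skip it.
            continue
        results_dict, ground_truth, lss_docs, plain_docs, clusterer = ret
        collected.append(results_dict)
    return collected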
import numpy as np
import ast
import operator
from dateutil.parser import parse
from keras.models import Sequential
from keras.layers import (Dense, Dropout, Flatten, Conv2D, MaxPooling2D,
                          BatchNormalization)
from keras import regularizers

from features import LabelTrainer
from clustering import Clusterer
from dataset import read_dataset

# Fifth approach
dir = 'images'

# Cluster the images first.
c = Clusterer()
c.cluster()

print('Loading datasets and pre-processing...')
dataset, m = read_dataset()

# Train the label model on the extracted features.
labelTrainer = LabelTrainer()
labelTrainer.read_features()
labelTrainer.build_model(encoding_dim=100)
labelTrainer.train(epch=15)

filenames = labelTrainer.filenames
matrixFilenames = []
positivitymatrix = []
positivity = []