Example #1
# NOTE: the project-module imports below (Document, distances, Clusterer)
# are assumptions about the package layout; adjust them as needed.
import json
import os
import subprocess
import sys
import time

import distances
from clustering import Clusterer
from document import Document


def main():
	
	#start Stanford NER
	p = subprocess.Popen("java -mx1000m -cp stanford-ner/stanford-ner.jar edu.stanford.nlp.ie.NERServer -loadClassifier stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz -port 1239", shell=True)
	#wait ten sec to make sure NER is up and running
	time.sleep(10)

	datafile = open('data/data.json', 'r')
	data = json.load(datafile)
	documents = []
	for article in data['articles']:
		document = Document(article)
		documents.append(document)

	distances.register_all_distances()
	d = distances.get_distance("Cosine")
	clus = Clusterer(documents, d, int(sys.argv[1]))
	clus.process_documents()
	print "Clustering finished ======================================================= \n"
	clus.print_all_topics()

	#kill NER; with shell=True p.pid is the shell's PID, so the java process is assumed to sit at p.pid + 1 (fragile, see the note below)
	os.kill(p.pid+1, 9)
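
Side note (not part of the original example): because the server is launched with shell=True, p.pid refers to the spawned shell rather than the java process, which is why the snippet kills p.pid + 1. A minimal sketch of a more direct approach, reusing the jar, classifier and port from the example above, could look like this:

import subprocess
import time

def start_ner_server(port=1239):
    # Launch the NER server without a shell so p.pid is the java process itself
    cmd = [
        "java", "-mx1000m",
        "-cp", "stanford-ner/stanford-ner.jar",
        "edu.stanford.nlp.ie.NERServer",
        "-loadClassifier",
        "stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz",
        "-port", str(port),
    ]
    p = subprocess.Popen(cmd)
    time.sleep(10)  # crude readiness wait, as in the original; polling the port would be safer
    return p

def stop_ner_server(p):
    # Graceful shutdown instead of os.kill(..., 9)
    p.terminate()
    p.wait()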
Example #2

    # Note: constraints_fraction, include_older_algorithms, train_phase and
    # nbr_competing_methods appear to be module-level settings in the original
    # project; they are not defined in this snippet.
    def _cluster_data(self, ps: int,
                      data: List[List],
                      ground_truth: List,
                      desired_k: int):
        clu_lss = Clusterer(dtm=data,
                            true_labels=ground_truth,
                            max_nbr_clusters=len(data)-1,
                            min_nbr_clusters=1,
                            min_cluster_size=2,
                            metric="cosine",
                            desired_n_clusters=desired_k)

        # Run SPKMeans 10 times to get mean performance.
        # This also supplies the estimated k used by the Clusterer.
        # TODO: decouple the k estimation from the evaluation
        norm_spk_pred, norm_spk_evals = clu_lss.evaluate(
                alg_option=Clusterer.alg_spherical_k_means,
                param_init="k-means++")

        cop_kmeans_pred, cop_kmeans_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_cop_kmeans,
            param_constraints_size=constraints_fraction,
            param_copkmeans_init="random")

        if include_older_algorithms:
            norm_hdbscan_pred, norm_hdbscan_evals = clu_lss.evaluate(
                    alg_option=Clusterer.alg_h_dbscan)

            norm_ms_pred, norm_ms_evals = clu_lss.evaluate(
                    alg_option=Clusterer.alg_mean_shift)

            # norm_xm_pred, norm_xm_evals = clu_lss.evaluate(
            #         alg_option=Clusterer.alg_x_means)

            nhac_complete_pred, nhac_complete_evals = clu_lss.evaluate(
                    alg_option=Clusterer.alg_hac,
                    param_linkage="complete")

            nhac_s_pred, nhac_s_evals = clu_lss.evaluate(
                    alg_option=Clusterer.alg_hac,
                    param_linkage="single")

            nhac_a_pred, nhac_a_evals = clu_lss.evaluate(
                    alg_option=Clusterer.alg_hac,
                    param_linkage="average")

            n_optics_pred, n_optics_evals = clu_lss.evaluate(
                    alg_option=Clusterer.alg_optics)

        # Baselines
        bl_rand_pred, bl_rand_evals = clu_lss.evaluate(
                alg_option=Clusterer.bl_random)
        bl_singleton_pred, bl_singleton_evals = clu_lss.evaluate(
                alg_option=Clusterer.bl_singleton)

        nhdp_pred, nhdp_evals = clu_lss.eval_cluster_hdp()
        ntrue_pred, ntrue_evals = clu_lss.eval_true_clustering()

        # SOTA - Gomez et al. HAC and Log-Entropy with 20k features
        # Not applicable for training data
        if not train_phase:
            sota_pred_path_le = (r"D:\College\DKEM\Thesis\AuthorshipClustering"
                                 r"\Code\clusterPAN2017-master\output_LogEnt"
                                 f"\\problem{ps:03d}\\clustering.json")
            sota_predicted_le = Tools.load_true_clusters_into_vector(
                    sota_pred_path_le)
            sota_pred_le, sota_evals_le = clu_lss.eval_sota(
                    sota_predicted=sota_predicted_le)

            sota_pred_path_tf = (r"D:\College\DKEM\Thesis\AuthorshipClustering"
                                 r"\Code\clusterPAN2017-master\output_Tf"
                                 f"\\problem{ps:03d}\\clustering.json")
            sota_predicted_tf = Tools.load_true_clusters_into_vector(
                    sota_pred_path_tf)
            sota_pred_tf, sota_evals_tf = clu_lss.eval_sota(
                    sota_predicted=sota_predicted_tf)

            sota_pred_path_tfidf = (
                r"D:\College\DKEM\Thesis\AuthorshipClustering"
                r"\Code\clusterPAN2017-master\output_TfIdf"
                f"\\problem{ps:03d}\\clustering.json")
            sota_predicted_tfidf = Tools.load_true_clusters_into_vector(
                    sota_pred_path_tfidf)
            sota_pred_tfidf, sota_evals_tfidf = clu_lss.eval_sota(
                    sota_predicted=sota_predicted_tfidf)
        else:
            # Build placeholders only, as SOTA isn't required during training
            # sota_pred_le = [0] * len(data)
            # sota_pred_tf = [0] * len(data)
            # sota_pred_tfidf = [0] * len(data)
            placebo_ret = {}
            placebo_ret.update({"nmi": None,
                                "ami": None,
                                "ari": None,
                                "fms": None,
                                "v_measure": None,
                                "bcubed_precision": None,
                                "bcubed_recall": None,
                                "bcubed_fscore": None,
                                "Silhouette": None,
                                "Calinski_harabasz": None,
                                "Davies_Bouldin": None
                                # unsupervised indices go here (see the
                                # metrics sketch after this method)
                                })
            sota_evals_le = placebo_ret
            sota_evals_tf = placebo_ret
            sota_evals_tfidf = placebo_ret

        # Control whether k is estimated or the true k is replicated:
        if desired_k != 0:
            k_trend = clu_lss.cand_k
            k_trend.append(1 + max(clu_lss.true_labels))
        else:
            k_trend = [1 + max(clu_lss.true_labels)
                       ] * (nbr_competing_methods + 1)

        result = Tools.form_problemset_result_dictionary(
                dictionaries=[
                        # ispk_evals, norm_spk_evals, norm_hdbscan_evals,
                        norm_spk_evals, norm_hdbscan_evals,
                        norm_ms_evals,  # norm_xm_evals,
                        nhac_complete_evals, nhac_s_evals, nhac_a_evals,
                        n_optics_evals, cop_kmeans_evals,
                        bl_rand_evals, bl_singleton_evals,
                        nhdp_evals,
                        sota_evals_tf, sota_evals_tfidf, sota_evals_le,
                        ntrue_evals
                        ],
                identifiers=[  # "iSpKmeans",
                             "E_SPKMeans", "E_HDBSCAN",
                             "E_Mean_Shift",  # "XMeans",
                             "E_HAC_C", "E_HAC_Single", "E_HAC_Average",
                             "E_OPTICS", "E_COP_KMeans",
                             "BL_r", "BL_s", "S_HDP",
                             "BL_SOTA_tf", "BL_SOTA_tfidf", "BL_SOTA_le",
                             "Labels"],
                problem_set=ps)

        return result, k_trend
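
The evaluation dictionaries gathered above use standard external and internal clustering indices (the same keys that appear in the placeholder dictionary). A minimal sketch of how most of them could be computed with scikit-learn is shown below; sketch_eval is a hypothetical helper, the b-cubed scores are omitted because scikit-learn does not provide them, and the cosine metric for the silhouette mirrors the metric="cosine" argument passed to the Clusterer.

from sklearn import metrics

def sketch_eval(data, true_labels, predicted):
    # External (supervised) indices
    evals = {
        "nmi": metrics.normalized_mutual_info_score(true_labels, predicted),
        "ami": metrics.adjusted_mutual_info_score(true_labels, predicted),
        "ari": metrics.adjusted_rand_score(true_labels, predicted),
        "fms": metrics.fowlkes_mallows_score(true_labels, predicted),
        "v_measure": metrics.v_measure_score(true_labels, predicted),
    }
    # Internal (unsupervised) indices require 1 < n_clusters < n_samples
    if 1 < len(set(predicted)) < len(data):
        evals["Silhouette"] = metrics.silhouette_score(
            data, predicted, metric="cosine")
        evals["Calinski_harabasz"] = metrics.calinski_harabasz_score(
            data, predicted)
        evals["Davies_Bouldin"] = metrics.davies_bouldin_score(
            data, predicted)
    return evals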
Example #3
from clustering import Clusterer
from confo.home.models import *
from dblogging import LogOrCache
import sys

TOPN = 10
VECTOR_SIZE = 20
UNIQUE_WORDS_MAX = 2500

def get_table(mode):
    return LogOrCache(["fromauth","toauth","similarity"], "similar_authors.txt", mode, "similar_authors")

def clear_table():
    SimilarAuthors.objects.all().delete()    

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print "argument: path_to_clustering_python_bin"
        sys.exit(-1)
    q = """SELECT pa.author_id, w.word, count(*) AS total
           from words as w, papers_authors as pa
           where pa.paper_id = w.pid
           group by pa.author_id, w.word
           order by pa.author_id, total DESC;"""
    c = Clusterer(get_table, clear_table, 'fromauth', 'toauth', q, TOPN, VECTOR_SIZE, UNIQUE_WORDS_MAX, sys.argv[1])
    c.build_similar()
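
The SQL query above produces (author_id, word, total) rows ordered by author and descending count; Clusterer.build_similar presumably turns these into per-author word vectors and stores pairwise similarities. A hypothetical, self-contained illustration of that idea (not the project's actual implementation; author_similarities and its parameters are made up for this sketch) could look like this:

from collections import defaultdict
from math import sqrt

def author_similarities(rows, topn=10):
    # rows: (author_id, word, total) tuples ordered by author and descending count
    vectors = defaultdict(dict)
    for author, word, total in rows:
        if len(vectors[author]) < topn:
            vectors[author][word] = float(total)  # keep the top-N most frequent words

    def cosine(u, v):
        shared = set(u) & set(v)
        dot = sum(u[w] * v[w] for w in shared)
        nu = sqrt(sum(x * x for x in u.values()))
        nv = sqrt(sum(x * x for x in v.values()))
        return dot / (nu * nv) if nu and nv else 0.0

    authors = list(vectors)
    return [(a, b, cosine(vectors[a], vectors[b]))
            for i, a in enumerate(authors)
            for b in authors[i + 1:]]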
Example #4

def problem_set_run(problem_set_id: int,
                    n_clusters: int,
                    seed: int,
                    configuration: str,
                    drop_uncommon: bool,
                    verbose: bool,
                    infer_lss: bool = False):
    problem_nbr = f"{problem_set_id:03d}"
    # Define an LSS modeller to represent documents in a non-sparse LSS space.
    # The HDP Gibbs sampler is used as-is from:
    #   https://github.com/blei-lab/hdp

    # Adjust the hyperparameters according to the chosen configuration
    if configuration == config_sparse:
        eta = 0.3
        gamma = 0.1
        alpha = 0.1
    elif configuration == config_dense:
        eta = 0.8
        gamma = 1.5
        alpha = 1.5
    else:
        eta = 0.5
        gamma = 1.0
        alpha = 1.0

    Modeller = LssHdpModeller(
            hdp_path=r"..\hdps\hdp",
            input_docs_path=r"..\..\Datasets\pan17_train\problem{}".format(
                    problem_nbr),
            ldac_filename=r"ldac_corpus",
            hdp_output_dir=r"hdp_lss",
            hdp_iters=10000,
            hdp_seed=seed,
            hdp_sample_hyper=False,
            hdp_eta=eta,
            hdp_gamma_s=gamma,
            hdp_alpha_s=alpha,
            word_grams=1,
            drop_uncommon=drop_uncommon,
            freq_threshold=1,
            verbose=verbose)

    # Infer the BoW and LSS representations of the documents
    try:
        # Load, project and visualise the data
        plain_docs, bow_rep_docs, lss_rep_docs = Modeller.get_corpus_lss(
                infer_lss,
                bim=False)

        # Begin Clustering Attempts
        true_labels_path = (r"..\..\Datasets\pan17_train\truth"
                            r"\problem{}\clustering.json"
                            ).format(problem_nbr)

        ground_truth = Tools.load_true_clusters_into_vector(true_labels_path)

        # Normalise the data if BIM is not used!
        clu_lss = Clusterer(dtm=Tools.normalise_data(data=lss_rep_docs),
                            true_labels=ground_truth,
                            max_nbr_clusters=len(lss_rep_docs)-1,
                            min_nbr_clusters=1,
                            min_cluster_size=2,
                            metric="cosine",
                            desired_n_clusters=n_clusters)

        norm_spk_pred, norm_spk_evals = clu_lss.evaluate(
                alg_option=Clusterer.alg_spherical_k_means,
                param_init="k-means++")

#        ispk_pred, ispk_evals = clu_lss.evaluate(
#                alg_option=Clusterer.alg_iterative_spherical_k_means,
#                param_init="k-means++")

        norm_hdbscan_pred, norm_hdbscan_evals = clu_lss.evaluate(
                alg_option=Clusterer.alg_h_dbscan)

        norm_ms_pred, norm_ms_evals = clu_lss.evaluate(
                alg_option=Clusterer.alg_mean_shift)

#        norm_xm_pred, norm_xm_evals = clu_lss.evaluate(
#                alg_option=Clusterer.alg_x_means)

        nhac_complete_pred, nhac_complete_evals = clu_lss.evaluate(
                alg_option=Clusterer.alg_hac,
                param_linkage="complete")

        nhac_s_pred, nhac_s_evals = clu_lss.evaluate(
                alg_option=Clusterer.alg_hac,
                param_linkage="single")

        nhac_a_pred, nhac_a_evals = clu_lss.evaluate(
                alg_option=Clusterer.alg_hac,
                param_linkage="average")

        n_optics_pred, n_optics_evals = clu_lss.evaluate(
                alg_option=Clusterer.alg_optics)

        # Baselines
        bl_rand_pred, bl_rand_evals = clu_lss.evaluate(
                alg_option=Clusterer.bl_random)
        bl_singleton_pred, bl_singleton_evals = clu_lss.evaluate(
                alg_option=Clusterer.bl_singleton)

        nhdp_pred, nhdp_evals = clu_lss.eval_cluster_hdp()
        ntrue_pred, ntrue_evals = clu_lss.eval_true_clustering()

        # SOTA - Gomez et al. HAC and Log-Entropy with 20k features
        sota_pred_path = (r"D:\College\DKEM\Thesis\AuthorshipClustering\Code"
                          r"\clusterPAN2017-master\train_out_LogEnt"
                          f"\\problem{problem_nbr}\\clustering.json")
        sota_predicted = Tools.load_true_clusters_into_vector(sota_pred_path)
        sota_pred, sota_evals = clu_lss.eval_sota(
                sota_predicted=sota_predicted)

        # Return the results:
        return (Tools.form_problemset_result_dictionary(
                dictionaries=[
                        # ispk_evals, norm_spk_evals, norm_hdbscan_evals,
                        norm_spk_evals, norm_hdbscan_evals,
                        norm_ms_evals,  # norm_xm_evals,
                        nhac_complete_evals, nhac_s_evals, nhac_a_evals,
                        n_optics_evals, bl_rand_evals, bl_singleton_evals,
                        nhdp_evals, sota_evals, ntrue_evals
                        ],
                identifiers=[  # "iSpKmeans",
                             "E_SPKMeans", "E_HDBSCAN",
                             "E_Mean_Shift",  # "XMeans",
                             "E_HAC_C", "E_HAC_Single", "E_HAC_Average",
                             "E_OPTICS", "BL_r", "BL_s",
                             "S_HDP", "BL_SOTA", "Labels"],
                problem_set=problem_set_id),
                ground_truth,
                lss_rep_docs,
                plain_docs,
                clu_lss)

    except FileNotFoundError:
        print("Please run HDP on these data first.")
Example #5
import numpy as np
import ast
import operator
from dateutil.parser import parse
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D, BatchNormalization
from keras import regularizers
from features import LabelTrainer
from clustering import Clusterer
from dataset import read_dataset

# Fifth approach

dir = 'images'

c = Clusterer()
c.cluster()

print('Loading datasets and pre-processing...')
dataset, m = read_dataset()

labelTrainer = LabelTrainer()
labelTrainer.read_features()
labelTrainer.build_model(encoding_dim=100)
labelTrainer.train(epch=15)

filenames = labelTrainer.filenames
matrixFilenames = []

positivitymatrix = []
positivity = []