def multi_evaluation(method, filename, view_names, weights, norm = 'l2', seed="random", post = "direct", data_size=-1):
    """
    Evaluate a baseline (k-means, NMF or SVD) on the combined view of a multi-view dataset.

    The views are loaded with dl.loadMATdata, merged into a single matrix with
    dl.combineData, and the chosen baseline is then run n_runs times. Per-run
    accuracy, F1 and NMI are printed, followed by their mean and standard
    deviation over all runs and the total running time.

    NOTE: n_runs is not a parameter; it is read from the enclosing (module)
    scope and must be defined there.

    :param method (type: string). Baseline to run, case-insensitive.
        Supported values are "k-means" (or "kmeans"), "nmf" and "svd".
    :param filename (type: string). The path of the input multi-view data (.MAT format).
    :param view_names (type: list<string>). View names of the input multi-view data.
    :param weights (type: list<int>). The weight of each view used to build the combined view.
        Must have the same length as view_names.
    :param norm (type: string). Normalization strategy on the input data. Supported norm methods include:
        'l2': each item vector is normalized by its Euclidean length (i.e. l2 distance).
        'l1': each item vector is normalized by its sum of all elements (i.e. l1 distance).
        'l0': the whole matrix is normalized by the sum of all elements (i.e. l1 normalization on the whole matrix).
    :param seed (type: string). Initialization method; only forwarded to the "nmf" baseline.
        Values can be 'k-means' and 'random':
        'k-means': initialize W and H matrix using k-means results. The details are seen in paper [1] Section 4.5
        'random': randomly initialize W and H matrix.
    :param post (type: string). Post processing on W matrix (m*k) to generate clustering result;
        only forwarded to the "nmf" and "svd" baselines. Values can be 'direct' and 'k-means':
        'direct': for each item vector (m*1), use the element with largest value as its cluster assignment.
        'k-means': perform k-means on W matrix to get cluster assignment.
    :param data_size (type: int). Select the first data_size items to run clustering algorithm.
        When the value is -1, the clustering algorithm is run on all items.
        This parameter is for a quick check of clustering algorithms in case the input data is too large.
    :return: None. Results are only printed. Invalid arguments (unknown method,
        or view_names and weights of different length) print an error message
        and return None before any data is loaded.
    """
    method = method.lower()
    if method not in ("k-means", "kmeans", "nmf", "svd"):
        print("Error! Wrong input method name.")
        return None
    # Exactly one (integer) weight is expected per view.
    if len(view_names) != len(weights):
        print("Error! Length of view_names is not equal to the length of weights!")
        return None

    datas, names, groundtruth, cluster_k = dl.loadMATdata(filename, view_names, data_size)
    data = dl.combineData(datas, weights, norm)

    print("Running multi- %s(k=%d,norm=%s,seed=%s,post=%s) for %s, size = %s, #runs: %d"
          % (method, cluster_k, norm, seed, post, names, str(data.shape), n_runs))
    print("view_names = %s, weights = %s" % (str(names), str(weights)))

    NMIs, As, F1s = [], [], []
    t0 = time()
    for i_run in range(1, n_runs + 1):
        t1 = time()
        if method in ("kmeans", "k-means"):
            labels = kmeans(data, cluster_k, norm)[0]
        elif method == "nmf":
            labels = nmf(data, cluster_k, norm, seed, post, groundtruth)[0]
        else:
            # method == "svd" (guaranteed by the validation above).
            # NOTE(review): unlike kmeans/nmf the result is not indexed with [0];
            # presumably svd returns the labels directly -- confirm against svd().
            labels = svd(data, cluster_k, norm, post)

        # Only the first three scores (NMI, accuracy, F1) are used. Index them
        # instead of unpacking a fixed-length tuple so that this function and
        # conmf_evaluation agree regardless of how many extra values
        # evaluation_scores returns.
        scores = evaluation_scores(groundtruth, labels)
        NMI, A, F1 = scores[0], scores[1], scores[2]
        print("\t %d-th run(time=%ds),<Acc, F1, NMI>\t%f\t%f\t%f" % (i_run, time() - t1, A, F1, NMI))
        NMIs.append(NMI)
        As.append(A)
        F1s.append(F1)

    print("Results of %d runs (mean,std_var):\n\t Acc: %f, %f\n\t F1 : %f, %f\n\t NMI: %f, %f"
          % (n_runs, np.mean(As), np.std(As), np.mean(F1s), np.std(F1s), np.mean(NMIs), np.std(NMIs)))
    print("Running time: %fs" % (time() - t0))
def conmf_evaluation(regu_method, filename, view_names, weights, regu_weights, norm = 'l2', seed = "random", post = "direct", data_size = -1):
    """
    Evaluate CoNMF for multi-view clustering.

    The views are loaded with dl.loadMATdata and conmf is run n_runs times on
    them. Per-run accuracy, F1 and NMI are printed (the log messages label them
    as scores on the last view), followed by their mean and standard deviation
    over all runs and the total running time.

    NOTE: n_runs is not a parameter; it is read from the enclosing (module)
    scope and must be defined there.

    :param regu_method (type: string). Regularization method forwarded to conmf.
        Supported methods include "pair-wise", "cluster-wise"
    :param filename (type: string). The path of the input multi-view data (.MAT format).
    :param view_names (type: list<string>). View names of the input multi-view data.
    :param weights (type: list<int>). [weight_s]. Each element weight_s is an integer,
        denoting the weight of the view in CoNMF factorization (i.e. lambda_s in paper [1])
    :param regu_weights (type: list, 2-dimension). [[weight_st]] Each element weight_st is an integer,
        denoting the weight of view pair <s,t> in CoNMF regularization (i.e. lambda_st in paper [1])
    :param norm (type: string). Normalization scheme in CoNMF initialization and factorization.
        Values can be 'l2', 'l1' and 'l0':
        'l2': each item vector is normalized by its Euclidean length (i.e. l2 distance).
        'l1': each item vector is normalized by its sum of all elements (i.e. l1 distance).
        'l0': the whole matrix is normalized by the sum of all elements (i.e. l1 normalization on the whole matrix).
    :param seed (type: string). Initialization method in CoNMF. Values can be 'k-means' and 'random':
        'k-means': initialize W and H matrix using k-means results. The details are seen in paper [1] Section 4.5
        'random': randomly initialize W and H matrix.
    :param post (type: string). Post processing on W matrix (m*k) to generate clustering result.
        Values can be 'direct' and 'k-means':
        'direct': for each item vector (m*1), use the element with largest value as its cluster assignment.
        'k-means': perform k-means on W matrix to get cluster assignment.
    :param data_size (type: int). Select the first data_size items to run clustering algorithm.
        When the value is -1, the clustering algorithm is run on all items.
        This parameter is for a quick check of clustering algorithms in case the input data is too large.
    :return: None. Results are only printed.
    """
    # names (second return value) is not needed here; only the data, the
    # ground-truth labels and the number of clusters are used.
    datas, _names, groundtruth, cluster_k = dl.loadMATdata(filename, view_names, data_size)

    print("Running CoNMF-%s (k=%d,norm=%s,seed=%s,post=%s,weights=%s,regularization_weights=%s)"
          % (regu_method, cluster_k, norm, seed, post, str(weights), str(regu_weights)))

    NMIs, As, F1s = [], [], []
    t0 = time()
    for i_run in range(1, n_runs + 1):
        print("\t %d-th run starts. Initializing using %s ..." % (i_run, seed))
        t1 = time()
        results = conmf(datas, cluster_k, weights, regu_weights, norm, seed, post, regu_method, groundtruth)
        # The first element of the result holds the cluster labels; the factor
        # matrices that follow it are not needed for scoring.
        labels = results[0]

        # Only the first three scores (NMI, accuracy, F1) are used. Index them
        # instead of unpacking a fixed-length tuple so that this function and
        # multi_evaluation agree regardless of how many extra values
        # evaluation_scores returns.
        scores = evaluation_scores(groundtruth, labels)
        NMI, A, F1 = scores[0], scores[1], scores[2]
        print("\t %d-th run ends (time=%ds). <Acc, F1, NMI>(last view)\t%f\t%f\t%f"
              % (i_run, time() - t1, A, F1, NMI))
        NMIs.append(NMI)
        As.append(A)
        F1s.append(F1)

    print("Results (on the last view) of %d runs (mean,std_var):\n\t Acc: %f, %f\n\t F1 : %f, %f\n\t NMI: %f, %f"
          % (n_runs, np.mean(As), np.std(As), np.mean(F1s), np.std(F1s), np.mean(NMIs), np.std(NMIs)))
    print("Running time: %fs" % (time() - t0))