import statistics
from collections import Counter

import numpy as np
from sklearn import metrics
from sklearn.cluster import AgglomerativeClustering, SpectralClustering
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

# Repo-local helpers used below (groupItemsBySingleKeyIndex, ReadPredTrueText,
# generate_sent_vecs_toktextdata, printClusterEvaluation_list,
# combine_pred_true_txt_from_list, Print_list_pred_true_text, etc.) are
# defined elsewhere in this project; hedged sketches for two of them appear
# further down.


def Evaluate_old(listtuple_pred_true_text, ignoreMinusOne=False):
    preds = []
    trues = []
    new_listtuple_pred_true_text = []
    totalwords = 0
    for pred_true_text in listtuple_pred_true_text:
        if str(pred_true_text[1]) == '-1' and ignoreMinusOne:
            continue
        preds.append(pred_true_text[0])
        trues.append(pred_true_text[1])
        new_listtuple_pred_true_text.append(
            [pred_true_text[0], pred_true_text[1], pred_true_text[2]])
        totalwords += len(pred_true_text[2])
        # print(pred_true_text[2], totalwords)

    print("evaluate total texts=" + str(len(new_listtuple_pred_true_text)))

    score = metrics.homogeneity_score(trues, preds)
    print("homogeneity_score-whole-data: %0.8f" % score)
    score = metrics.completeness_score(trues, preds)
    print("completeness_score-whole-data: %0.8f" % score)
    score = metrics.v_measure_score(trues, preds)
    print("v_measure_score-whole-data: %0.8f" % score)
    score = metrics.normalized_mutual_info_score(trues, preds,
                                                 average_method='arithmetic')
    print("nmi_score-whole-data: %0.8f" % score)
    # score = metrics.adjusted_mutual_info_score(trues, preds)
    # print("adjusted_mutual_info_score-whole-data: %0.4f" % score)
    # score = metrics.adjusted_rand_score(trues, preds)
    # print("adjusted_rand_score-whole-data: %0.4f" % score)

    dic_tupple_class = groupItemsBySingleKeyIndex(
        new_listtuple_pred_true_text, 0)  # before 0
    dic_tupple_class_true = groupItemsBySingleKeyIndex(
        new_listtuple_pred_true_text, 1)  # before 1
    print("pred clusters=" + str(len(dic_tupple_class)) + ", true clusters=" +
          str(len(dic_tupple_class_true)))
    ComputePurity(dic_tupple_class)

    li = [
        len(dic_tupple_class_true[x]) for x in dic_tupple_class_true
        if isinstance(dic_tupple_class_true[x], list)
    ]
    print('min', min(li), 'max', max(li), 'median', statistics.median(li),
          'avg', statistics.mean(li), 'std', statistics.stdev(li),
          'sum of li', sum(li))
    print('avg words per text',
          totalwords / len(new_listtuple_pred_true_text),
          'totalwords', totalwords,
          '#texts', len(new_listtuple_pred_true_text))
    # print("---Pred distribution")
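# groupItemsBySingleKeyIndex is a repo-local helper that is not shown in this
# file. From its uses here it appears to group rows by the value at a given
# index; a minimal sketch under that assumption (named with a _sketch suffix
# to avoid shadowing the repo's real implementation, which may normalize keys
# differently):
def groupItemsBySingleKeyIndex_sketch(rows, keyIndex):
    groups = {}
    for row in rows:
        # key on the stringified value at keyIndex, e.g. the pred (0) or
        # true (1) label of a [pred, true, text] row
        groups.setdefault(str(row[keyIndex]), []).append(row)
    return groups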
def Evaluate(listtuple_pred_true_text):
    print("evaluate total texts=" + str(len(listtuple_pred_true_text)))
    preds = []
    trues = []
    for pred_true_text in listtuple_pred_true_text:
        preds.append(pred_true_text[0])
        trues.append(pred_true_text[1])

    score = metrics.homogeneity_score(trues, preds)
    print("homogeneity_score-whole-data: %0.8f" % score)
    score = metrics.completeness_score(trues, preds)
    print("completeness_score-whole-data: %0.8f" % score)
    # score = metrics.v_measure_score(trues, preds)
    # print("v_measure_score-whole-data: %0.4f" % score)
    score = metrics.normalized_mutual_info_score(trues, preds,
                                                 average_method='arithmetic')
    print("nmi_score-whole-data: %0.8f" % score)
    # score = metrics.adjusted_mutual_info_score(trues, preds)
    # print("adjusted_mutual_info_score-whole-data: %0.4f" % score)
    # score = metrics.adjusted_rand_score(trues, preds)
    # print("adjusted_rand_score-whole-data: %0.4f" % score)

    dic_tupple_class = groupItemsBySingleKeyIndex(listtuple_pred_true_text, 0)
    dic_tupple_class_true = groupItemsBySingleKeyIndex(
        listtuple_pred_true_text, 1)
    print("pred clusters=" + str(len(dic_tupple_class)) + ", true clusters=" +
          str(len(dic_tupple_class_true)))
    ComputePurity(dic_tupple_class)
    # print("---Pred distribution")
def ComputePurity(dic_tupple_class, groupByIndex=1):
    totalItems = 0
    maxGroupSizeSum = 0
    for label, pred_true_txts in dic_tupple_class.items():
        totalItems = totalItems + len(pred_true_txts)
        # print("pred label=" + label + ", #texts=" + str(len(pred_true_txts)))
        dic_tupple_class_originalLabel = groupItemsBySingleKeyIndex(
            pred_true_txts, groupByIndex)
        maxMemInGroupSize = -1000000
        maxMemOriginalLabel = ""
        for orgLabel, org_pred_true_txts in dic_tupple_class_originalLabel.items():
            # print("orgLabel label=" + orgLabel + ", #texts=" + str(len(org_pred_true_txts)))
            if maxMemInGroupSize < len(org_pred_true_txts):
                maxMemInGroupSize = len(org_pred_true_txts)
                maxMemOriginalLabel = orgLabel
        # print("\n")
        # print(str(label) + " purity=" + str(maxMemInGroupSize / len(pred_true_txts)) +
        #       ", items=" + str(len(pred_true_txts)) +
        #       ", max match#=" + str(maxMemInGroupSize))
        # print_by_group(pred_true_txts)
        maxGroupSizeSum = maxGroupSizeSum + maxMemInGroupSize

    purity = maxGroupSizeSum / float(totalItems)
    print("acc majority whole data=" + str(purity))
    return purity
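# A small self-check for ComputePurity on hypothetical data (not part of the
# original pipeline): cluster 'p0' holds true labels {a, a, b} and 'p1' holds
# {b, b}, so majority matches are 2 and 2 and purity should be
# (2 + 2) / 5 = 0.8. Rows follow the [pred, true, text] convention used above
# and the grouping helper is assumed to behave as sketched earlier.
def _demo_compute_purity():
    demo = {
        'p0': [['p0', 'a', ['t1']], ['p0', 'a', ['t2']], ['p0', 'b', ['t3']]],
        'p1': [['p1', 'b', ['t4']], ['p1', 'b', ['t5']]],
    }
    assert abs(ComputePurity(demo) - 0.8) < 1e-9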
def populateClusterReps(all_global, wordVectorsDic, embedDim):
    dic_cluster_rep_words = {}
    dic_cluster_rep_vec = {}
    dic_tupple_class = groupItemsBySingleKeyIndex(all_global, 0)
    for predKey, items in dic_tupple_class.items():
        clus_words = []
        # Could filter some words using word entropy based on cluster distributions.
        for item in items:
            words = item[2]
            clus_words.extend(words)

        dic_word_counts = Counter(clus_words)
        wordCounts = dic_word_counts.values()
        mean = 0
        if len(wordCounts) >= 1:
            mean = statistics.mean(wordCounts)
        std = mean
        if len(wordCounts) >= 2:
            std = statistics.stdev(wordCounts)

        dic_word_counts_filtered = {}
        for key, counts in dic_word_counts.items():
            if counts > mean + std:
                dic_word_counts_filtered[key] = counts
        if len(dic_word_counts_filtered) <= 2:
            dic_word_counts_filtered = {}
            for key, counts in dic_word_counts.items():
                if counts > 1:
                    dic_word_counts_filtered[key] = counts
        # if len(dic_word_counts_filtered) <= 2:
        #     dic_word_counts_filtered = {}
        #     for key, counts in dic_word_counts.items():
        #         dic_word_counts_filtered[key] = counts

        clus_words = list(dic_word_counts_filtered.keys())
        clus_word_counts = list(dic_word_counts_filtered.values())
        cent_Vec_words = generate_sent_vecs_toktextdata([clus_words],
                                                        wordVectorsDic,
                                                        embedDim)[0]
        dic_cluster_rep_words[predKey] = [
            dic_word_counts_filtered,
            sum(clus_word_counts)
        ]
        dic_cluster_rep_vec[predKey] = cent_Vec_words
        # print(dic_cluster_rep_words[predKey])
        # print(dic_cluster_rep_vec[predKey])

    return [dic_cluster_rep_words, dic_cluster_rep_vec]
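# generate_sent_vecs_toktextdata is another repo-local helper not shown here.
# Its uses suggest it maps each token list to a fixed-size sentence vector; a
# minimal sketch, assuming a simple average of the available word vectors
# (the repo's real implementation may weight or normalize differently):
def generate_sent_vecs_toktextdata_sketch(tok_texts, wordVectorsDic, embedDim):
    vecs = []
    for tokens in tok_texts:
        word_vecs = [wordVectorsDic[w] for w in tokens if w in wordVectorsDic]
        if word_vecs:
            vecs.append(np.mean(np.array(word_vecs, dtype=float), axis=0))
        else:
            vecs.append(np.zeros(embedDim))  # no known words: zero vector
    return vecs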
def RenameTrueLabel(pred_true_texts, startTrueSeed, startIdSeed):
    lastTrueLabel = startTrueSeed
    lastId = startIdSeed
    renamed_pred_true_texts = []
    dic_group = groupItemsBySingleKeyIndex(pred_true_texts, 1)
    groupsLen = len(dic_group)
    # print(groupsLen)
    for trueLabel, items_pred_true_text in dic_group.items():
        lastTrueLabel = lastTrueLabel + 1
        for item_pred_true_text in items_pred_true_text:
            lastId = lastId + 1
            renamed_pred_true_texts.append([
                str(lastId).zfill(6),
                str(lastTrueLabel),
                item_pred_true_text[2]
            ])
    return [lastTrueLabel, lastId, renamed_pred_true_texts]
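# Example of chaining RenameTrueLabel across batches (hypothetical rows): the
# returned label/id seeds feed the next call so ids and true labels stay
# globally unique; ids come out zero-padded to six digits, e.g. '000001'.
def _demo_rename_true_label():
    batch1 = [['p0', 'x', ['w1']], ['p1', 'x', ['w2']], ['p0', 'y', ['w3']]]
    batch2 = [['p2', 'z', ['w4']]]
    lastTrue, lastId, renamed1 = RenameTrueLabel(batch1, 0, 0)
    lastTrue, lastId, renamed2 = RenameTrueLabel(batch2, lastTrue, lastId)
    return renamed1 + renamed2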
def clusterByWordEmbeddingIntelligent(list_pred_true_text_ind_prevind,
                                      wordVectorsDic):
    print("pred_mstreams")
    printClusterEvaluation_list(list_pred_true_text_ind_prevind)
    dic_itemGroups = groupItemsBySingleKeyIndex(
        list_pred_true_text_ind_prevind, 0)
    pred_clusters = int(len(dic_itemGroups) / 1.0)  # needs to be determined carefully
    dic_group_sizes = [
        len(dic_itemGroups[x]) for x in dic_itemGroups
        if isinstance(dic_itemGroups[x], list)
    ]
    print(dic_group_sizes)
    print("#clusters=" + str(pred_clusters))

    nparr = np.array(list_pred_true_text_ind_prevind)
    preds = list(nparr[:, 0])
    trues = list(nparr[:, 1])
    word_arr = list(nparr[:, 2])
    inds = list(nparr[:, 3])

    X = generate_sent_vecs_toktextdata(word_arr, wordVectorsDic, 300)
    # X = generate_sent_vecs_toktextdata_autoencoder(word_arr, wordVectorsDic, 300, pred_clusters)

    svd = TruncatedSVD(50)
    # svd = PCA(n_components=50)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    # X = X.toarray()
    X = lsa.fit_transform(X)

    ward = AgglomerativeClustering(n_clusters=pred_clusters,
                                   linkage='ward').fit(X)
    list_hr_pred_true_text = combine_pred_true_txt_from_list(
        ward.labels_, trues, word_arr)
    print("hr-ward")
    printClusterEvaluation_list(list_hr_pred_true_text)

    clustering = SpectralClustering(n_clusters=pred_clusters,
                                    assign_labels="discretize",
                                    random_state=0).fit(X)
    list_sp_pred_true_text = combine_pred_true_txt_from_list(
        clustering.labels_, trues, word_arr)
    print("spectral")
    printClusterEvaluation_list(list_sp_pred_true_text)
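# The TruncatedSVD + Normalizer pipeline above is the standard scikit-learn
# LSA recipe: reduce dimensionality, then L2-normalize rows so Euclidean
# distances behave like cosine distances for the downstream clustering.
# Shown in isolation on random vectors (illustrative only):
def _demo_lsa_reduction():
    rng = np.random.RandomState(0)
    X = rng.rand(20, 300)  # 20 documents, 300-dim sentence vectors
    lsa = make_pipeline(TruncatedSVD(5), Normalizer(copy=False))
    return lsa.fit_transform(X)  # rows are now 5-dim and unit length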
# A second variant of Evaluate_old (it shadows the earlier definition when
# both live in one module); this one also prints per-cluster size statistics.
def Evaluate_old(listtuple_pred_true_text, ignoreMinusOne=False):
    preds = []
    trues = []
    new_listtuple_pred_true_text = []
    totalwords = 0
    for pred_true_text in listtuple_pred_true_text:
        if str(pred_true_text[1]) == '-1' and ignoreMinusOne:
            continue
        preds.append(pred_true_text[0])
        trues.append(pred_true_text[1])
        new_listtuple_pred_true_text.append(
            [pred_true_text[0], pred_true_text[1], pred_true_text[2]])
        totalwords += len(pred_true_text[2])
        # print(pred_true_text[2], totalwords)

    print("evaluate total texts=" + str(len(new_listtuple_pred_true_text)))

    score = metrics.homogeneity_score(trues, preds)
    print("homogeneity_score-whole-data: %0.8f" % score)
    score = metrics.completeness_score(trues, preds)
    print("completeness_score-whole-data: %0.8f" % score)
    score = metrics.v_measure_score(trues, preds)
    print("v_measure_score-whole-data: %0.8f" % score)
    score = metrics.normalized_mutual_info_score(trues, preds,
                                                 average_method='arithmetic')
    print("nmi_score-whole-data: %0.8f" % score)
    # score = metrics.adjusted_mutual_info_score(trues, preds)
    # print("adjusted_mutual_info_score-whole-data: %0.4f" % score)
    # score = metrics.adjusted_rand_score(trues, preds)
    # print("adjusted_rand_score-whole-data: %0.4f" % score)

    dic_tupple_class = groupItemsBySingleKeyIndex(
        new_listtuple_pred_true_text, 0)  # before 0
    dic_tupple_class_true = groupItemsBySingleKeyIndex(
        new_listtuple_pred_true_text, 1)  # before 1
    print("pred clusters=" + str(len(dic_tupple_class)) + ", true clusters=" +
          str(len(dic_tupple_class_true)))
    ComputePurity(dic_tupple_class)

    print('wrong:avg words per text',
          totalwords / len(new_listtuple_pred_true_text),
          'totalwords', totalwords,
          '#texts', len(new_listtuple_pred_true_text))

    keysByLength = sorted(dic_tupple_class,
                          key=lambda key: len(dic_tupple_class[key]),
                          reverse=True)
    li = []
    for key in keysByLength:
        print('clusterid=', key, '#items', len(dic_tupple_class[key]))
        li.append(len(dic_tupple_class[key]))
    print('2nd:min', min(li), 'max', max(li), 'median', statistics.median(li),
          'avg', statistics.mean(li), 'std', statistics.stdev(li),
          'sum of li', sum(li))
def print_by_group(listtuple_pred_true_text, grIndex):
    dic_tupple_class = groupItemsBySingleKeyIndex(listtuple_pred_true_text,
                                                  grIndex)
    for label, pred_true_txts in sorted(dic_tupple_class.items()):
        Print_list_pred_true_text(pred_true_txts)
    print("total groups=", len(dic_tupple_class))
# from general_util import Print_list_pred_true_text

gloveFile = "/home/owner/PhD/dr.norbert/dataset/shorttext/glove.42B.300d/glove.42B.300d.txt"

listtuple_pred_true_text = ReadPredTrueText("result/batchId_PredTrueText1")

# Append [index, previous-index] columns to each [pred, true, text] row.
newList = []
for i, pred_true_text in enumerate(listtuple_pred_true_text):
    newList.append(pred_true_text + [i, i])
listtuple_pred_true_text = newList

listtuple_pred_true_text = RemoveHighClusterEntropyWordsIndex(
    listtuple_pred_true_text)
dic_tupple_class = groupItemsBySingleKeyIndex(listtuple_pred_true_text, 0)
# wordVectorsDic = extractAllWordVecs(gloveFile, 300)

for label, cluster_pred_true_txt_inds in dic_tupple_class.items():
    _components, newPred_OldPred_true_text_inds = clusterByConnectedComponentIndex(
        cluster_pred_true_txt_inds)
    # print(_components, newPred_OldPred_true_text_inds)
    dic_new_tupple_class = groupItemsBySingleKeyIndex(
        newPred_OldPred_true_text_inds, 0)
    for newLabel, cluster_newPred_OldPred_true_text_inds in dic_new_tupple_class.items():
        print("newLabel", newLabel)
        # print_by_group(cluster_newPred_OldPred_true_text_inds)
# Tail of a candidate-matching loop; its opening lines are not included here.
                break
        # if h_count > max_hit:
        #     break
    # if h_count > max_hit:
    #     break
    if cluscount > 1000:
        break
    if not found:
        print('not\t' + str(h_count) + '\t' + str(test_oCPost.soPostId) +
              '\t' + str(test_oCPost.tagWords) + '\t' +
              str(test_oCPost.trueLabel))

listtuple_pred_true_text = ReadPredTrueText(clusterWriterFile)
dic_tupple_class = groupItemsBySingleKeyIndex(listtuple_pred_true_text, 0)  # before 0
# print(dic_tupple_class)
dic_term_clusterIds, dic_cluster_ftrs, dic_cluster_size = createTermToClsuetrId(
    dic_tupple_class)

# ############ test
test_list_CPost = readStackOverflowDataSetTagTitleBody(testFile)
# print(test_list_CPost)

for oCPost in test_list_CPost:
    terms = oCPost.tagWords
    test_term_dict = Counter(terms)
    test_term_size = len(terms)