# Imports inferred from usage in this module. Helpers such as groupTxtByClass,
# ReadPredTrueText, WriteTextsOfEachGroup, Gen_WriteOutliersEachGroup,
# WriteTrainTestInstances, Print_list_pred_true_text,
# split_pred_true_txt_from_list, combine_pred_true_txt_from_list,
# change_pred_label, getScikitLearn_StopWords,
# processTextsRemoveStopWordTokenized, stem_text,
# generate_sent_vecs_toktextdata, and module-level settings (traintestFile,
# numberOfClusters, textsperlabelDir, trainFile, testFile, maxTrainRatio,
# minIntVal) are assumed to be defined elsewhere in the repo.
import collections
import math
import statistics

import numpy as np
import hdbscan
from sklearn import metrics
from sklearn.cluster import (DBSCAN, AgglomerativeClustering, Birch, KMeans,
                             SpectralClustering)
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import IsolationForest
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.mixture import GaussianMixture
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer


def DetectNonOutliers(listtuple_pred_true_text):
    printClusterEvaluation_list(listtuple_pred_true_text)
    dic_tupple_class = groupTxtByClass(listtuple_pred_true_text, False)
    print("true clusters=" +
          str(len(groupTxtByClass(listtuple_pred_true_text, True))))

    totalItems = 0
    itemsInClusterList = []
    for label, pred_true_txts in dic_tupple_class.items():
        itemsInCluster = len(pred_true_txts)
        totalItems += itemsInCluster
        itemsInClusterList.append(itemsInCluster)

    totalClusters = len(dic_tupple_class)
    avgItemsInCluster_in_a_batch = float(totalItems) / totalClusters
    std = np.std(itemsInClusterList)
    print("totalItems=" + str(totalItems) +
          ",avgItemsInCluster_in_a_batch=" + str(avgItemsInCluster_in_a_batch) +
          ",std=" + str(std))

    (non_outlier_pred_true_txts_in_all_clusters,
     outlier_pred_true_txts_in_all_clusters) = DetectNonOutliersByThreshold(
        dic_tupple_class, avgItemsInCluster_in_a_batch)
    print("total #outliers=" + str(len(outlier_pred_true_txts_in_all_clusters)))
    return [non_outlier_pred_true_txts_in_all_clusters,
            outlier_pred_true_txts_in_all_clusters,
            avgItemsInCluster_in_a_batch]
def DetectNonOutliersByThreshold(dic_tupple_class, avgItemsInCluster_in_a_batch):
    """Run an IsolationForest over every cluster larger than the batch
    average; clusters at or below the average are kept whole."""
    non_outlier_pred_true_txts_in_all_clusters = []
    outlier_pred_true_txts_in_all_clusters = []
    for label, pred_true_txts in dic_tupple_class.items():
        itemsInCluster = len(pred_true_txts)
        if itemsInCluster > avgItemsInCluster_in_a_batch:
            textsArr = [pred_true_txt[2] for pred_true_txt in pred_true_txts]
            vectorizer = TfidfVectorizer(max_df=1.0, min_df=1,
                                         stop_words='english', use_idf=True,
                                         smooth_idf=True, norm='l2')
            x_train = vectorizer.fit_transform(textsArr)
            contratio = 0.3
            # The transitional behaviour='new' flag was deprecated and later
            # removed from scikit-learn, so it is dropped here.
            isf = IsolationForest(n_estimators=100, max_samples='auto',
                                  contamination=contratio, max_features=1.0,
                                  bootstrap=True, verbose=0, random_state=0)
            outlierPreds = isf.fit(x_train).predict(x_train)
            for i, outlierPred in enumerate(outlierPreds):
                if outlierPred != -1:
                    non_outlier_pred_true_txts_in_all_clusters.append(
                        pred_true_txts[i])
                else:
                    outlier_pred_true_txts_in_all_clusters.append(
                        pred_true_txts[i])
        else:
            non_outlier_pred_true_txts_in_all_clusters.extend(pred_true_txts)

    printClusterEvaluation_list(non_outlier_pred_true_txts_in_all_clusters)
    print("true clusters=" + str(len(
        groupTxtByClass(non_outlier_pred_true_txts_in_all_clusters, True))))
    return [non_outlier_pred_true_txts_in_all_clusters,
            outlier_pred_true_txts_in_all_clusters]
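# Minimal usage sketch for the two functions above, on assumed toy data (not
# from the repo). Each item is a (predicted_label, true_label, text) tuple,
# the shape these functions expect; it runs only once the repo helpers such as
# groupTxtByClass are available.
def _demo_detect_non_outliers():
    batch = [
        ("0", "sports", "the team won the final match"),
        ("0", "sports", "a late goal decided the game"),
        ("0", "politics", "parliament passed the new budget"),
        ("1", "tech", "the phone ships with a faster chip"),
    ]
    # Cluster "0" (3 items) exceeds the batch average of 2, so it is screened
    # by the IsolationForest; cluster "1" is kept whole.
    non_outliers, outliers, avg_size = DetectNonOutliers(batch)
    print(len(non_outliers), len(outliers), avg_size)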
def GenerateTrainTest2_Percentage(percentTrainData):
    trainDataRatio = 1.0
    listtuple_pred_true_text = ReadPredTrueText(traintestFile)
    perct_tdata = percentTrainData / 100
    goodAmount_txts = int(perct_tdata *
                          (len(listtuple_pred_true_text) / numberOfClusters))
    dic_tupple_class = groupTxtByClass(listtuple_pred_true_text, False)

    # Write the texts of each group into textsperlabelDir, then generate
    # per-group outlier flags.
    WriteTextsOfEachGroup(textsperlabelDir, dic_tupple_class)
    dic_label_outliers = Gen_WriteOutliersEachGroup(textsperlabelDir,
                                                    numberOfClusters)

    train_pred_true_txts = []
    test_pred_true_txts = []

    # First pass: route flagged outliers to the test set and drop them from
    # their clusters.
    for label in dic_tupple_class:
        outlierpreds = dic_label_outliers[str(label)]
        pred_true_txts = dic_tupple_class[str(label)]
        if len(outlierpreds) != len(pred_true_txts):
            print("Size not match for=" + str(label))
        outLiers_pred_true_txt = []
        for count, outPred in enumerate(outlierpreds):
            if str(outPred) == "-1":
                outLiers_pred_true_txt.append(pred_true_txts[count])
        test_pred_true_txts.extend(outLiers_pred_true_txt)
        # remove outlier instances from pred_true_txts
        pred_true_txts_good = [e for e in pred_true_txts
                               if e not in outLiers_pred_true_txt]
        dic_tupple_class[str(label)] = pred_true_txts_good

    # Second pass: cap each cluster at goodAmount_txts training items; the
    # overflow goes to the test set.
    for label in dic_tupple_class:
        pred_true_txts = dic_tupple_class[str(label)]
        pred_true_txt_subs = []
        if len(pred_true_txts) > goodAmount_txts:
            pred_true_txt_subs.extend(pred_true_txts[0:goodAmount_txts])
            test_pred_true_txts.extend(pred_true_txts[goodAmount_txts:])
        else:
            pred_true_txt_subs.extend(pred_true_txts)
        train_pred_true_txts.extend(pred_true_txt_subs)

    trainDataRatio = len(train_pred_true_txts) / len(train_pred_true_txts +
                                                     test_pred_true_txts)
    if trainDataRatio <= maxTrainRatio:
        WriteTrainTestInstances(trainFile, train_pred_true_txts)
        WriteTrainTestInstances(testFile, test_pred_true_txts)
    return trainDataRatio
def generateTrainTestTxtsByOutliers(dic_tuple_class, dic_list_outliers_class1,
                                    maxItemsInEachClass):
    trainTup_pred_true_txt = []
    testTup_pred_true_txt = []
    for key, value in dic_tuple_class.items():
        if key not in dic_list_outliers_class1 or len(value) != len(
                dic_list_outliers_class1[key]):
            print("mismatch=" + key)
            continue
        rest_traintupTPTxt = []
        outliers = dic_list_outliers_class1[key]
        print("collections.Counter=" + str(collections.Counter(outliers)) +
              ", key=" + key + ", len(outliers)=" + str(len(outliers)) +
              ", len(value)=" + str(len(value)))
        for count, tup_pred_true_text in enumerate(value):
            if outliers[count] == -1:
                testTup_pred_true_txt.append(tup_pred_true_text)
            else:
                rest_traintupTPTxt.append(tup_pred_true_text)
        print("len(rest_traintupTPTxt)=" + str(len(rest_traintupTPTxt)) +
              ",maxItemsInEachClass=" + str(maxItemsInEachClass))
        if len(rest_traintupTPTxt) > maxItemsInEachClass:
            trainTup_pred_true_txt.extend(
                rest_traintupTPTxt[0:maxItemsInEachClass])
            testTup_pred_true_txt.extend(
                rest_traintupTPTxt[maxItemsInEachClass:])
        else:
            trainTup_pred_true_txt.extend(rest_traintupTPTxt)
    print("after remove outlier, max items=" + str(maxItemsInEachClass) +
          ", total=" + str(len(trainTup_pred_true_txt + testTup_pred_true_txt)))
    groupTxtByClass(trainTup_pred_true_txt + testTup_pred_true_txt, False)
    return [trainTup_pred_true_txt, testTup_pred_true_txt]
def Evaluate(listtuple_pred_true_text):
    print("evaluate total texts=" + str(len(listtuple_pred_true_text)))
    preds = []
    trues = []
    for pred_true_text in listtuple_pred_true_text:
        preds.append(pred_true_text[0])
        trues.append(pred_true_text[1])

    score = metrics.homogeneity_score(trues, preds)
    print("homogeneity_score-whole-data: %0.8f" % score)
    score = metrics.completeness_score(trues, preds)
    print("completeness_score-whole-data: %0.8f" % score)
    score = metrics.v_measure_score(trues, preds)
    print("v_measure_score-whole-data: %0.4f" % score)
    nmi_score = metrics.normalized_mutual_info_score(
        trues, preds, average_method='arithmetic')
    print("nmi_score-whole-data: %0.8f" % nmi_score)

    dic_tupple_class = groupTxtByClass(listtuple_pred_true_text, False)
    dic_tupple_class_true = groupTxtByClass(listtuple_pred_true_text, True)
    print("pred clusters=" + str(len(dic_tupple_class)) +
          ", true clusters=" + str(len(dic_tupple_class_true)))
    purity = ComputePurity(dic_tupple_class)
    return [purity, nmi_score]
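# Minimal sketch of calling Evaluate on assumed toy data: tuples are
# (predicted_label, true_label, text), matching the shape ReadPredTrueText
# produces elsewhere in the repo. Evaluate relies on groupTxtByClass and
# ComputePurity from this module.
def _demo_evaluate():
    toy = [
        ("0", "sports", "match report"),
        ("0", "sports", "final score"),
        ("1", "politics", "budget vote"),
    ]
    purity, nmi = Evaluate(toy)
    print(purity, nmi)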
def EvaluateByPurity(traintestFile):
    # NOTE: this module rebinds EvaluateByPurity several times below; only the
    # final definition is in effect once the module loads.
    listtuple_pred_true_text = ReadPredTrueText(traintestFile)
    preds = []
    trues = []
    for pred_true_text in listtuple_pred_true_text:
        preds.append(pred_true_text[0])
        trues.append(pred_true_text[1])
    # The original printed this as "purity_score", but the metric computed is
    # homogeneity; the label is corrected here.
    score = metrics.homogeneity_score(trues, preds)
    print("homogeneity_score-whole-data: %0.4f" % score)
    score = metrics.normalized_mutual_info_score(trues, preds)
    print("nmi_score-whole-data: %0.4f" % score)
    dic_tupple_class = groupTxtByClass(listtuple_pred_true_text, False)
    ComputePurity(dic_tupple_class)
def removeOutlierConnectedComponentLexicalByItem(listtuple, batchDocs,
                                                 maxPredLabel):
    # Stub: intended to split each cluster into lexical connected components
    # (e.g. via clusterByConnectedComponentByItem) and separate outliers from
    # non-outliers; the body is not implemented yet.
    outliers = []
    non_outliers = []
    avgItemsInCluster = 0
    dic_tupple_class = groupTxtByClass(listtuple, False)
    for label, items in dic_tupple_class.items():
        print("to do")
    return [outliers, non_outliers, avgItemsInCluster, maxPredLabel]
def printClusterEvaluation_list(listtuple_pred_true_text):
    preds = []
    trues = []
    for pred_true_text in listtuple_pred_true_text:
        preds.append(pred_true_text[0])
        trues.append(pred_true_text[1])
    score = metrics.homogeneity_score(trues, preds)
    print("homogeneity_score-whole-data: %0.4f" % score)
    score = metrics.normalized_mutual_info_score(trues, preds,
                                                 average_method='arithmetic')
    print("nmi_score-whole-data: %0.4f" % score)
    dic_tupple_class = groupTxtByClass(listtuple_pred_true_text, False)
    ComputePurity(dic_tupple_class)
def EvaluateByPurity(traintestFile):  # redefinition: shadows the version above
    listtuple_pred_true_text = ReadPredTrueText(traintestFile)
    preds = []
    trues = []
    for pred_true_text in listtuple_pred_true_text:
        preds.append(pred_true_text[0])
        trues.append(pred_true_text[1])
    score = metrics.normalized_mutual_info_score(trues, preds,
                                                 average_method='arithmetic')
    dic_tupple_class = groupTxtByClass(listtuple_pred_true_text, False)
    acc = ComputePurity(dic_tupple_class)
    print("acc", acc, "nmi", score)
def comPrehensive_GenerateTrainTestTxtsByOutliersTfIDf_varoutlier(
        listtuple_pred_true_text, maxItemsInEachClass, avgItemPercluster):
    trainTup_pred_true_txt = []
    testTup_pred_true_txt = []
    dic_tuple_class = groupTxtByClass(listtuple_pred_true_text, False)
    dic_list_outliers_class = {}
    for key, value in dic_tuple_class.items():
        txt_datas = [tup_pred_true_text[2] for tup_pred_true_text in value]
        # Contamination scales with how much larger the cluster is than the
        # batch average, capped at 0.4 (IsolationForest requires <= 0.5).
        outlierratio = len(value) / avgItemPercluster * 0.3
        print("outlierratio=" + str(outlierratio))
        if outlierratio > 0.4:
            outlierratio = 0.4
        contratio = outlierratio
        print(len(txt_datas))
        vectorizer = TfidfVectorizer(max_df=1.0, min_df=1,
                                     stop_words='english', use_idf=True,
                                     smooth_idf=True, norm='l2')
        x_train = vectorizer.fit_transform(txt_datas)
        isf = IsolationForest(n_estimators=100, max_samples='auto',
                              contamination=contratio, max_features=1.0,
                              bootstrap=True, verbose=0, random_state=0)
        outlierPreds = isf.fit(x_train).predict(x_train)
        print(len(outlierPreds))
        dic_list_outliers_class[key] = outlierPreds

    trainTup_pred_true_txt, testTup_pred_true_txt = generateTrainTestTxtsByOutliers(
        dic_tuple_class, dic_list_outliers_class, maxItemsInEachClass)
    print("#trainTup_pred_true_txt=" + str(len(trainTup_pred_true_txt)) +
          ", #testTup_pred_true_txt=" + str(len(testTup_pred_true_txt)))
    return [trainTup_pred_true_txt, testTup_pred_true_txt]
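# Worked example of the variable-contamination rule above, with assumed sizes:
# in a batch averaging 40 items per cluster, a 60-item cluster gets
# 60 / 40 * 0.3 = 0.45, which the cap lowers to 0.4; a 20-item cluster gets
# 20 / 40 * 0.3 = 0.15 and is left uncapped.
def _demo_var_contamination():
    avgItemPercluster = 40
    for size in (60, 20):
        ratio = min(size / avgItemPercluster * 0.3, 0.4)
        print(size, ratio)  # 60 -> 0.4 (capped), 20 -> 0.15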
def EvaluateByPurity(traintestFile):  # redefinition: shadows the versions above
    listtuple_pred_true_text = ReadPredTrueText(traintestFile)
    preds = []
    trues = []
    for pred_true_text in listtuple_pred_true_text:
        preds.append(pred_true_text[0])
        trues.append(pred_true_text[1])
    score = metrics.homogeneity_score(trues, preds)
    print("homogeneity_score-whole-data: %0.4f" % score)
    score = metrics.normalized_mutual_info_score(trues, preds)
    print("nmi_score-whole-data: %0.4f" % score)
    completeness = metrics.completeness_score(trues, preds)
    print("completeness=" + str(completeness))
    dic_tupple_class = groupTxtByClass(listtuple_pred_true_text, False)
    return ComputePurity(dic_tupple_class)
def outlierBySmallestGroups(newPred_OldPred_true_texts):
    """Regroup items by their new predicted label; keep the largest group as
    non-outliers and flag every smaller group as outliers. Rows are
    [new_pred, old_pred, true_label, text]; returned items drop new_pred."""
    outliersInCluster = []
    non_outliersInCluster = []
    rows = len(newPred_OldPred_true_texts)
    np_arr = np.array(newPred_OldPred_true_texts)
    # Keep columns [new_pred, true_label, text] for grouping.
    newPred_true_texts = np.concatenate(
        (np_arr[:, 0].reshape(rows, 1), np_arr[:, 2:4]), axis=1).tolist()
    dic_tupple_class = groupTxtByClass(newPred_true_texts, False)

    maxGroupSize = -10
    newPredMaxLabel = ""
    for label, pred_true_txts in dic_tupple_class.items():
        groupSize = len(pred_true_txts)
        if maxGroupSize < groupSize:
            newPredMaxLabel = label
            maxGroupSize = groupSize

    for newPredLabel, oldPredLabel, trueLabel, text in newPred_OldPred_true_texts:
        OldPred_true_text = [oldPredLabel, trueLabel, text]
        if str(newPredLabel) == str(newPredMaxLabel):
            non_outliersInCluster.append(OldPred_true_text)
        else:
            outliersInCluster.append(OldPred_true_text)
    return [outliersInCluster, non_outliersInCluster]
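# Minimal sketch of the expected input for outlierBySmallestGroups, on assumed
# data, and assuming groupTxtByClass groups on the label in slot 0. Group "a"
# is largest, so its rows come back as non-outliers (with new_pred stripped).
def _demo_outlier_by_smallest_groups():
    rows = [
        ["a", "0", "sports", "match report"],
        ["a", "0", "sports", "final score"],
        ["b", "0", "politics", "budget vote"],
    ]
    outliers, non_outliers = outlierBySmallestGroups(rows)
    print(len(outliers), len(non_outliers))  # expect 1 and 2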
def ComputePurity(dic_tupple_class):
    # NOTE: ComputePurity is redefined twice below; the final definition is
    # the one bound when the module loads.
    totalItems = 0
    maxGroupSizeSum = 0
    for label, pred_true_txts in dic_tupple_class.items():
        totalItems += len(pred_true_txts)
        dic_tupple_class_originalLabel = groupTxtByClass(pred_true_txts, True)
        maxMemInGroupSize = minIntVal
        maxMemOriginalLabel = ""
        for orgLabel, org_pred_true_txts in dic_tupple_class_originalLabel.items():
            if maxMemInGroupSize < len(org_pred_true_txts):
                maxMemInGroupSize = len(org_pred_true_txts)
                maxMemOriginalLabel = orgLabel
        maxGroupSizeSum += maxMemInGroupSize
    acc = maxGroupSizeSum / totalItems
    return acc
def ComputePurity(dic_tupple_class):  # redefinition: shadows the version above
    totalItems = 0
    maxGroupSizeSum = 0
    for label, pred_true_txts in dic_tupple_class.items():
        totalItems += len(pred_true_txts)
        dic_tupple_class_originalLabel = groupTxtByClass(pred_true_txts, True)
        maxMemInGroupSize = minIntVal
        maxMemOriginalLabel = ""
        for orgLabel, org_pred_true_txts in dic_tupple_class_originalLabel.items():
            if maxMemInGroupSize < len(org_pred_true_txts):
                maxMemInGroupSize = len(org_pred_true_txts)
                maxMemOriginalLabel = orgLabel
        maxGroupSizeSum += maxMemInGroupSize
    purity = maxGroupSizeSum / totalItems
    print("purity majority whole data=" + str(purity))
    return purity
def extrcatLargeClusterItems(listtuple):  # (sic) name kept as called elsewhere
    """Split clusters into those large enough to re-cluster (size >= mean +
    1.2 * stdev of cluster sizes) and the rest."""
    dic_tupple_class = groupTxtByClass(listtuple, False)
    itemCounts = []
    items_to_cluster = []
    items_to_not_cluster = []
    for label, tuples in dic_tupple_class.items():
        itemCounts.append(len(tuples))
    std = statistics.stdev(itemCounts)
    # Bug fix: the original assigned statistics.stdev here too; the threshold
    # below needs the mean.
    mean = statistics.mean(itemCounts)
    for label, tuples in dic_tupple_class.items():
        no_items = len(tuples)
        if no_items >= mean + 1.2 * std:
            items_to_cluster.extend(tuples)
        else:
            items_to_not_cluster.extend(tuples)
    return [items_to_cluster, items_to_not_cluster]
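# Worked example of the mean + 1.2*std cutoff above, with assumed counts:
# cluster sizes [4, 5, 6, 25] have mean 10 and sample stdev ~10.03, so the
# cutoff is ~22.04 and only the 25-item cluster is routed to re-clustering.
def _demo_large_cluster_cutoff():
    counts = [4, 5, 6, 25]
    cutoff = statistics.mean(counts) + 1.2 * statistics.stdev(counts)
    print([c for c in counts if c >= cutoff])  # [25]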
def ComputePurity(dic_tupple_class):
    # Final redefinition: this is the ComputePurity actually in effect.
    # Purity = (sum over predicted clusters of the majority true-label count)
    # divided by the total number of items.
    totalItems = 0
    maxGroupSizeSum = 0
    for label, pred_true_txts in dic_tupple_class.items():
        totalItems += len(pred_true_txts)
        dic_tupple_class_originalLabel = groupTxtByClass(pred_true_txts, True)
        maxMemInGroupSize = -1000000
        maxMemOriginalLabel = ""
        for orgLabel, org_pred_true_txts in dic_tupple_class_originalLabel.items():
            if maxMemInGroupSize < len(org_pred_true_txts):
                maxMemInGroupSize = len(org_pred_true_txts)
                maxMemOriginalLabel = orgLabel
        maxGroupSizeSum += maxMemInGroupSize
    purity = maxGroupSizeSum / float(totalItems)
    print("purity majority whole data=" + str(purity))
    return purity
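# Worked purity example, matching the formula above, with assumed toy counts:
# two predicted clusters whose true-label multisets are {A: 2, B: 1} and
# {B: 2}; the majority counts are 2 and 2, so purity = (2 + 2) / 5 = 0.8.
def _demo_purity_arithmetic():
    majority_counts = [2, 2]  # max true-label count per predicted cluster
    total_items = 5
    print(sum(majority_counts) / float(total_items))  # 0.8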
def EvaluateByPurity(traintestFile):  # final redefinition: this binding wins
    listtuple_pred_true_text = ReadPredTrueText(traintestFile)
    dic_tupple_class = groupTxtByClass(listtuple_pred_true_text, False)
    ComputePurity(dic_tupple_class)
def ClusterByHDbScan(listtuple_pred_true_text, avgItemsInCluster_in_a_batch):
    print("\nClusterByHDbScan")
    printClusterEvaluation_list(listtuple_pred_true_text)
    print(len(listtuple_pred_true_text), avgItemsInCluster_in_a_batch)
    numberOfClusters_predicted = len(groupTxtByClass(listtuple_pred_true_text,
                                                     False))
    numberOfClusters_true = len(groupTxtByClass(listtuple_pred_true_text, True))
    print("numberOfClusters_true=" + str(numberOfClusters_true) +
          ", numberOfClusters_predicted=" + str(numberOfClusters_predicted))

    train_data = []
    train_predlabels = []
    train_trueLabels = []
    for pred_true_text in listtuple_pred_true_text:
        train_predlabels.append(pred_true_text[0])
        train_trueLabels.append(pred_true_text[1])
        train_data.append(pred_true_text[2])

    vectorizer = TfidfVectorizer(max_df=1.0, min_df=1, stop_words='english',
                                 use_idf=True, smooth_idf=True, norm='l2')
    X = vectorizer.fit_transform(train_data)

    svd = TruncatedSVD(2)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X_svd = lsa.fit_transform(X)

    min_cluster_size_in_a_batch = int(math.ceil(avgItemsInCluster_in_a_batch))
    min_cluster_size_in_a_batch = 2  # hard-coded override of the batch average

    def report(header, labels):
        # labels.max() is the highest cluster label (noise is -1), not a count.
        print(header + str(labels.max()))
        print("Homogeneity: %0.4f" %
              metrics.homogeneity_score(train_trueLabels, labels))
        print("Completeness: %0.4f" %
              metrics.completeness_score(train_trueLabels, labels))
        print("V-measure: %0.4f" %
              metrics.v_measure_score(train_trueLabels, labels))
        print("Adjusted Rand-Index: %.4f" %
              metrics.adjusted_rand_score(train_trueLabels, labels))
        print("nmi_score-whole-data: %0.4f" %
              metrics.normalized_mutual_info_score(
                  train_trueLabels, labels, average_method='arithmetic'))

    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size_in_a_batch)
    clusterer.fit(X)
    report("X-total-clusters=", clusterer.labels_)

    clusterer_svd = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size_in_a_batch)
    clusterer_svd.fit(X_svd)
    report("X-svd-total-clusters=", clusterer_svd.labels_)

    db = DBSCAN().fit(X_svd)
    report("X-svd-dbscan-total-clusters=", db.labels_)
def clusteringDCT(pred_true_txt_ind_prevPreds, wordVectorsDic, batchDocs,
                  maxPredLabel):
    print("#m-stream-cleaned")
    Evaluate(pred_true_txt_ind_prevPreds)
    (pred_true_text_ind_prevPreds_to_cluster,
     pred_true_text_ind_prevPreds_to_not_cluster) = extrcatLargeClusterItems(
        pred_true_txt_ind_prevPreds)
    print("3rd=" + str(pred_true_text_ind_prevPreds_to_cluster[0][3]))
    print("4th=" + str(pred_true_text_ind_prevPreds_to_cluster[0][4]))

    pred_clusters = len(
        groupTxtByClass(pred_true_text_ind_prevPreds_to_cluster, False))
    non_pred_clusters = len(
        groupTxtByClass(pred_true_text_ind_prevPreds_to_not_cluster, False))
    print("#clusters=" + str(pred_clusters))
    print("#not clusters=" + str(non_pred_clusters))
    print("this clustering with embedding DCT")
    # Re-clustering target: the number of set-aside clusters minus the number
    # of large clusters being re-clustered.
    pred_clusters = non_pred_clusters - pred_clusters
    print("#update clusters=" + str(pred_clusters))

    nparr = np.array(pred_true_text_ind_prevPreds_to_cluster)
    preds = list(nparr[:, 0])
    trues = list(nparr[:, 1])
    texts = list(nparr[:, 2])
    inds = list(nparr[:, 3])
    prevPreds = list(nparr[:, 4])

    skStopWords = getScikitLearn_StopWords()
    texts = processTextsRemoveStopWordTokenized(texts, skStopWords)

    # Average word-embedding sentence vectors (300-d).
    X = generate_sent_vecs_toktextdata(texts, wordVectorsDic, 300)

    ward = AgglomerativeClustering(n_clusters=pred_clusters,
                                   linkage='ward').fit(X)
    list_hr_pred_true_text_ind_prevPred = np.column_stack(
        (ward.labels_, trues, texts, inds, prevPreds)).tolist()
    print("#hr-ward-AVG")
    pred_true_text_ind_prevPreds_to_not_cluster_hr = change_pred_label(
        pred_true_text_ind_prevPreds_to_not_cluster, pred_clusters + 1)
    Evaluate(list_hr_pred_true_text_ind_prevPred)
    Evaluate(list_hr_pred_true_text_ind_prevPred +
             pred_true_text_ind_prevPreds_to_not_cluster_hr)

    print("#spectral-avg")
    clustering = SpectralClustering(n_clusters=pred_clusters,
                                    assign_labels="discretize",
                                    random_state=0).fit(X)
    list_sp_pred_true_text_ind_prevPred = np.column_stack(
        (clustering.labels_, trues, texts, inds, prevPreds)).tolist()
    pred_true_text_ind_prevPreds_to_not_cluster_spec = change_pred_label(
        pred_true_text_ind_prevPreds_to_not_cluster, pred_clusters + 1)
    Evaluate(list_sp_pred_true_text_ind_prevPred)
    Evaluate(list_sp_pred_true_text_ind_prevPred +
             pred_true_text_ind_prevPreds_to_not_cluster_spec)
def ObtainNumberOfClusters(isByTrueLabel, listtuple_pred_true_text):
    dic_tupple_class = groupTxtByClass(listtuple_pred_true_text, isByTrueLabel)
    return len(dic_tupple_class)
def print_by_group(listtuple_pred_true_text):
    dic_tupple_class = groupTxtByClass(listtuple_pred_true_text, False)
    for label, pred_true_txts in sorted(dic_tupple_class.items()):
        Print_list_pred_true_text(pred_true_txts)
def clusterByTfIdfFeature(list_pred_true_text):
    print("pred_mstreams")
    printClusterEvaluation_list(list_pred_true_text)
    dic_tupple_class = groupTxtByClass(list_pred_true_text, False)
    pred_clusters = len(dic_tupple_class)
    print("pred_clusters for k-means=" + str(pred_clusters))

    preds, trues, texts = split_pred_true_txt_from_list(list_pred_true_text)
    skStopWords = getScikitLearn_StopWords()
    texts = processTextsRemoveStopWordTokenized(texts, skStopWords)

    vectorizer = TfidfVectorizer(tokenizer=stem_text, max_df=0.5, min_df=2)
    X = vectorizer.fit_transform(texts)
    svd = TruncatedSVD(100)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(X)

    km = KMeans(n_clusters=pred_clusters, init='k-means++', max_iter=100,
                random_state=0)
    km.fit(X)
    list_km_pred_true_text = combine_pred_true_txt_from_list(km.labels_,
                                                             trues, texts)
    print("k-means")
    printClusterEvaluation_list(list_km_pred_true_text)

    ward = AgglomerativeClustering(n_clusters=pred_clusters,
                                   linkage='ward').fit(X)
    list_hr_pred_true_text = combine_pred_true_txt_from_list(ward.labels_,
                                                             trues, texts)
    print("hr-ward")
    printClusterEvaluation_list(list_hr_pred_true_text)

    clustering = SpectralClustering(n_clusters=pred_clusters,
                                    assign_labels="discretize",
                                    random_state=0).fit(X)
    list_sp_pred_true_text = combine_pred_true_txt_from_list(
        clustering.labels_, trues, texts)
    print("spectral")
    printClusterEvaluation_list(list_sp_pred_true_text)

    brc = Birch(branching_factor=50, n_clusters=pred_clusters, threshold=0.5,
                compute_labels=True)
    brc.fit_predict(X)
    list_brc_pred_true_text = combine_pred_true_txt_from_list(brc.labels_,
                                                              trues, texts)
    print("brc")
    printClusterEvaluation_list(list_brc_pred_true_text)

    gmm = GaussianMixture(n_components=pred_clusters, covariance_type='full')
    gmm_labels = gmm.fit_predict(X)
    list_gmm_pred_true_text = combine_pred_true_txt_from_list(gmm_labels,
                                                              trues, texts)
    print("gmm")
    printClusterEvaluation_list(list_gmm_pred_true_text)
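# Standalone sketch of the vectorize -> LSA -> cluster pipeline used above,
# with assumed toy data and small dimensions (the full function additionally
# depends on repo helpers such as stem_text and
# combine_pred_true_txt_from_list, which are not needed here).
def _demo_tfidf_kmeans():
    texts = [
        "the team won the match", "a late goal won the game",
        "parliament passed the budget", "the senate debated the budget",
    ]
    X = TfidfVectorizer(stop_words='english').fit_transform(texts)
    # Reduce to 2 LSA components and L2-normalize, mirroring the pipeline.
    X = make_pipeline(TruncatedSVD(2), Normalizer(copy=False)).fit_transform(X)
    km = KMeans(n_clusters=2, init='k-means++', max_iter=100,
                random_state=0).fit(X)
    print(km.labels_)  # typically splits sports texts from politics texts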