Example #1
import statistics

from sklearn import metrics

# groupItemsBySingleKeyIndex and ComputePurity are project-local helpers.
def Evaluate_old(listtuple_pred_true_text, ignoreMinusOne=False):
    preds = []
    trues = []

    new_listtuple_pred_true_text = []

    totalwords = 0

    for pred_true_text in listtuple_pred_true_text:
        if ignoreMinusOne and str(pred_true_text[1]) == '-1':
            continue

        preds.append(pred_true_text[0])
        trues.append(pred_true_text[1])
        new_listtuple_pred_true_text.append(
            [pred_true_text[0], pred_true_text[1], pred_true_text[2]])

        totalwords += len(pred_true_text[2])
        # print(pred_true_text[2], totalwords)

    print("evaluate total texts=" + str(len(new_listtuple_pred_true_text)))

    score = metrics.homogeneity_score(trues, preds)
    print("homogeneity_score-whole-data:   %0.8f" % score)

    score = metrics.completeness_score(trues, preds)
    print("completeness_score-whole-data:   %0.8f" % score)

    score = metrics.v_measure_score(trues, preds)
    print("v_measure_score-whole-data:   %0.8f" % score)

    score = metrics.normalized_mutual_info_score(trues,
                                                 preds,
                                                 average_method='arithmetic')
    print("nmi_score-whole-data:   %0.8f" % score)

    # score=metrics.adjusted_mutual_info_score(trues, preds)
    # print ("adjusted_mutual_info_score-whole-data:   %0.4f" % score)

    # score=metrics.adjusted_rand_score(trues, preds)
    # print ("adjusted_rand_score-whole-data:   %0.4f" % score)

    dic_tupple_class = groupItemsBySingleKeyIndex(new_listtuple_pred_true_text,
                                                  0)  # before 0
    dic_tupple_class_true = groupItemsBySingleKeyIndex(
        new_listtuple_pred_true_text, 1)  # before 1
    print("pred clusters=" + str(len(dic_tupple_class)) + ", true clusters=" +
          str(len(dic_tupple_class_true)))
    ComputePurity(dic_tupple_class)
    li = [
        len(dic_tupple_class_true[x]) for x in dic_tupple_class_true
        if isinstance(dic_tupple_class_true[x], list)
    ]
    print('min', min(li), 'max', max(li), 'median', statistics.median(li),
          'avg', statistics.mean(li), 'std', statistics.stdev(li), 'sum of li',
          sum(li))
    print('avg words per text', totalwords / len(new_listtuple_pred_true_text),
          'totalwords', totalwords, '#texts',
          len(new_listtuple_pred_true_text))
    # print("---Pred distribution")
Example #2
from sklearn import metrics

# groupItemsBySingleKeyIndex and ComputePurity are project-local helpers.
def Evaluate(listtuple_pred_true_text):
    print("evaluate total texts=" + str(len(listtuple_pred_true_text)))
    preds = []
    trues = []
    for pred_true_text in listtuple_pred_true_text:
        preds.append(pred_true_text[0])
        trues.append(pred_true_text[1])

    score = metrics.homogeneity_score(trues, preds)
    print("homogeneity_score-whole-data:   %0.8f" % score)

    score = metrics.completeness_score(trues, preds)
    print("completeness_score-whole-data:   %0.8f" % score)

    #score=metrics.v_measure_score(trues, preds)
    #print ("v_measure_score-whole-data:   %0.4f" % score)

    score = metrics.normalized_mutual_info_score(trues,
                                                 preds,
                                                 average_method='arithmetic')
    print("nmi_score-whole-data:   %0.8f" % score)

    #score=metrics.adjusted_mutual_info_score(trues, preds)
    #print ("adjusted_mutual_info_score-whole-data:   %0.4f" % score)

    #score=metrics.adjusted_rand_score(trues, preds)
    #print ("adjusted_rand_score-whole-data:   %0.4f" % score)

    dic_tupple_class = groupItemsBySingleKeyIndex(listtuple_pred_true_text, 0)
    dic_tupple_class_true = groupItemsBySingleKeyIndex(
        listtuple_pred_true_text, 1)
    print("pred clusters=" + str(len(dic_tupple_class)) + ", true clusters=" +
          str(len(dic_tupple_class_true)))
    ComputePurity(dic_tupple_class)
    # print("---Pred distribution")
Example #3
def ComputePurity(dic_tupple_class, groupByIndex=1):
    # Purity: for each predicted cluster, take the size of its largest
    # same-true-label group; purity is the sum of those sizes over all items.
    totalItems = 0
    maxGroupSizeSum = 0
    for label, pred_true_txts in dic_tupple_class.items():
        totalItems = totalItems + len(pred_true_txts)
        # print("pred label="+label+", #texts="+str(len(pred_true_txts)))
        dic_tupple_class_originalLabel = groupItemsBySingleKeyIndex(
            pred_true_txts, groupByIndex)
        maxMemInGroupSize = -1000000
        maxMemOriginalLabel = ""
        for orgLabel, org_pred_true_txts in dic_tupple_class_originalLabel.items():
            # print("orgLabel label="+orgLabel+", #texts="+str(len(org_pred_true_txts)))
            if maxMemInGroupSize < len(org_pred_true_txts):
                maxMemInGroupSize = len(org_pred_true_txts)
                maxMemOriginalLabel = orgLabel

        # print("\n")
        # print(str(label)+" purity="+str(maxMemInGroupSize/len(pred_true_txts))+", items="+str(len(pred_true_txts))+", max match#="+str(maxMemInGroupSize))
        # print_by_group(pred_true_txts)
        maxGroupSizeSum = maxGroupSizeSum + maxMemInGroupSize

    purity = maxGroupSizeSum / float(totalItems)
    print("acc majority whole data=" + str(purity))
    return purity
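
# A minimal usage sketch (assumptions: the toy data below, plus a simple
# stand-in for the project-local groupItemsBySingleKeyIndex helper; the
# original helper may differ in details such as key stringification).
from collections import defaultdict

def groupItemsBySingleKeyIndex(items, keyIndex):
    # Stand-in: group items by the value found at position keyIndex.
    groups = defaultdict(list)
    for item in items:
        groups[str(item[keyIndex])].append(item)
    return dict(groups)

# Toy items shaped as [predicted label, true label, tokenized text].
toy_items = [
    ["0", "a", ["x"]], ["0", "a", ["y"]], ["0", "b", ["z"]],
    ["1", "b", ["w"]], ["1", "b", ["v"]],
]
ComputePurity(groupItemsBySingleKeyIndex(toy_items, 0))
# Majority matches: 2 of 3 items in cluster "0", 2 of 2 in cluster "1",
# so purity = (2 + 2) / 5 = 0.8.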
Example #4
import statistics
from collections import Counter

# groupItemsBySingleKeyIndex and generate_sent_vecs_toktextdata are
# project-local helpers.
def populateClusterReps(all_global, wordVectorsDic, embedDim):
    dic_cluster_rep_words = {}
    dic_cluster_rep_vec = {}
    dic_tupple_class = groupItemsBySingleKeyIndex(all_global, 0)
    for predKey, items in dic_tupple_class.items():
        clus_words = []
        # Some words could also be filtered using word entropy over the
        # cluster distributions.
        for item in items:
            words = item[2]
            clus_words.extend(words)

        dic_word_counts = Counter(clus_words)
        wordCounts = dic_word_counts.values()
        # Frequency threshold: mean + std of this cluster's word counts.
        mean = 0
        if len(wordCounts) >= 1:
            mean = statistics.mean(wordCounts)
        std = mean  # fallback when stdev is undefined (fewer than 2 counts)
        if len(wordCounts) >= 2:
            std = statistics.stdev(wordCounts)
        dic_word_counts_filtered = {}

        for key, counts in dic_word_counts.items():
            if counts > mean + std:
                dic_word_counts_filtered[key] = counts

        if len(dic_word_counts_filtered) <= 2:
            # Fall back: keep every word that occurs more than once.
            dic_word_counts_filtered = {}
            for key, counts in dic_word_counts.items():
                if counts > 1:
                    dic_word_counts_filtered[key] = counts
            #if len(dic_word_counts_filtered)<=2:
            #  dic_word_counts_filtered={}
            #  for key, counts in dic_word_counts.items():
            #    dic_word_counts_filtered[key]=counts

        clus_words = list(dic_word_counts_filtered.keys())
        clus_word_counts = list(dic_word_counts_filtered.values())
        cent_Vec_words = generate_sent_vecs_toktextdata([clus_words],
                                                        wordVectorsDic,
                                                        embedDim)[0]

        dic_cluster_rep_words[predKey] = [
            dic_word_counts_filtered,
            sum(clus_word_counts)
        ]
        dic_cluster_rep_vec[predKey] = cent_Vec_words

        #print(dic_cluster_rep_words[predKey])
        #print(dic_cluster_rep_vec[predKey])

    return [dic_cluster_rep_words, dic_cluster_rep_vec]
Example #5
def RenameTrueLabel(pred_true_texts, startTrueSeed, startIdSeed):
    # Reissue consecutive true labels (and zero-padded sequential ids)
    # starting from the given seeds; the texts are carried over unchanged.
    lastTrueLabel = startTrueSeed
    lastId = startIdSeed
    renamed_pred_true_texts = []
    dic_group = groupItemsBySingleKeyIndex(pred_true_texts, 1)
    groupsLen = len(dic_group)
    # print(groupsLen)
    for trueLabel, items_pred_true_text in dic_group.items():
        lastTrueLabel = lastTrueLabel + 1
        for item_pred_true_text in items_pred_true_text:
            lastId = lastId + 1
            renamed_pred_true_texts.append(
                [str(lastId).zfill(6), str(lastTrueLabel), item_pred_true_text[2]])

    return [lastTrueLabel, lastId, renamed_pred_true_texts]
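
# A minimal usage sketch for RenameTrueLabel (assumptions: toy data, and the
# groupItemsBySingleKeyIndex stand-in shown under Example #3).
items = [["0", "a", ["x"]], ["1", "a", ["y"]], ["0", "b", ["z"]]]
lastTrue, lastId, renamed = RenameTrueLabel(items, 10, 0)
# With the stand-in above, renamed ==
#   [['000001', '11', ['x']], ['000002', '11', ['y']], ['000003', '12', ['z']]]
# and lastTrue == 12, lastId == 3.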
Example #6
import numpy as np
from sklearn.cluster import AgglomerativeClustering, SpectralClustering
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

# printClusterEvaluation_list, groupItemsBySingleKeyIndex,
# generate_sent_vecs_toktextdata, and combine_pred_true_txt_from_list are
# project-local helpers.
def clusterByWordEmbeddingIntelligent(list_pred_true_text_ind_prevind,
                                      wordVectorsDic):
    print("pred_mstreams")
    printClusterEvaluation_list(list_pred_true_text_ind_prevind)
    dic_itemGroups = groupItemsBySingleKeyIndex(
        list_pred_true_text_ind_prevind, 0)

    pred_clusters = int(len(dic_itemGroups) / 1.0)  # needs to be determined carefully

    dic_group_sizes = [
        len(dic_itemGroups[x]) for x in dic_itemGroups
        if isinstance(dic_itemGroups[x], list)
    ]
    print(dic_group_sizes)

    print("#clusters=" + str(pred_clusters))

    nparr = np.array(list_pred_true_text_ind_prevind)
    preds = list(nparr[:, 0])
    trues = list(nparr[:, 1])
    word_arr = list(nparr[:, 2])
    inds = list(nparr[:, 3])
    X = generate_sent_vecs_toktextdata(word_arr, wordVectorsDic, 300)
    #X=generate_sent_vecs_toktextdata_autoencoder(word_arr, wordVectorsDic, 300, pred_clusters)

    # Reduce to 50 dimensions and L2-normalize (an LSA-style pipeline);
    # unit-length vectors make Euclidean distances track cosine similarity.
    svd = TruncatedSVD(50)
    #svd = PCA(n_components=50)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    #X=X.toarray()
    X = lsa.fit_transform(X)

    ward = AgglomerativeClustering(n_clusters=pred_clusters,
                                   linkage='ward').fit(X)
    list_hr_pred_true_text = combine_pred_true_txt_from_list(
        ward.labels_, trues, word_arr)
    print("hr-ward")
    printClusterEvaluation_list(list_hr_pred_true_text)

    clustering = SpectralClustering(n_clusters=pred_clusters,
                                    assign_labels="discretize",
                                    random_state=0).fit(X)
    list_sp_pred_true_text = combine_pred_true_txt_from_list(
        clustering.labels_, trues, word_arr)
    print("spectral")
    printClusterEvaluation_list(list_sp_pred_true_text)
Example #7
import statistics

from sklearn import metrics

# groupItemsBySingleKeyIndex and ComputePurity are project-local helpers.
def Evaluate_old(listtuple_pred_true_text, ignoreMinusOne=False):
    preds = []
    trues = []

    new_listtuple_pred_true_text = []

    totalwords = 0

    for pred_true_text in listtuple_pred_true_text:
        if str(pred_true_text[1]) == '-1' and ignoreMinusOne:
            continue

        preds.append(pred_true_text[0])
        trues.append(pred_true_text[1])
        new_listtuple_pred_true_text.append(
            [pred_true_text[0], pred_true_text[1], pred_true_text[2]])

        totalwords += len(pred_true_text[2])
        # print(pred_true_text[2], totalwords)

    print("evaluate total texts=" + str(len(new_listtuple_pred_true_text)))

    score = metrics.homogeneity_score(trues, preds)
    print("homogeneity_score-whole-data:   %0.8f" % score)

    score = metrics.completeness_score(trues, preds)
    print("completeness_score-whole-data:   %0.8f" % score)

    score = metrics.v_measure_score(trues, preds)
    print("v_measure_score-whole-data:   %0.8f" % score)

    score = metrics.normalized_mutual_info_score(trues,
                                                 preds,
                                                 average_method='arithmetic')
    print("nmi_score-whole-data:   %0.8f" % score)

    # score=metrics.adjusted_mutual_info_score(trues, preds)
    # print ("adjusted_mutual_info_score-whole-data:   %0.4f" % score)

    # score=metrics.adjusted_rand_score(trues, preds)
    # print ("adjusted_rand_score-whole-data:   %0.4f" % score)

    dic_tupple_class = groupItemsBySingleKeyIndex(new_listtuple_pred_true_text,
                                                  0)  # before 0
    dic_tupple_class_true = groupItemsBySingleKeyIndex(
        new_listtuple_pred_true_text, 1)  # before 1
    print("pred clusters=" + str(len(dic_tupple_class)) + ", true clusters=" +
          str(len(dic_tupple_class_true)))
    ComputePurity(dic_tupple_class)

    print('wrong:avg words per text',
          totalwords / len(new_listtuple_pred_true_text), 'totalwords',
          totalwords, '#texts', len(new_listtuple_pred_true_text))

    keysByLength = sorted(dic_tupple_class,
                          key=lambda key: len(dic_tupple_class[key]),
                          reverse=True)
    li = []
    for key in keysByLength:
        print('clusterid=', key, '#items', len(dic_tupple_class[key]))
        li.append(len(dic_tupple_class[key]))
    print('2nd:min', min(li), 'max', max(li), 'median', statistics.median(li),
          'avg', statistics.mean(li), 'std', statistics.stdev(li), 'sum of li',
          sum(li))
Example #8
def print_by_group(listtuple_pred_true_text, grIndex):
    dic_tupple_class = groupItemsBySingleKeyIndex(listtuple_pred_true_text, grIndex)
    for label, pred_true_txts in sorted(dic_tupple_class.items()):
        Print_list_pred_true_text(pred_true_txts)
    print("total groups=", len(dic_tupple_class))
Example #9
#from general_util import Print_list_pred_true_text

gloveFile = "/home/owner/PhD/dr.norbert/dataset/shorttext/glove.42B.300d/glove.42B.300d.txt"

listtuple_pred_true_text = ReadPredTrueText("result/batchId_PredTrueText1")
newList = []
i = -1
for pred_true_text in listtuple_pred_true_text:
    i = i + 1
    newList.append(pred_true_text + [i, i])  # append the row index as ind and prevind
listtuple_pred_true_text = newList

listtuple_pred_true_text = RemoveHighClusterEntropyWordsIndex(
    listtuple_pred_true_text)

dic_tupple_class = groupItemsBySingleKeyIndex(listtuple_pred_true_text, 0)

#wordVectorsDic = extractAllWordVecs(gloveFile, 300)

for label, cluster_pred_true_txt_inds in dic_tupple_class.items():
    _components, newPred_OldPred_true_text_inds = clusterByConnectedComponentIndex(
        cluster_pred_true_txt_inds)
    #print(_components, newPred_OldPred_true_text_inds)
    dic_new_tupple_class = groupItemsBySingleKeyIndex(
        newPred_OldPred_true_text_inds, 0)

    for newLabel, cluster_newPred_OldPred_true_text_inds in dic_new_tupple_class.items():
        print("newLabel", newLabel)
        #print_by_group(cluster_newPred_OldPred_true_text_inds)
Example #10
                break
            # if h_count > max_hit:
            #    break

        # if h_count > max_hit:
        #    break

        if cluscount > 1000:
            break

    if not found:
        print('not\t' + str(h_count) + '\t' + str(test_oCPost.soPostId) + '\t' +
              str(test_oCPost.tagWords) + '\t' + str(test_oCPost.trueLabel))


listtuple_pred_true_text = ReadPredTrueText(clusterWriterFile)
dic_tupple_class = groupItemsBySingleKeyIndex(listtuple_pred_true_text, 0)  # before 0
# print(dic_tupple_class)
dic_term_clusterIds, dic_cluster_ftrs, dic_cluster_size = createTermToClsuetrId(dic_tupple_class)





############# test
test_list_CPost = readStackOverflowDataSetTagTitleBody(testFile)
# print(test_list_CPost)
for oCPost in test_list_CPost:
    terms = oCPost.tagWords
    test_term_dict = Counter(terms)
    test_term_size = len(terms)