def mergeByCommonWords(dic_biGram_to_textInds, dic_triGram_to_textInds, dic_bitri_keys_selectedClusters_seenBatch, minCommomGram, t_minSize, t_maxSize, b_minSize, b_maxSize):
    """Merge tri-gram and bi-gram clusters into the running batch cluster dict.

    Each n-gram key is matched against the existing cluster keys with
    findCloseCluster_GramKey_lexical (at least 2 shared words). On a match the
    text indices are unioned into the matched cluster; otherwise the n-gram
    key starts a new cluster. A merged cluster that is still at or below the
    minimum size is deleted again.

    Tri-grams are folded in first (threshold t_minSize), then bi-grams
    (threshold b_minSize) against the already-extended key set.

    NOTE(review): minCommomGram, t_maxSize and b_maxSize are unused here;
    kept for interface compatibility with callers.

    Returns dic_bitri_keys_selectedClusters_seenBatch, which is mutated
    in place.
    """
    _merge_gram_dict_into(dic_triGram_to_textInds,
                          dic_bitri_keys_selectedClusters_seenBatch, t_minSize)
    _merge_gram_dict_into(dic_biGram_to_textInds,
                          dic_bitri_keys_selectedClusters_seenBatch, b_minSize)
    return dic_bitri_keys_selectedClusters_seenBatch


def _merge_gram_dict_into(gram_to_textInds, big_dic, min_size):
    """Fold one {ngram key -> text indices} dict into big_dic (mutated in place)."""
    # Live view: reflects keys added by this very loop, so later grams can
    # merge into clusters created earlier in the same pass.
    keys_list = big_dic.keys()
    for key, txtInds in gram_to_textInds.items():
        txtInds = list(set(txtInds))
        if len(txtInds) <= min_size:
            continue
        word_arr = key.split(' ')
        closeKey = findCloseCluster_GramKey_lexical(keys_list, word_arr, 2)
        if closeKey is None:
            # No lexically close cluster exists: start a new one under this key.
            big_dic[key] = txtInds
        else:
            # Union into the close cluster; drop it if still too small.
            big_dic[closeKey] = list(set(big_dic[closeKey] + txtInds))
            if len(big_dic[closeKey]) <= min_size:
                del big_dic[closeKey]
def assignToClusterBySimilarity(not_clustered_inds_seen_batch, seen_list_pred_true_words_index, dic_combined_keys_selectedClusters, wordVectorsDic):
    """Assign not-yet-clustered texts to existing clusters.

    For each text index, a lexical key match (>= 2 shared words) is tried
    first; on failure a semantic match via word vectors is used. Texts that
    match neither are skipped entirely.

    Returns [dic_preds, assigned_records] where dic_preds maps
    cluster key -> list of assigned text indices, and assigned_records holds
    [clusterKey, true_label, words, index] entries for each assigned text.
    """
    assigned_records = []
    dic_preds = {}
    count = 0
    keys_list = list(dic_combined_keys_selectedClusters.keys())
    for txtInd in not_clustered_inds_seen_batch:
        seen_item = seen_list_pred_true_words_index[txtInd]
        word_arr = seen_item[2]
        # NOTE(review): the semantic key is computed even when the lexical
        # match succeeds, to keep any helper-side behavior identical.
        closeKey_Lexical = findCloseCluster_GramKey_lexical(keys_list, word_arr, 2)
        closeKey_Semantic = findCloseCluster_GramKey_Semantic(keys_list, word_arr, 1, wordVectorsDic)
        closeKey = closeKey_Lexical if closeKey_Lexical is not None else closeKey_Semantic
        if closeKey is not None:
            dic_preds.setdefault(closeKey, []).append(txtInd)
            count += 1
            assigned_records.append([closeKey, seen_item[1], seen_item[2], seen_item[3]])
    # Total number of distinct texts currently held across all clusters.
    total_dic_items = sum(len(set(v)) for v in dic_combined_keys_selectedClusters.values() if isinstance(v, list))
    print("batch-eval: asign count " + str(count) + "," + str(len(not_clustered_inds_seen_batch)) + ", total_dic_items," + str(total_dic_items))
    return [dic_preds, assigned_records]
def assignToMergedClusters(list_pred_true_words_index, not_clustered_inds, dicMerged_keys_selectedClusters, minMatch, seen_list_pred_true_words_index):
    """Assign unclustered texts to merged clusters by lexical key match,
    then print a purity summary over all clusters.

    Each txtInd whose words share >= minMatch words with a merged-cluster key
    is appended to that cluster (dicMerged_keys_selectedClusters is mutated in
    place); the rest are collected into new_not_clustered_inds.

    NOTE(review): list_pred_true_words_index is unused (superseded by
    seen_list_pred_true_words_index); kept for interface compatibility.

    Returns [dicMerged_keys_selectedClusters, new_not_clustered_inds].
    """
    new_not_clustered_inds = []
    keys_list = list(dicMerged_keys_selectedClusters.keys())
    for txtInd in not_clustered_inds:
        item = seen_list_pred_true_words_index[txtInd]
        word_arr = item[2]
        closeKey_Lexical = findCloseCluster_GramKey_lexical(keys_list, word_arr, minMatch)
        if closeKey_Lexical is None:
            new_not_clustered_inds.append(txtInd)
        else:
            print("closeKey_Lexical", closeKey_Lexical + ",", seen_list_pred_true_words_index[txtInd])
            new_list = dicMerged_keys_selectedClusters[closeKey_Lexical]
            new_list.append(txtInd)
            dicMerged_keys_selectedClusters[closeKey_Lexical] = new_list
            for lid in new_list:
                print("new_list,", seen_list_pred_true_words_index[lid])
    # Purity summary: per cluster, count the majority true label.
    texts_clustered_sum = 0
    max_group_sum = 0
    for mergedKey, txtInds in dicMerged_keys_selectedClusters.items():
        texts_clustered_sum += len(txtInds)
        true_label_list = [seen_list_pred_true_words_index[txtInd][1] for txtInd in txtInds]
        # default=0 guards an empty cluster entry (was a ValueError from max()).
        max_group_sum += max(Counter(true_label_list).values(), default=0)
    # Guard division by zero when nothing is clustered (was a ZeroDivisionError).
    purity = max_group_sum / texts_clustered_sum if texts_clustered_sum else 0.0
    print("\nnew_not_clustered_inds", len(new_not_clustered_inds), max_group_sum,
          texts_clustered_sum, purity,
          "old_not_clustered_inds", len(not_clustered_inds))
    return [dicMerged_keys_selectedClusters, new_not_clustered_inds]