from collections import Counter

def mergeByCommonWords(dic_biGram_to_textInds, dic_triGram_to_textInds, dic_bitri_keys_selectedClusters_seenBatch, minCommomGram, t_minSize, t_maxSize, b_minSize, b_maxSize):
  #merge tri-gram clusters first, then bi-gram clusters, into the existing
  #cluster dictionary: a gram cluster is merged into the lexically closest
  #existing key (at least 2 common words); if no close key exists it is added
  #as a new cluster. minCommomGram, t_maxSize and b_maxSize are accepted for
  #interface compatibility but are not used here.

  #keys() is a live view, so keys added below become merge candidates for
  #later gram keys
  keys_list=dic_bitri_keys_selectedClusters_seenBatch.keys()

  for gram_dic, minSize in [(dic_triGram_to_textInds, t_minSize), (dic_biGram_to_textInds, b_minSize)]:
    for key, txtInds in gram_dic.items():
      txtInds=list(set(txtInds))
      if len(txtInds)<=minSize:
        continue

      word_arr=key.split(' ')
      closeKey_Lexical=findCloseCluster_GramKey_lexical(keys_list,word_arr,2)
      if closeKey_Lexical is None:
        #no close cluster: add the gram key as a new cluster
        closeKey_Lexical=key
        dic_bitri_keys_selectedClusters_seenBatch[key]=txtInds
      else:
        #close cluster found: merge the text indices into it
        dic_bitri_keys_selectedClusters_seenBatch[closeKey_Lexical]=list(set(dic_bitri_keys_selectedClusters_seenBatch[closeKey_Lexical]+txtInds))

      #drop the cluster if it is still too small after merging
      if len(dic_bitri_keys_selectedClusters_seenBatch[closeKey_Lexical])<=minSize:
        del dic_bitri_keys_selectedClusters_seenBatch[closeKey_Lexical]

  return dic_bitri_keys_selectedClusters_seenBatch
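
#hypothetical usage sketch for mergeByCommonWords: the sample keys and text
#indices below are illustrative only, and findCloseCluster_GramKey_lexical is
#assumed to be defined elsewhere in this module
def _example_mergeByCommonWords():
  tri_clusters={"deep learning model": [0, 1, 2, 3]}
  bi_clusters={"learning model": [3, 4, 5]}
  seen_clusters={"neural network model": [6, 7, 8]}
  #"deep learning model" shares no 2 words with the seen key, so it is added
  #as a new cluster; "learning model" then shares 2 words with it and merges
  merged=mergeByCommonWords(bi_clusters, tri_clusters, seen_clusters,
                            minCommomGram=2, t_minSize=2, t_maxSize=50,
                            b_minSize=2, b_maxSize=50)
  print(merged)
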
def assignToClusterBySimilarity(not_clustered_inds_seen_batch, seen_list_pred_true_words_index, dic_combined_keys_selectedClusters, wordVectorsDic):
  #assign each remaining unclustered text to the closest existing cluster,
  #preferring a lexical key match and falling back to a semantic match
  new_not_clustered_inds_seen_batch=[] #rows [assignedKey, trueLabel, words, index] of the assigned texts
  dic_preds={}
  count=0
  keys_list=list(dic_combined_keys_selectedClusters.keys())
  for txtInd in not_clustered_inds_seen_batch:
    seen_item=seen_list_pred_true_words_index[txtInd]
    word_arr=seen_item[2]
    closeKey_Lexical=findCloseCluster_GramKey_lexical(keys_list,word_arr,2)
    if closeKey_Lexical is not None:
      dic_preds.setdefault(closeKey_Lexical,[]).append(txtInd)
      count+=1
      new_not_clustered_inds_seen_batch.append([closeKey_Lexical, seen_item[1], seen_item[2], seen_item[3]])
    else:
      #only fall back to the semantic match when no lexical match exists
      closeKey_Semantic=findCloseCluster_GramKey_Semantic(keys_list,word_arr,1, wordVectorsDic)
      if closeKey_Semantic is not None:
        dic_preds.setdefault(closeKey_Semantic,[]).append(txtInd)
        count+=1
        new_not_clustered_inds_seen_batch.append([closeKey_Semantic, seen_item[1], seen_item[2], seen_item[3]])

  total_dic_items=sum([len(set(dic_combined_keys_selectedClusters[x])) for x in dic_combined_keys_selectedClusters if isinstance(dic_combined_keys_selectedClusters[x], list)])
  print("batch-eval: assign count "+str(count)+","+str(len(not_clustered_inds_seen_batch))+", total_dic_items,"+str(total_dic_items))
  return [dic_preds, new_not_clustered_inds_seen_batch]
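
#hypothetical usage sketch for assignToClusterBySimilarity: the row layout
#[pred, trueLabel, words, index] and the sample data are illustrative
#assumptions; findCloseCluster_GramKey_lexical/_Semantic and a word-vector
#dictionary are assumed to exist elsewhere in this module
def _example_assignToClusterBySimilarity(wordVectorsDic):
  seen_rows=[
    [None, "sports", ["football", "match", "score"], 0],
    [None, "tech", ["new", "phone", "release"], 1]]
  clusters={"football match result": [5, 6], "phone release date": [7, 8]}
  dic_preds, assigned_rows=assignToClusterBySimilarity([0, 1], seen_rows, clusters, wordVectorsDic)
  print(dic_preds)
  print(assigned_rows)
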
def assignToMergedClusters(list_pred_true_words_index, not_clustered_inds,
                           dicMerged_keys_selectedClusters, minMatch,
                           seen_list_pred_true_words_index):
    #attach each unclustered text to a merged cluster whose key shares at least
    #minMatch words with the text; txtInds index seen_list_pred_true_words_index
    new_not_clustered_inds = []
    keys_list = list(dicMerged_keys_selectedClusters.keys())
    for txtInd in not_clustered_inds:
        item = seen_list_pred_true_words_index[txtInd]
        word_arr = item[2]
        closeKey_Lexical = findCloseCluster_GramKey_lexical(
            keys_list, word_arr, minMatch)
        if closeKey_Lexical is None:
            new_not_clustered_inds.append(txtInd)
        else:
            #print("list before close key", dicMerged_keys_selectedClusters[closeKey_Lexical])
            '''print("closeKey_Lexical", closeKey_Lexical+",",
	  list_pred_true_words_index[txtInd])'''
            print("closeKey_Lexical", closeKey_Lexical + ",",
                  seen_list_pred_true_words_index[txtInd])
            new_list = dicMerged_keys_selectedClusters[closeKey_Lexical]
            new_list.append(txtInd)
            dicMerged_keys_selectedClusters[closeKey_Lexical] = new_list
            for lid in new_list:
                '''print("new_list,", list_pred_true_words_index[lid])'''
                print("new_list,", seen_list_pred_true_words_index[lid])
            #print("list after close key", dicMerged_keys_selectedClusters[closeKey_Lexical])

    #cluster purity check: compare the majority-label count per cluster
    #against the total number of clustered texts
    texts_clustered_sum = 0
    max_group_sum = 0
    for mergedKey, txtInds in dicMerged_keys_selectedClusters.items():
        texts_clustered_sum += len(txtInds)
        true_label_list = []
        for txtInd in txtInds:
            true_label_list.append(seen_list_pred_true_words_index[txtInd][1])

        max_group_sum += max(Counter(true_label_list).values())

    print("\nnew_not_clustered_inds", len(new_not_clustered_inds),
          max_group_sum, texts_clustered_sum,
          max_group_sum / texts_clustered_sum, "old_not_clustered_inds",
          len(not_clustered_inds))
    return [dicMerged_keys_selectedClusters, new_not_clustered_inds]
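
#hypothetical usage sketch for assignToMergedClusters: the sample rows and the
#merged cluster below are illustrative assumptions; with minMatch=2 a text is
#attached to a cluster key sharing at least two words with its word list
def _example_assignToMergedClusters():
    seen_rows = [
        [None, "sports", ["football", "match", "score"], 0],
        [None, "sports", ["football", "match", "goal"], 1],
        [None, "tech", ["quantum", "chip", "design"], 2]]
    merged_clusters = {"football match result": [0]}
    merged, still_unclustered = assignToMergedClusters(
        seen_rows, [1, 2], merged_clusters, 2, seen_rows)
    print(merged)             #text 1 joins "football match result"
    print(still_unclustered)  #text 2 has no matching key and stays unclustered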