def findCloseCluster_GramKey_Semantic(keys_list, word_arr, minMatch, wordVectorsDic, euclidean=True):
  # Among the keys that share at least minMatch words with word_arr, return the key
  # whose sentence vector is most cosine-similar to word_arr's sentence vector.
  closeKey_Semantic=None
  sent_vec=generate_sent_vecs_toktextdata([word_arr], wordVectorsDic, 300)[0]
  max_sim=0
  for key in keys_list:
    key_words=key.split(' ')
    common=set(key_words).intersection(set(word_arr))
    key_vec=generate_sent_vecs_toktextdata([key_words], wordVectorsDic, 300)[0]
    #if euclidean==True:
    #  eu_dist=distance.euclidean(sent_vec, key_vec)
    eu_dist=cosine(sent_vec, key_vec)  # cosine distance, so similarity = 1 - distance
    sim=1-eu_dist
    if len(common)>=minMatch and max_sim<sim:
      max_sim=sim
      closeKey_Semantic=key

  return [closeKey_Semantic, max_sim]
  
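A minimal, self-contained sketch of the same selection rule, with hand-made 3-dimensional vectors standing in for the averaged word embeddings that generate_sent_vecs_toktextdata would normally return (the names below are illustrative only):

import numpy as np
from scipy.spatial.distance import cosine

def closest_key(query_words, query_vec, key_vectors, min_match):
    # keep the key with the highest cosine similarity among the keys that
    # share at least min_match words with the query
    best_key, best_sim = None, 0.0
    for key, vec in key_vectors.items():
        common = set(key.split()) & set(query_words)
        sim = 1 - cosine(query_vec, vec)
        if len(common) >= min_match and sim > best_sim:
            best_key, best_sim = key, sim
    return best_key, best_sim

key_vectors = {"python pandas": np.array([0.9, 0.1, 0.0]),
               "java spring": np.array([0.1, 0.8, 0.1])}
print(closest_key(["pandas", "dataframe"], np.array([0.8, 0.2, 0.0]), key_vectors, 1))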
Example No. 2
def buildNGramIndex(list_pred_true_words_index_postid_createtime):
  # Index each text under its consecutive n-grams and, when semantic vectors are
  # enabled, keep a running (summed) centre vector per n-gram.
  for item in list_pred_true_words_index_postid_createtime:
    words=item[2]
    txtId=item[3]
    #print('process index for', item)

    text_Vec=None

    if isSemantic==True:
      if txtId in dic_txtId__vec:
        text_Vec=dic_txtId__vec[txtId]
      else:
        X=generate_sent_vecs_toktextdata([words], wordVectorsDic, embedDim)
        text_Vec=X[0]

      dic_txtId__vec[txtId]=text_Vec

    dic_txtId__text[txtId]=item

    grams=generateGramsConsucetive(words, min_gram, max_gram)  # or up to len(words)

    for gram in grams:
      dic_ngram__txtIds.setdefault(gram, []).append(txtId)

      if isSemantic==True:
        if gram in dic_ngram__center:
          dic_ngram__center[gram]=list( map(add, dic_ngram__center[gram], text_Vec) )
        else:
          dic_ngram__center[gram]=text_Vec
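A toy version of the same indexing pattern: every consecutive n-gram is mapped to the ids of the texts that contain it, and a running summed centre is kept per n-gram. generate_grams below is a simple stand-in for generateGramsConsucetive, and the 2-dimensional vectors are made up:

from operator import add

def generate_grams(words, min_gram, max_gram):
    return [' '.join(words[i:i + n])
            for n in range(min_gram, max_gram + 1)
            for i in range(len(words) - n + 1)]

index, centers = {}, {}
docs = {1: (["python", "pandas"], [0.9, 0.1]),
        2: (["python", "numpy"], [0.7, 0.3])}
for txt_id, (words, vec) in docs.items():
    for gram in generate_grams(words, 1, 2):
        index.setdefault(gram, []).append(txt_id)
        centers[gram] = list(map(add, centers[gram], vec)) if gram in centers else vec
print(index["python"], centers["python"])  # [1, 2] and roughly [1.6, 0.4]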
Example No. 3
def populateClusterReps(all_global, wordVectorsDic, embedDim):
    dic_cluster_rep_words = {}
    dic_cluster_rep_vec = {}
    dic_tupple_class = groupItemsBySingleKeyIndex(all_global, 0)
    for predKey, items in dic_tupple_class.items():
        clus_words = []
        #can filter some words using word entropy based on clus distributions.
        for item in items:
            words = item[2]
            clus_words.extend(words)

        dic_word_counts = Counter(clus_words)
        wordCounts = dic_word_counts.values()
        mean = 0
        if len(wordCounts) >= 1:
            mean = statistics.mean(wordCounts)
        std = mean
        if len(wordCounts) >= 2:
            std = statistics.stdev(wordCounts)
        dic_word_counts_filtered = {}

        for key, counts in dic_word_counts.items():
            if counts > mean + std:
                dic_word_counts_filtered[key] = counts

        if len(dic_word_counts_filtered) <= 2:
            dic_word_counts_filtered = {}
            for key, counts in dic_word_counts.items():
                if counts > 1:
                    dic_word_counts_filtered[key] = counts
            #if len(dic_word_counts_filtered)<=2:
            #  dic_word_counts_filtered={}
            #  for key, counts in dic_word_counts.items():
            #    dic_word_counts_filtered[key]=counts

        clus_words = list(dic_word_counts_filtered.keys())
        clus_word_counts = list(dic_word_counts_filtered.values())
        cent_Vec_words = generate_sent_vecs_toktextdata([clus_words],
                                                        wordVectorsDic,
                                                        embedDim)[0]

        dic_cluster_rep_words[predKey] = [
            dic_word_counts_filtered,
            sum(clus_word_counts)
        ]
        dic_cluster_rep_vec[predKey] = cent_Vec_words

        #print(dic_cluster_rep_words[predKey])
        #print(dic_cluster_rep_vec[predKey])

    return [dic_cluster_rep_words, dic_cluster_rep_vec]
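The filtering step above keeps words whose in-cluster counts are unusually high (above mean + stdev), with a softer fallback when too few survive; the rule in isolation, on toy data:

from collections import Counter
import statistics

def representative_words(clus_words):
    counts = Counter(clus_words)
    values = list(counts.values())
    mean = statistics.mean(values) if values else 0
    std = statistics.stdev(values) if len(values) >= 2 else mean
    kept = {w: c for w, c in counts.items() if c > mean + std}
    if len(kept) <= 2:
        kept = {w: c for w, c in counts.items() if c > 1}
    return kept

print(representative_words(["nlp"] * 6 + ["python"] * 2 + ["misc", "noise"]))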
Example No. 4
def populateClusterVecs(dic_nonCommon__txtIds_Clust, dic_txtId__text):
    dic_clusteVecs = {}

    for gramKey, txtIds in dic_nonCommon__txtIds_Clust.items():
        data = []
        for txtId in txtIds:
            item = dic_txtId__text[txtId]
            words = item[2]
            X = generate_sent_vecs_toktextdata([words], wordVectorsDic,
                                               embedDim)
            text_Vec = X[0]
            data.append(text_Vec)
        avg = [sum(col) / float(len(col)) for col in zip(*data)]
        dic_clusteVecs[gramKey] = avg

    return dic_clusteVecs
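The cluster vector computed above is a plain column-wise mean over the member texts' vectors; the zip(*...) idiom on its own:

data = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]
avg = [sum(col) / float(len(col)) for col in zip(*data)]
print(avg)  # [3.0, 4.0]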
def clusterByWordEmbeddingIntelligent(list_pred_true_text_ind_prevind,
                                      wordVectorsDic):
    print("pred_mstreams")
    printClusterEvaluation_list(list_pred_true_text_ind_prevind)
    dic_itemGroups = groupItemsBySingleKeyIndex(
        list_pred_true_text_ind_prevind, 0)

    pred_clusters = int(len(dic_itemGroups) /
                        1.0)  #needs to be determined carefully

    dic_group_sizes = [
        len(dic_itemGroups[x]) for x in dic_itemGroups
        if isinstance(dic_itemGroups[x], list)
    ]
    print(dic_group_sizes)

    print("#clusters=" + str(pred_clusters))

    nparr = np.array(list_pred_true_text_ind_prevind)
    preds = list(nparr[:, 0])
    trues = list(nparr[:, 1])
    word_arr = list(nparr[:, 2])
    inds = list(nparr[:, 3])
    X = generate_sent_vecs_toktextdata(word_arr, wordVectorsDic, 300)
    #X=generate_sent_vecs_toktextdata_autoencoder(word_arr, wordVectorsDic, 300, pred_clusters)

    svd = TruncatedSVD(50)
    #svd = PCA(n_components=50)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    #X=X.toarray()
    X = lsa.fit_transform(X)

    ward = AgglomerativeClustering(n_clusters=pred_clusters,
                                   linkage='ward').fit(X)
    list_hr_pred_true_text = combine_pred_true_txt_from_list(
        ward.labels_, trues, word_arr)
    print("hr-ward")
    printClusterEvaluation_list(list_hr_pred_true_text)

    clustering = SpectralClustering(n_clusters=pred_clusters,
                                    assign_labels="discretize",
                                    random_state=0).fit(X)
    list_sp_pred_true_text = combine_pred_true_txt_from_list(
        clustering.labels_, trues, word_arr)
    print("spectral")
    printClusterEvaluation_list(list_sp_pred_true_text)
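A self-contained sketch of the same reduce-then-cluster pipeline, with random vectors standing in for the 300-dimensional sentence embeddings; the component counts and cluster count are illustrative only:

import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.cluster import AgglomerativeClustering, SpectralClustering

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 0.05, (10, 20)), rng.normal(1, 0.05, (10, 20))])

lsa = make_pipeline(TruncatedSVD(5), Normalizer(copy=False))
X_lsa = lsa.fit_transform(X)

ward = AgglomerativeClustering(n_clusters=2, linkage='ward').fit(X_lsa)
spectral = SpectralClustering(n_clusters=2, assign_labels="discretize",
                              random_state=0).fit(X_lsa)
print(ward.labels_, spectral.labels_)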
def assignToClusterSimDistribution(not_clustered_inds_batch, dic_bitri_keys_selectedClusters_seenBatch, seen_list_pred_true_words_index, wordVectorsDic):
  
  new_not_clustered_inds_batch=[]
  
  ##follow Mstream
  dic_ClusterGroupsDetail={}  
  dic_ClusterWords={}
  dic_ClusterTextWords={}
  dic_ClusterVecs={}
  
  for key, txtInds in dic_bitri_keys_selectedClusters_seenBatch.items():
    list_pred_true_words_index=[]
    cluster_words=[]
    txtWords=[]
    vec=np.zeros(shape=[300])
    for txtInd in txtInds:
      pred= seen_list_pred_true_words_index[txtInd][0]
      true= seen_list_pred_true_words_index[txtInd][1]
      words= seen_list_pred_true_words_index[txtInd][2]
      index= seen_list_pred_true_words_index[txtInd][3]	
      list_pred_true_words_index.append([pred, true, words, index])
      cluster_words.extend(words)
      txtWords.append(words)
      sent_vec=generate_sent_vecs_toktextdata([words], wordVectorsDic, 300)[0]
      sent_vec=np.asarray(sent_vec)
      vec=np.add(vec, sent_vec)

    dic_ClusterGroupsDetail[key]=list_pred_true_words_index
    dic_ClusterWords[key]=[Counter(cluster_words), len(cluster_words)]
    dic_ClusterTextWords[key]=txtWords
    dic_ClusterVecs[key]=vec # np.true_divide(vec, len(txtInds)+1)
    #print("dic_ClusterVecs[key]", dic_ClusterVecs[key])

  ##end follow Mstream

  ####our logic starts
  keys_list=list(dic_bitri_keys_selectedClusters_seenBatch.keys())
  
  #new_clusters={}

  for item in not_clustered_inds_batch:
    word_arr=item[2]
    global_index=item[3]
    true=item[1]  

    dic_lex_Sim_CommonWords, maxPredLabel_lex, maxSim_lex, maxCommon_lex, minSim_lex=commonWordSims_clusterGroup(word_arr, dic_ClusterWords)
		
    text_Vec=generate_sent_vecs_toktextdata([word_arr], wordVectorsDic, 300)[0]		
    dic_semanticSims, maxPredLabel_Semantic, maxSim_Semantic, minSim_semantic=semanticSims(text_Vec, dic_ClusterVecs)		
        
    if maxCommon_lex>0 and str(maxPredLabel_lex)==str(maxPredLabel_Semantic):
      new_pred=str(maxPredLabel_lex)
      new_not_clustered_inds_batch.append([new_pred, true, word_arr, global_index])
      '''if len(new_pred.split(' '))==1 and new_pred.isnumeric()==True:
        #print("new_pred.isnumeric=", new_pred)
        dic_ClusterVecs[new_pred]= np.add(dic_ClusterVecs[new_pred], np.asarray(text_Vec))
        count_dic=dic_ClusterWords[new_pred][0]
        totalwords_dic=dic_ClusterWords[new_pred][1] 
        dic_ClusterWords[new_pred]=[count_dic+Counter(word_arr), totalwords_dic+len(word_arr)]'''	  
    '''else:
      new_key=str(len(dic_ClusterVecs)+10)
      new_pred=new_key
      new_not_clustered_inds_batch.append([new_pred, true, word_arr, global_index])
      dic_ClusterVecs[new_pred]=np.asarray(text_Vec)
      dic_ClusterWords[new_pred]=[Counter(word_arr), len(word_arr)]	  
      #print("new_pred=", new_pred)'''	  
	  
 	
    '''closeKey_Lexical=findCloseCluster_GramKey_lexical(keys_list,word_arr,1)
    closeKey_Semantic, max_Semantic_sim_gram=findCloseCluster_GramKey_Semantic(keys_list,word_arr,0, wordVectorsDic, False)
    if closeKey_Lexical==closeKey_Semantic:
      new_pred=str(closeKey_Lexical)
      new_not_clustered_inds_batch.append([new_pred, true, word_arr, global_index])
    else:
      closeKey_Lexical=findCloseCluster_GramKey_lexical(keys_list,word_arr,2)
      if closeKey_Lexical != None:
        new_pred=str(closeKey_Lexical)
        new_not_clustered_inds_batch.append([new_pred, true, word_arr, global_index])
      #elif max_Semantic_sim_gram>=0.8:
      #  new_pred=str(closeKey_Lexical)
      #  new_not_clustered_inds_batch.append([new_pred, true, word_arr, global_index])	  
      else:
        dic_lex_Sim_CommonWords, maxPredLabel_lex, maxSim_lex, maxCommon_lex, minSim_lex=commonWordSims_clusterGroup(word_arr, dic_ClusterWords)
		
        text_Vec=generate_sent_vecs_toktextdata([word_arr], wordVectorsDic, 300)[0]		
        dic_semanticSims, maxPredLabel_Semantic, maxSim_Semantic, minSim_semantic=semanticSims(text_Vec, dic_ClusterVecs)		
        
        if maxCommon_lex>0 and str(maxPredLabel_lex)==str(maxPredLabel_Semantic):
          new_pred=str(maxPredLabel_lex)
          new_not_clustered_inds_batch.append([new_pred, true, word_arr, global_index])
        #else: #assign to a new cluster
        #  new_key=str(len(new_clusters)	+ len(keys_list)+10)
        #  new_pred=new_key
        #  new_not_clustered_inds_batch.append([new_pred, true, word_arr, global_index])		  
        #  new_clusters.setdefault(new_key,[]).append([new_pred, true, word_arr, global_index])		
          		
          		
        #elif maxCommon_lex>=6:
        #  new_pred=str(maxPredLabel_lex)
        #  new_not_clustered_inds_batch.append([new_pred, true, word_arr, global_index])
        #elif maxSim_Semantic>=0.5:
        #  new_pred=str(maxPredLabel_Semantic)
        #  new_not_clustered_inds_batch.append([new_pred, true, word_arr, global_index])		  
        #  maxPredLabel=int(str(maxPredLabel))+1	
        #  pred_true_text_ind_prevPred[0]=str(maxPredLabel)		  
        #  new_outs.append(pred_true_text_ind_prevPred)	
    #elif closeKey_Lexical != None:
    #  new_pred=str(closeKey_Lexical)
    #  new_not_clustered_inds_batch.append([new_pred, true, word_arr, global_index])'''
    '''else:
      if closeKey_Semantic !=None:
        new_pred=str(closeKey_Semantic)
        new_not_clustered_inds_batch.append([new_pred, true, word_arr, global_index])'''  	
	
	

  return new_not_clustered_inds_batch
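The live part of the loop above adopts an unclustered item only when the lexically closest cluster (most shared words) and the semantically closest cluster (highest cosine similarity to the summed cluster vector) agree. A toy version of that agreement rule, with commonWordSims_clusterGroup and semanticSims replaced by inline computations:

import numpy as np

def agree_assign(word_arr, text_vec, cluster_words, cluster_vecs):
    # lexically closest cluster: largest word overlap
    lex_label, lex_common = max(
        ((k, len(set(w) & set(word_arr))) for k, w in cluster_words.items()),
        key=lambda kv: kv[1])
    # semantically closest cluster: highest cosine similarity
    sem_label = max(cluster_vecs, key=lambda k: float(
        np.dot(text_vec, cluster_vecs[k]) /
        (np.linalg.norm(text_vec) * np.linalg.norm(cluster_vecs[k]) + 1e-12)))
    return lex_label if lex_common > 0 and lex_label == sem_label else None

cluster_words = {"c1": ["python", "pandas"], "c2": ["java", "spring"]}
cluster_vecs = {"c1": np.array([1.0, 0.0]), "c2": np.array([0.0, 1.0])}
print(agree_assign(["python", "csv"], np.array([0.9, 0.1]), cluster_words, cluster_vecs))  # c1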
Example No. 7
def findDuplicateBySemantic(test_item):

  t11=datetime.now()

  testTruelabel= test_item[1] 
  test_words=test_item[2]	
  testpostId=test_item[4]  
  testCreateTime=test_item[5]
  testDateTime= datetime.strptime(test_item[5].split("T")[0] ,"%Y-%m-%d")

  test_X=generate_sent_vecs_toktextdata([test_words], wordVectorsDic, embedDim)
  test_text_Vec=test_X[0]	  
  
  dic_gram__sim={}
  
  for gram, center_Vec in dic_ngram__center.items():
    sim = 1-spatial.distance.cosine(center_Vec, test_text_Vec)
    dic_gram__sim[gram]=sim	
  
  list_sim=list(dic_gram__sim.values())
  sim_stdev=statistics.stdev(list_sim)
  sim_mean=statistics.mean(list_sim)
  
  all_textIds=[]  
  for gram, center_Vec in dic_ngram__center.items():
    gram_sim=dic_gram__sim[gram]
    if gram_sim>=sim_mean+sim_stdev:
      txtIds=dic_ngram__txtIds[gram]
      all_textIds.extend(txtIds)

  all_textIds=set(all_textIds) 
  ProposedHitRank=0
  print('sem-all_textIds', len(all_textIds), 'test_words', test_words)  
  for txtId in all_textIds:
    ProposedHitRank+=1	
    if ProposedHitRank > max_hitindex:
      break
	  
    train_item=dic_txtId__text[txtId] 
	  
    trainTruelabel=train_item[1]
    train_words=train_item[2]
    trainPostId=train_item[4]	
    trainCreateTime = train_item[5]	
	  
    if str(trainTruelabel)==str(testTruelabel):
    
      t12=datetime.now()	  
      t_diff = t12-t11 	
	  
      text_sim, commonCount = computeTextSimCommonWord_WordDic(Counter(test_words), Counter(train_words), len(test_words), len(train_words) )	  
      # note: sortedGrams is not defined in this function; it must exist at module scope
      ProposedHitRank_val=int(max(1,math.floor(ProposedHitRank/len(sortedGrams))))
      	
      trainDateTime= datetime.strptime(train_item[5].split("T")[0] ,"%Y-%m-%d")
      date_diff=trainDateTime-testDateTime
      date_diff=date_diff.days      	  
	  
      print(str(testpostId)+"\t"+str(trainPostId)+"\t"+str(text_sim)+"\t"+str(ProposedHitRank_val)+"\t"+str(t_diff.microseconds)+"\t"+str(testTruelabel)+"\t"+' '.join(test_words)+"\t"+' '.join(train_words)+"\t"+testCreateTime+"\t"+trainCreateTime+"\t"+str(date_diff))
	  
      return True

  return False  
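The candidate-selection step above keeps only the n-grams whose centroid similarity sits at least one standard deviation above the mean similarity; that thresholding on its own:

import statistics

sims = {"python pandas": 0.91, "java": 0.35, "python": 0.78, "spring boot": 0.30}
mean, std = statistics.mean(sims.values()), statistics.stdev(sims.values())
candidates = [g for g, s in sims.items() if s >= mean + std]
print(candidates)  # only the n-grams well above the average similarity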
def cluster_biterm(f,
                   list_pred_true_words_index_postid_createtime,
                   c_bitermsFreqs={},
                   c_totalBiterms={},
                   c_wordsFreqs={},
                   c_totalWords={},
                   c_txtIds={},
                   c_clusterVecs={},
                   txtId_txt={},
                   last_txtId=0,
                   max_c_id=0,
                   wordVectorsDic={},
                   dic_clus__id={},
                   dic_biterm__clusterId_Freq={},
                   dic_biterm__allClusterFreq={},
                   dic_biterm__clusterIds={},
                   c_textItems={},
                   dic_ngram__textItems={},
                   min_gram=1,
                   max_gram=2,
                   isTagSim=True,
                   isTitleSim=False,
                   isBodySim=False):
    print("cluster_bigram")

    # current_txt_id=last_txtId

    eval_pred_true_txt = []

    line_count = 0

    t11 = datetime.now()

    for item in list_pred_true_words_index_postid_createtime:

        words = item[2]
        current_txt_id = int(item[3])
        postId = item[4]

        bi_terms = construct_biterms(words)
        grams = generateGramsConsucetive(words, min_gram, max_gram)
        # bi_terms=generateGramsConsucetive(words,minGSize, maxGSize)
        # print(words, bi_terms)

        for gram in grams:
            dic_ngram__textItems.setdefault(gram, []).append(item)

        line_count += 1

        txtBitermsFreqs = Counter(bi_terms)
        bi_terms_len = len(bi_terms)

        txtWordsFreqs = Counter(words)
        words_len = len(words)

        text_Vec = [0] * embedDim
        if isSemantic == True:
            X = generate_sent_vecs_toktextdata([words], wordVectorsDic,
                                               embedDim)
            text_Vec = X[0]

        # clusterId=findCloseCluster(c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords, c_clusterVecs, txtBitermsFreqs, bi_terms_len, txtWordsFreqs, words_len, max_c_id, text_Vec, dic_biterm__clusterIds)

        targetClusterIds = findTargetClusters(txtBitermsFreqs,
                                              dic_biterm__clusterIds)

        clusterId = findCloseClusterByTargetClusters(
            c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs,
            c_totalWords, c_clusterVecs, txtBitermsFreqs, bi_terms_len,
            txtWordsFreqs, words_len, max_c_id, text_Vec,
            dic_biterm__clusterIds, targetClusterIds)

        c_textItems.setdefault(clusterId, []).append(item)

        max_c_id = max([max_c_id, clusterId, len(c_bitermsFreqs)])

        dic_clus__id[clusterId] = max_c_id

        txtId_txt[current_txt_id] = words

        c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords, c_clusterVecs, dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq, dic_biterm__clusterIds = populateClusterFeature(
            c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs,
            c_totalWords, c_clusterVecs, txtBitermsFreqs, bi_terms_len,
            txtWordsFreqs, words_len, clusterId, current_txt_id, text_Vec,
            dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq,
            dic_biterm__clusterIds)

        # c_bitermsFreqs, c_totalBiterms, c_txtIds, dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq=removeHighEntropyFtrs(c_bitermsFreqs, c_totalBiterms, c_txtIds, dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq)

        # print('clusterId', clusterId, 'current_txt_id', current_txt_id, len(c_textItems), len(c_txtIds), words, len(targetClusterIds), len(dic_ngram__textItems))

        eval_pred_true_txt.append([clusterId, item[1], item[2]])
        if ignoreMinusOne == True:
            if str(item[1]) != '-1':
                f.write(
                    str(clusterId) + "	" + str(item[1]) + "	" +
                    str(' '.join(item[2])) + "	" + postId + "\n")
        else:
            f.write(
                str(clusterId) + "	" + str(item[1]) + "	" +
                str(' '.join(item[2])) + "	" + postId + "\n")

        if line_count % 500 == 0:

            # print(dic_clus__id)
            print(len(dic_clus__id))
            # delete old and small clusters, remove multi-cluster words from clusters
            list_c_sizes = []
            list_c_ids = []
            # list_size__cid={}

            for c_id, txtIds in c_txtIds.items():
                list_c_sizes.append(len(txtIds))
                list_c_ids.append(dic_clus__id[c_id])
                # list_size__cid[len(txtIds)]=c_id
            mean_c_size = 0
            std_c_size = 0
            if len(list_c_sizes) > 2:
                mean_c_size = statistics.mean(list_c_sizes)
                std_c_size = statistics.stdev(list_c_sizes)

            mean_c_id = 0
            std_c_id = 0
            if len(list_c_ids) > 2:
                mean_c_id = statistics.mean(list_c_ids)
                std_c_id = statistics.stdev(list_c_ids)

            print('process', line_count, 'texts', 'mean_c_size', mean_c_size,
                  'std_c_size', std_c_size)
            print('process', line_count, 'texts', 'mean_c_id', mean_c_id,
                  'std_c_id', std_c_id)

            list_del_cids = []
            del_count = 0

            for c_id, txtIds in c_txtIds.items():
                c_size = len(txtIds)
                if ((c_size <= 1 or
                     float(c_size) <= float(abs(mean_c_size - std_c_size))) or
                    (float(c_size) >= mean_c_size + std_c_size)) or (
                        (float(c_id) <= float(abs(mean_c_id - std_c_id))) or
                        (float(c_id) >= float(abs(mean_c_id + std_c_id)))):
                    list_del_cids.append(c_id)

            list_del_cids = set(list_del_cids)
            print('#list_del_cids', len(list_del_cids), 'len(c_bitermsFreqs)',
                  len(c_bitermsFreqs))

            listTargetBiterms = []  # need to uncomment

            for c_id in list_del_cids:

                if c_id in c_bitermsFreqs:
                    # print('del c_id', c_id, len(c_bitermsFreqs[c_id]))
                    del c_bitermsFreqs[c_id]

                if c_id in c_totalBiterms:
                    del c_totalBiterms[c_id]

                if c_id in c_txtIds:
                    del c_txtIds[c_id]

                if c_id in c_wordsFreqs:
                    del c_wordsFreqs[c_id]

                if c_id in c_totalWords:
                    del c_totalWords[c_id]

                if c_id in dic_clus__id:
                    del dic_clus__id[c_id]

                if isSemantic == True and c_id in c_clusterVecs:
                    del c_clusterVecs[c_id]

            # c_bitermsFreqs, c_totalBiterms, c_txtIds, dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq=removeHighEntropyFtrs(c_bitermsFreqs, c_totalBiterms, c_txtIds, dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq)

        if line_count % 1000 == 0:
            print('#######-personal-eval_pred_true_txt',
                  len(eval_pred_true_txt))
            Evaluate(eval_pred_true_txt, ignoreMinusOne)

            t12 = datetime.now()
            t_diff = t12 - t11
            print("total time diff secs=", t_diff.seconds)

    last_txtId = current_txt_id
    return [
        c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds,
        c_clusterVecs, txtId_txt, last_txtId, dic_clus__id,
        dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq,
        dic_biterm__clusterIds, c_textItems, dic_ngram__textItems
    ]
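construct_biterms and findTargetClusters are defined elsewhere; assuming a biterm is an unordered word pair, the inverted biterm-to-cluster index the loop relies on looks roughly like this (names below are illustrative):

from collections import Counter
from itertools import combinations

def construct_biterms_sketch(words):
    return [' '.join(sorted(pair)) for pair in combinations(words, 2)]

def find_target_clusters(biterm_freqs, biterm_to_clusters):
    target = set()
    for biterm in biterm_freqs:
        target.update(biterm_to_clusters.get(biterm, []))
    return target

dic_biterm__clusterIds = {}
biterms = Counter(construct_biterms_sketch(["python", "pandas", "csv"]))
for bt in biterms:
    dic_biterm__clusterIds.setdefault(bt, []).append(0)  # pretend cluster 0 holds them
print(sorted(biterms), find_target_clusters(biterms, dic_biterm__clusterIds))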
def cluster_biterm_framework(
        f, list_CPost, c_CFVector, max_c_id, dic_txtId__CPost, wordVectorsDic,
        dic_clus__id, dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds,
        dic_bitermBody__clusterIds, dic_ngram__txtIds, min_gram, max_gram,
        oCSimilarityFlgas, c_itemsCount):
    eval_pred_true_txt = []

    line_count = 0

    t11 = datetime.now()

    for oCPost in list_CPost:

        trueLabel = oCPost.trueLabel
        tagWords = oCPost.tagWords
        titleWords = oCPost.titleWords
        bodyWords = oCPost.bodyWords
        id = oCPost.id
        soPostId = oCPost.soPostId
        createtime = oCPost.createtime

        print('id', id, 'tagWords', tagWords, 'titleWords', titleWords,
              'bodyWords', bodyWords)

        txtBitermsFreqs_Tag = None
        bi_terms_len_Tag = 0
        grams_Tag = None

        txtBitermsFreqs_Title = None
        bi_terms_len_Title = 0
        grams_Title = None

        txtBitermsFreqs_Body = None
        bi_terms_len_Body = 0
        grams_Body = None

        text_VecTag = None
        text_VecTitle = None
        text_VecBody = None
        targetClusterIds = []

        dic_txtId__CPost[id] = oCPost

        if oCSimilarityFlgas.isTagSim:
            bi_termsTag = construct_biterms(tagWords)

            grams_Tag = generateGramsConsucetive(tagWords, min_gram, max_gram)
            for gram in grams_Tag:
                if gram in dic_ngram__txtIds and len(
                        set(dic_ngram__txtIds[gram])) > max_cposts:
                    continue
                dic_ngram__txtIds.setdefault(gram, []).append(id)
            txtBitermsFreqs_Tag = Counter(bi_termsTag)
            bi_terms_len_Tag = len(bi_termsTag)
            tCIds = findTargetClusters(txtBitermsFreqs_Tag,
                                       dic_bitermTag__clusterIds)
            # print('dic_bitermTag__clusterIds', dic_bitermTag__clusterIds, 'txtBitermsFreqs_Tag', txtBitermsFreqs_Tag)
            targetClusterIds.extend(tCIds)

            if isSemantic:
                X = generate_sent_vecs_toktextdata([tagWords], wordVectorsDic,
                                                   embedDim)
                text_VecTag = X[0]

        if oCSimilarityFlgas.isTitleSim:
            bi_termsTitle = construct_biterms(titleWords)
            grams_Title = generateGramsConsucetive(titleWords, min_gram,
                                                   max_gram)
            for gram in grams_Title:
                if gram in dic_ngram__txtIds and len(
                        set(dic_ngram__txtIds[gram])) > max_cposts:
                    continue
                dic_ngram__txtIds.setdefault(gram, []).append(id)
            txtBitermsFreqs_Title = Counter(bi_termsTitle)
            bi_terms_len_Title = len(bi_termsTitle)
            tCIds = findTargetClusters(txtBitermsFreqs_Title,
                                       dic_bitermTitle__clusterIds)
            targetClusterIds.extend(tCIds)

            if isSemantic:
                X = generate_sent_vecs_toktextdata([titleWords],
                                                   wordVectorsDic, embedDim)
                text_VecTitle = X[0]

        if oCSimilarityFlgas.isBodySim:
            bi_termsBody = construct_biterms(bodyWords)
            grams_Body = generateGramsConsucetive(bodyWords, min_gram,
                                                  max_gram)
            for gram in grams_Body:
                if gram in dic_ngram__txtIds and len(
                        set(dic_ngram__txtIds[gram])) > max_cposts:
                    continue
                dic_ngram__txtIds.setdefault(gram, []).append(id)
            txtBitermsFreqs_Body = Counter(bi_termsBody)
            bi_terms_len_Body = len(bi_termsBody)
            tCIds = findTargetClusters(txtBitermsFreqs_Body,
                                       dic_bitermBody__clusterIds)
            targetClusterIds.extend(tCIds)

            if isSemantic:
                X = generate_sent_vecs_toktextdata([bodyWords], wordVectorsDic,
                                                   embedDim)
                text_VecBody = X[0]

        oCPostProcessed = CPostProcessed(txtBitermsFreqs_Tag, bi_terms_len_Tag,
                                         txtBitermsFreqs_Title,
                                         bi_terms_len_Title,
                                         txtBitermsFreqs_Body,
                                         bi_terms_len_Body, text_VecTag,
                                         text_VecTitle, text_VecBody)

        targetClusterIds = set(targetClusterIds)

        clusterId = findCloseClusterByTargetClusters_framework(
            c_CFVector, oCPostProcessed, targetClusterIds, max_c_id,
            oCSimilarityFlgas)

        if ignoreMinusOne:
            if str(trueLabel) != '-1':
                f.write(
                    str(clusterId) + "	" + str(trueLabel) + "	" +
                    ' '.join(tagWords) + "	" + str(soPostId) + "\n")
        else:
            f.write(
                str(clusterId) + "	" + str(trueLabel) + "	" +
                ' '.join(tagWords) + "	" + str(soPostId) + "\n")

        eval_pred_true_txt.append([clusterId, trueLabel, tagWords])

        if clusterId not in c_itemsCount:
            c_itemsCount[clusterId] = 0
        c_itemsCount[clusterId] += 1

        max_c_id = max([max_c_id, clusterId, len(c_CFVector)])

        dic_clus__id[clusterId] = max_c_id
        # print('max_c_id, len(c_CFVector)', max_c_id, len(c_CFVector))

        c_CFVector, dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds, dic_bitermBody__clusterIds = populateClusterFeature_framework(
            c_CFVector, oCPostProcessed, dic_bitermTag__clusterIds,
            dic_bitermTitle__clusterIds, dic_bitermBody__clusterIds, clusterId,
            id, oCSimilarityFlgas)

        del oCPostProcessed
        del oCPost

        line_count += 1

        if line_count % DeleteInterval == 0:
            c_CFVector, c_itemsCount = deleteOldClusters_framework(
                c_CFVector, c_itemsCount, dic_clus__id)

        if line_count % 1000 == 0:
            # print('c_itemsCount', c_itemsCount)
            Evaluate(eval_pred_true_txt, ignoreMinusOne)

    return [
        c_CFVector, max_c_id, dic_txtId__CPost, dic_clus__id,
        dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds,
        dic_bitermBody__clusterIds, dic_ngram__txtIds, c_itemsCount
    ]
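A minimal stand-in for the oCSimilarityFlgas gating above: each enabled field (tags, title, body) contributes its own biterm counter to the item's features. The SimilarityFlags namedtuple and field_biterms helper are assumptions made for this sketch, not the project's real classes:

from collections import Counter, namedtuple
from itertools import combinations

SimilarityFlags = namedtuple("SimilarityFlags", "isTagSim isTitleSim isBodySim")

def field_biterms(words):
    return Counter(' '.join(sorted(p)) for p in combinations(words, 2))

def build_features(flags, tag_words, title_words, body_words):
    features = {}
    if flags.isTagSim:
        features["tag"] = field_biterms(tag_words)
    if flags.isTitleSim:
        features["title"] = field_biterms(title_words)
    if flags.isBodySim:
        features["body"] = field_biterms(body_words)
    return features

flags = SimilarityFlags(isTagSim=True, isTitleSim=True, isBodySim=False)
print(build_features(flags, ["python", "pandas"], ["read", "csv", "file"], ["long", "body"]))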
Example No. 10
def trainLoad_cluster_biterm(trainList_pred_true_text_postid,
                             c_bitermsFreqs={},
                             c_totalBiterms={},
                             c_wordsFreqs={},
                             c_totalWords={},
                             c_txtIds={},
                             c_clusterVecs={},
                             txtId_txt={},
                             wordVectorsDic={},
                             dic_clus__id={},
                             dic_biterm__clusterIds={},
                             dic_word__clusterIds={}):
    print("train cluster_bigram")

    dicTrain_pred__trues = {}

    eval_pred_true_txt = []

    line_count = 0

    t11 = datetime.now()

    for item in trainList_pred_true_text_postid:
        pred = item[0]  #pred clusId
        true = item[1]
        words = item[2].split(' ')
        postId = item[3]
        bi_terms = construct_biterms(words)
        #bi_terms=generateGramsConsucetive(words, minGSize, maxGSize)
        #print(words, bi_terms)

        line_count += 1

        txtBitermsFreqs = Counter(bi_terms)
        bi_terms_len = len(bi_terms)

        txtWordsFreqs = Counter(words)
        words_len = len(words)

        text_Vec = [0] * embedDim
        if isSemantic == True:
            X = generate_sent_vecs_toktextdata([words], wordVectorsDic,
                                               embedDim)
            text_Vec = X[0]

        clusterId = int(pred)
        #dicTrain_pred__trues[clusterId]=int(true)
        dicTrain_pred__trues.setdefault(clusterId, []).append(int(true))

        dic_clus__id[clusterId] = clusterId
        current_txt_id = int(postId)

        txtId_txt[current_txt_id] = item

        c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords, c_clusterVecs, dic_biterm__clusterIds, dic_word__clusterIds = populateClusterFeature(
            c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs,
            c_totalWords, c_clusterVecs, txtBitermsFreqs, bi_terms_len,
            txtWordsFreqs, words_len, clusterId, current_txt_id, text_Vec,
            dic_biterm__clusterIds, dic_word__clusterIds)

        eval_pred_true_txt.append([clusterId, item[1], item[2]])

        #if clusterId>0:
        #  print(item, bi_terms)
        #print(dic_biterm__clusterIds.keys())

        if line_count % 1000 == 0:
            print('#######-personal-eval_pred_true_txt',
                  len(eval_pred_true_txt))
            Evaluate(eval_pred_true_txt, ignoreMinusOne)

            t12 = datetime.now()
            t_diff = t12 - t11
            print("total time diff secs=", t_diff.seconds)

    return [
        c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds,
        c_clusterVecs, txtId_txt, dic_clus__id, dic_biterm__clusterIds,
        dic_word__clusterIds, dicTrain_pred__trues
    ]
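The training pass mostly records which true labels ended up in each predicted cluster; that bookkeeping on its own:

dicTrain_pred__trues = {}
for pred, true in [(3, 101), (3, 101), (7, 205)]:
    dicTrain_pred__trues.setdefault(pred, []).append(true)
print(dicTrain_pred__trues)  # {3: [101, 101], 7: [205]}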
def clusterByWordEmbeddingFeature(list_pred_true_text, wordVectorsDic):
    print("pred_mstreams")
    printClusterEvaluation_list(list_pred_true_text)
    dic_tupple_class = groupTxtByClass(list_pred_true_text, False)
    pred_clusters = len(dic_tupple_class)
    print("#clusters=" + str(pred_clusters))

    preds, trues, texts = split_pred_true_txt_from_list(list_pred_true_text)
    skStopWords = getScikitLearn_StopWords()
    texts = processTextsRemoveStopWordTokenized(texts, skStopWords)

    dicDocFreq = getDocFreq(texts)

    X = generate_sent_vecs_toktextdata(texts, wordVectorsDic, 300)

    #X = generate_weighted_sent_vecs_toktextdata(texts, wordVectorsDic, dicDocFreq, 300) #not good

    svd = TruncatedSVD(100)
    #svd = PCA(n_components=50)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    #X=X.toarray()
    X = lsa.fit_transform(X)

    km = KMeans(n_clusters=pred_clusters,
                init='k-means++',
                max_iter=100,
                random_state=0)
    km.fit(X)
    list_km_pred_true_text = combine_pred_true_txt_from_list(
        km.labels_, trues, texts)
    print("k-means")
    printClusterEvaluation_list(list_km_pred_true_text)

    ward = AgglomerativeClustering(n_clusters=pred_clusters,
                                   linkage='ward').fit(X)
    list_hr_pred_true_text = combine_pred_true_txt_from_list(
        ward.labels_, trues, texts)
    print("hr-ward")
    printClusterEvaluation_list(list_hr_pred_true_text)

    clustering = SpectralClustering(n_clusters=pred_clusters,
                                    assign_labels="discretize",
                                    random_state=0).fit(X)
    list_sp_pred_true_text = combine_pred_true_txt_from_list(
        clustering.labels_, trues, texts)
    print("spectral")
    printClusterEvaluation_list(list_sp_pred_true_text)

    brc = Birch(branching_factor=50,
                n_clusters=pred_clusters,
                threshold=0.5,
                compute_labels=True)
    brc.fit_predict(X)
    list_brc_pred_true_text = combine_pred_true_txt_from_list(
        brc.labels_, trues, texts)
    print("brc")
    printClusterEvaluation_list(list_brc_pred_true_text)

    gmm = GaussianMixture(n_components=pred_clusters, covariance_type='full')
    gmm_labels = gmm.fit_predict(X)
    list_gmm_pred_true_text = combine_pred_true_txt_from_list(
        gmm_labels, trues, texts)
    print("gmm")
    printClusterEvaluation_list(list_gmm_pred_true_text)
    train_textdata = []
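    # Note: lines, train_data, train_labels and train_trueLabels are assumed to be
    # populated earlier in the original script; they are not defined in this snippet.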

    for line in lines:
        line = line.lower().strip()
        arr = re.split("\t", line)
        train_data.append(arr[2])
        train_textdata.append(word_tokenize(arr[2]))
        train_labels.append(arr[0])
        train_trueLabels.append(arr[1])

    #vectorizer = TfidfVectorizer( max_df=1.0, min_df=1, stop_words='english', use_idf=True, smooth_idf=True, norm='l2')
    #x_train = vectorizer.fit_transform(train_data)
    gloveFile = "/home/owner/PhD/dr.norbert/dataset/shorttext/glove.42B.300d/glove.42B.300d.txt"
    termsVectorsDic = extract_word_vecs(train_textdata, gloveFile, 300)
    x_train = generate_sent_vecs_toktextdata(train_textdata, termsVectorsDic,
                                             300)

    contratio = 0.1  #len(train_data)/20000*2

    #if len(train_data)>1100:
    # contratio = len(train_data)/20000*7;

    isf = IsolationForest(n_estimators=100,
                          max_samples='auto',
                          contamination=contratio,
                          max_features=1.0,
                          bootstrap=True,
                          verbose=0,
                          random_state=0)
    outlierPreds = isf.fit(x_train).predict(x_train)
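A small, self-contained version of the outlier step: IsolationForest marks points far from the bulk of the (toy) sentence vectors with -1. The data and parameters here are illustrative only:

import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(0)
x_toy = np.vstack([rng.normal(0, 0.1, (20, 5)), [[5.0] * 5]])  # one obvious outlier

isf = IsolationForest(n_estimators=100, contamination=0.05, random_state=0)
print(isf.fit(x_toy).predict(x_toy))  # mostly 1s, with -1 for the injected outlier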
def cluster_biterm(f, list_pred_true_words_index, c_bitermsFreqs={}, c_totalBiterms={}, c_wordsFreqs={}, c_totalWords={}, c_txtIds={}, c_clusterVecs={}, txtId_txt={}, last_txtId=0, max_c_id=0, wordVectorsDic={}, dic_clus__id={}, dic_biterm__clusterId_Freq={}, dic_biterm__allClusterFreq={}, dic_biterm__clusterIds={}):
  print("cluster_bigram")

 

  current_txt_id=last_txtId

  eval_pred_true_txt=[]

  line_count=0

  t11=datetime.now()

  for item in list_pred_true_words_index:
    words=item[2]

    bi_terms=construct_biterms(words)

    current_txt_id+=1

    line_count+=1

    txtBitermsFreqs=Counter(bi_terms)
    bi_terms_len=len(bi_terms)

    txtWordsFreqs=Counter(words)
    words_len=len(words)

    text_Vec=[0]*embedDim
    if isSemantic==True:
      X=generate_sent_vecs_toktextdata([words], wordVectorsDic, embedDim)
      text_Vec=X[0]

    targetClusterIds=findTargetClusters(txtBitermsFreqs, dic_biterm__clusterIds)

    clusterId=findCloseClusterByTargetClusters(c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords, c_clusterVecs, txtBitermsFreqs, bi_terms_len, txtWordsFreqs, words_len, max_c_id, text_Vec, dic_biterm__clusterIds, targetClusterIds)

    max_c_id=max([max_c_id, clusterId, len(c_bitermsFreqs)])

    dic_clus__id[clusterId]=max_c_id

    txtId_txt[current_txt_id]=words

    c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords, c_clusterVecs, dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq, dic_biterm__clusterIds=populateClusterFeature(c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords, c_clusterVecs, txtBitermsFreqs, bi_terms_len, txtWordsFreqs, words_len, clusterId, current_txt_id, text_Vec, dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq, dic_biterm__clusterIds)
	

    
    eval_pred_true_txt.append([clusterId, item[1], item[2]])
    if ignoreMinusOne==True:
      if str(item[1])!='-1':   	
        f.write(str(clusterId)+"	"+str(item[1])+"	"+str(item[2])+"\n")
    else:
      f.write(str(clusterId)+"	"+str(item[1])+"	"+str(item[2])+"\n")  	

    
    '''if line_count%500==0:
       #remove multi-cluster biterms from c_bitermsFreqs   using targetClusterIds; before computing similarity	
       c_bitermsFreqs, c_totalBiterms, c_txtIds, txtBitermsFreqs=removeTargetMultiClusterBiTerms(c_bitermsFreqs, c_totalBiterms, c_txtIds, targetClusterIds, txtBitermsFreqs, dic_biterm__clusterIds)''' 	

    if line_count%500==0:

      #print(dic_clus__id)      
      print(len(dic_clus__id)) 	  
      #delete old and small clusters, remove multi-cluster words from clusters
      list_c_sizes=[]
      list_c_ids=[] 	  
      #list_size__cid={}
        	  
      for c_id, txtIds in c_txtIds.items():
        list_c_sizes.append(len(txtIds))
        list_c_ids.append(dic_clus__id[c_id])		
        #list_size__cid[len(txtIds)]=c_id		
      mean_c_size=std_c_size=0
      if len(list_c_sizes)>2:
        mean_c_size=statistics.mean(list_c_sizes)
        std_c_size=statistics.stdev(list_c_sizes)
      mean_c_id=std_c_id=0
      if len(list_c_ids)>2:
        mean_c_id=statistics.mean(list_c_ids)
        std_c_id=statistics.stdev(list_c_ids)

      print('process', line_count, 'texts', 'mean_c_size', mean_c_size, 'std_c_size', std_c_size)
      print('process', line_count, 'texts', 'mean_c_id', mean_c_id, 'std_c_id', std_c_id)

      list_del_cids=[]
      del_count=0

      '''for c_id, txtIds in c_txtIds.items():
        c_size=	len(txtIds)
        ##print('c_id=', c_id, 'c_size=', c_size)		
        #if c_size<=2 :#or del_count<15:
        #  list_del_cids.append(c_id)
        #  print('delete cluster=',c_id, '#size=', c_size) 		  		  
          #del_count+=1	  
        	  
        #if c_size<=1 or float(c_size)<=float(abs(mean_c_size-std_c_size)) or float(c_size)>=mean_c_size+std_c_size or float(c_size)>=mean_c_size:  		
        #if float(c_size)<float(abs(mean_c_size)):
        #  list_del_cids.append(c_id)
          #print('delete cluster=',c_id, '#size=', c_size)  		  
		  
        #float(c_id)<=float(abs(mean_c_id-std_c_id))		  
        if (c_size<=1 or float(c_size)<=float(abs(mean_c_size-std_c_size))) or float(c_size)>=mean_c_size: #and del_count<100:  		   		
          list_del_cids.append(c_id)
          del_count+=1
        		
        #  print('delete cluster=',c_id, '#size=', c_size) 		  
          
      #list_c_sizes.sort(reverse=True)
	  
      #for c_size in list_c_sizes[0:20]:
      #  list_del_cids.append(list_size__cid[c_size])''' 	


      for c_id, orderId in dic_clus__id.items():
        #if float(c_id)<=float(abs(mean_c_id-std_c_id)) or float(orderId)<=float(abs(mean_c_id-std_c_id)):
        if c_id not in c_txtIds:
          continue  		
        c_size=len(c_txtIds[c_id])	  
        if ( float(c_id)<=float(abs(mean_c_id-std_c_id)) or float(orderId)<=float(abs(mean_c_id-std_c_id))) and (c_size<=1 or float(c_size)<=float(abs(mean_c_size-std_c_size))):
        #or float(c_size)>=mean_c_size+std_c_size*1):		
          list_del_cids.append(c_id)

      print('#list_del_cids', len(list_del_cids), 'len(c_bitermsFreqs)', len(c_bitermsFreqs))


      listTargetBiterms=[]
	  
      for c_id in list_del_cids:
        BitermsFreqs=c_bitermsFreqs[c_id]  
        for biterm, freq in BitermsFreqs.items():
          if biterm not in dic_biterm__clusterIds:             
            continue			
          clusterIds=set(dic_biterm__clusterIds[biterm])
          if c_id not in clusterIds:			
            continue 			
          clusterIds.remove(c_id)				
          dic_biterm__clusterIds[biterm]=list(clusterIds)		
          if len(dic_biterm__clusterIds[biterm])==0:
            del dic_biterm__clusterIds[biterm]

        del c_bitermsFreqs[c_id]
        del c_totalBiterms[c_id]
        del c_txtIds[c_id] 
        del c_wordsFreqs[c_id] 
        del c_totalWords[c_id]
        del dic_clus__id[c_id]
        if isSemantic==True:		
          del c_clusterVecs[c_id]
        		
        #for biterm, dic_clusterId__Freq in dic_biterm__clusterId_Freq.items():
        #  if c_id in dic_biterm__clusterId_Freq[biterm]:
        #    bitermClusterIdFreq=dic_biterm__clusterId_Freq[biterm][c_id]		  
            #dic_biterm__clusterId_Freq[biterm][c_id]=0	
        #    dic_biterm__allClusterFreq[biterm]-=bitermClusterIdFreq	
        #    listTargetBiterms.append(biterm) 			
        #    del dic_biterm__clusterId_Freq[biterm][c_id]
            
			
      #listTargetBiterms=set(listTargetBiterms)
      #for biterm in listTargetBiterms:
      #  if dic_biterm__allClusterFreq[biterm]<=0:
      #    del dic_biterm__clusterId_Freq[biterm]
      #    del dic_biterm__allClusterFreq[biterm] 		  
	  
      
      #c_bitermsFreqs, c_totalBiterms, c_txtIds, dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq=removeHighEntropyFtrs(c_bitermsFreqs, c_totalBiterms, c_txtIds, dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq)
    	
    if line_count%1000==0:
      print('#######-personal-eval_pred_true_txt', len(eval_pred_true_txt))
      Evaluate_old(eval_pred_true_txt, ignoreMinusOne)

      t12=datetime.now()
      t_diff = t12-t11
      print("total time diff secs=", t_diff.seconds)

  last_txtId=current_txt_id
  return [c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds, c_clusterVecs, txtId_txt, last_txtId, dic_clus__id, dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq, dic_biterm__clusterIds]
    	
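The periodic cleanup above drops clusters that are both old (small id / order id relative to the id statistics) and small relative to the size statistics; a compact toy version of part of that rule:

import statistics

c_txtIds = {1: [10], 2: [11, 12, 13, 14], 3: [15, 16], 20: [17, 18, 19]}
sizes = [len(v) for v in c_txtIds.values()]
ids = list(c_txtIds.keys())
mean_size, std_size = statistics.mean(sizes), statistics.stdev(sizes)
mean_id, std_id = statistics.mean(ids), statistics.stdev(ids)

to_delete = [cid for cid, txts in c_txtIds.items()
             if cid <= abs(mean_id - std_id)
             and (len(txts) <= 1 or len(txts) <= abs(mean_size - std_size))]
print(to_delete)  # the small, low-id cluster(s)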
Example No. 14
def test_cluster_biterm(testList_pred_true_words_index_postid,
                        c_bitermsFreqs={},
                        c_totalBiterms={},
                        c_wordsFreqs={},
                        c_totalWords={},
                        c_txtIds={},
                        c_clusterVecs={},
                        txtId_txt={},
                        last_txtId=0,
                        max_c_id=0,
                        wordVectorsDic={},
                        dic_clus__id={},
                        dic_biterm__clusterIds={},
                        dicTrain_pred__trues={}):
    print("test cluster_bigram")

    current_txt_id = last_txtId

    eval_pred_true_txt = []

    line_count = 0

    t11 = datetime.now()

    for item in testList_pred_true_words_index_postid:
        pred = item[0]
        testTrue = int(item[1])
        words = item[2]
        postId = item[4]
        bi_terms = construct_biterms(words)
        #print(words, bi_terms, pred)

        current_txt_id += 1

        line_count += 1

        txtBitermsFreqs = Counter(bi_terms)
        bi_terms_len = len(bi_terms)

        txtWordsFreqs = Counter(words)
        words_len = len(words)

        text_Vec = [0] * embedDim
        if isSemantic == True:
            X = generate_sent_vecs_toktextdata([words], wordVectorsDic,
                                               embedDim)
            text_Vec = X[0]

        targetClusterIds = findTargetClusters(txtBitermsFreqs,
                                              dic_biterm__clusterIds)

        print(targetClusterIds)

        clusterId = findCloseClusterByTargetClusters(
            c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs,
            c_totalWords, c_clusterVecs, txtBitermsFreqs, bi_terms_len,
            txtWordsFreqs, words_len, max_c_id, text_Vec,
            dic_biterm__clusterIds, targetClusterIds)

        if clusterId in dicTrain_pred__trues and testTrue in dicTrain_pred__trues[
                clusterId]:
            print('found found', 'clusterId', clusterId, 'testTrue', testTrue,
                  words, postId, 'len', len(dicTrain_pred__trues[clusterId]))
        else:
            print('not found', 'clusterId', clusterId, 'testTrue', testTrue,
                  words, postId)

        #max_c_id=max([max_c_id, clusterId,len(c_bitermsFreqs)])

        #dic_clus__id[clusterId]=max_c_id

        #txtId_txt[current_txt_id]=words

        #c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords, c_clusterVecs, dic_biterm__clusterIds=populateClusterFeature(c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords, c_clusterVecs, txtBitermsFreqs, bi_terms_len, txtWordsFreqs, words_len, clusterId, current_txt_id, text_Vec,  dic_biterm__clusterIds) #no need here
        '''eval_pred_true_txt.append([clusterId, item[1], item[2]])
    if ignoreMinusOne==True:
      if str(item[1])!='-1':   	
        f.write(str(clusterId)+"	"+str(item[1])+"	"+str(item[2])+"	"+postId+"\n")
    else:
      f.write(str(clusterId)+"	"+str(item[1])+"	"+str(item[2])+"	"+postId+"\n")  	

    
    if line_count%500==0:
       #remove multi-cluster biterms from c_bitermsFreqs   using targetClusterIds; before computing similarity	
       c_bitermsFreqs, c_totalBiterms, c_txtIds, txtBitermsFreqs=removeTargetMultiClusterBiTerms(c_bitermsFreqs, c_totalBiterms, c_txtIds, targetClusterIds, txtBitermsFreqs, dic_biterm__clusterIds)'''
        '''if line_count%500==0:

      #print(dic_clus__id)      
      print(len(dic_clus__id)) 	  
      #delete old and small clusters, remove multi-cluster words from clusters
      list_c_sizes=[]
      list_c_ids=[] 	  
      #list_size__cid={}
        	  
      for c_id, txtIds in c_txtIds.items():
        list_c_sizes.append(len(txtIds))
        list_c_ids.append(dic_clus__id[c_id])		
        #list_size__cid[len(txtIds)]=c_id		
      mean_c_size=statistics.mean(list_c_sizes)
      std_c_size=statistics.stdev(list_c_sizes)

      mean_c_id=statistics.mean(list_c_ids)
      std_c_id=statistics.stdev(list_c_ids)	  

      print('preocess', line_count, 'texts', 'mean_c_size', mean_c_size, 'std_c_size', std_c_size)	
      print('preocess', line_count, 'texts', 'mean_c_id', mean_c_id, 'std_c_id', std_c_id)	  
	  
      list_del_cids=[]  
      del_count=0	

	  
      	


      for c_id, orderId in dic_clus__id.items():
        #if float(c_id)<=float(abs(mean_c_id-std_c_id)) or float(orderId)<=float(abs(mean_c_id-std_c_id)):
        if c_id not in c_txtIds:
          continue  		
        c_size=len(c_txtIds[c_id])	  
        if ( float(c_id)<=float(abs(mean_c_id-std_c_id)) or float(orderId)<=float(abs(mean_c_id-std_c_id))) and (c_size<=1 or float(c_size)<=float(abs(mean_c_size-std_c_size))):
        #or float(c_size)>=mean_c_size+std_c_size*1):		
          list_del_cids.append(c_id)  		
	  
	  
	  
		  
      print('#list_del_cids', len(list_del_cids), 'len(c_bitermsFreqs)', len(c_bitermsFreqs))


      listTargetBiterms=[]
	  
      for c_id in list_del_cids:
        BitermsFreqs=c_bitermsFreqs[c_id]  
        for biterm, freq in BitermsFreqs.items():
          if biterm not in dic_biterm__clusterIds:             
            continue			
          clusterIds=set(dic_biterm__clusterIds[biterm])
          if c_id not in clusterIds:			
            continue 			
          clusterIds.remove(c_id)				
          dic_biterm__clusterIds[biterm]=list(clusterIds)		
          if len(dic_biterm__clusterIds[biterm])==0:
            del dic_biterm__clusterIds[biterm]
			
  		
        		
	  
        del c_bitermsFreqs[c_id]
        del c_totalBiterms[c_id]
        del c_txtIds[c_id] 
        del c_wordsFreqs[c_id] 
        del c_totalWords[c_id]
        del dic_clus__id[c_id]
        if isSemantic==True:		
          del c_clusterVecs[c_id]
        		
       
            
			
      
    	
    if line_count%1000==0:  
      print('#######-personal-eval_pred_true_txt', len(eval_pred_true_txt))	 	
      Evaluate(eval_pred_true_txt, ignoreMinusOne)

      t12=datetime.now()	  
      t_diff = t12-t11
      print("total time diff secs=",t_diff.seconds) '''

    last_txtId = current_txt_id
    return [
        c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds,
        c_clusterVecs, txtId_txt, last_txtId, dic_clus__id,
        dic_biterm__clusterIds
    ]
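The test pass simply checks whether the test item's true label appears among the true labels recorded for its predicted cluster:

dicTrain_pred__trues = {3: [101, 101, 205]}
clusterId, testTrue = 3, 205
print('found' if clusterId in dicTrain_pred__trues
      and testTrue in dicTrain_pred__trues[clusterId] else 'not found')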
Example No. 15
def test_cluster_bitermMapping_buffer(
        testList_pred_true_words_index_postid_createtime,
        c_bitermsFreqs={},
        c_totalBiterms={},
        c_wordsFreqs={},
        c_totalWords={},
        c_txtIds={},
        c_clusterVecs={},
        txtId_txt={},
        last_txtId=0,
        max_c_id=0,
        wordVectorsDic={},
        dic_clus__id={},
        dic_biterm__clusterIds={},
        c_textItems={},
        dic_ngram__textItems={},
        min_gram=1,
        max_gram=2,
        max_hitindex=10000):

    eval_pred_true_txt = []

    line_count = 0

    print("testpostId" + "\t" + "trainPostId" + "\t" + "simtype" + "\t" +
          "hitranktype" + "\t" + "Proposed_hit_duration_micro" + "\t" +
          "Proposed_TestTrueLabel" + "\t" + "testText" + "\t" + "trainText" +
          "\t" + "testCreateTime" + "\t" + "TrainCreateTime" + "\t" +
          "DaysDiff")

    for item in testList_pred_true_words_index_postid_createtime:
        t11 = datetime.now()
        pred = item[0]
        testTrue = int(item[1])
        words = item[2]
        testpostId = item[4]
        testDateTime = datetime.strptime(item[5].split("t")[0],
                                         "%Y-%m-%d")  #datetime.now() # item[5]
        #print('testDateTime', item[5])
        bi_terms = construct_biterms(words)

        #print(words, bi_terms, pred)

        #current_txt_id=int(testpostId)

        line_count += 1

        txtBitermsFreqs = Counter(bi_terms)
        bi_terms_len = len(bi_terms)

        txtWordsFreqs = Counter(words)
        words_len = len(words)

        text_Vec = [0] * embedDim
        if isSemantic == True:
            X = generate_sent_vecs_toktextdata([words], wordVectorsDic,
                                               embedDim)
            text_Vec = X[0]

        #text->biterms
        #biterms->targetClusterIds
        #targetClusterIds->txtIds  by c_txtIds
        #txtIds->textItems	by txtId_txt

        targetClusterIds = findTargetClusters(txtBitermsFreqs,
                                              dic_biterm__clusterIds)
        trainItems = findTextItems(targetClusterIds, c_textItems)

        grams = generateGramsConsucetive(words, min_gram, max_gram)
        sortedGrams = list(sorted(grams, key=len, reverse=True))
        train_Items = aggregateTextItems(sortedGrams, dic_ngram__textItems)

        trainItems.extend(train_Items)

        #print('len(targetClusterIds)', len(targetClusterIds), 'len(trainItems)',len(trainItems), words)
        pathCount = 0
        flag = False
        for trainItem in trainItems:
            #list_pred_true_words_index_postid in clustring_term_online_stack=	trainItem
            trainTrue = int(trainItem[1])
            train_words = trainItem[2]
            trainPostId = trainItem[4]

            pathCount += 1

            if str(testTrue) == str(trainTrue):
                #grams=generateGramsConsucetive(words, min_gram, max_gram)
                #sortedGrams = list(sorted(grams, key = len, reverse=True))
                ProposedHitRank_val = int(
                    max(1, math.floor(pathCount / len(sortedGrams))))

                t12 = datetime.now()
                t_diff = t12 - t11
                #print(str(testpostId)+"\t"+str(trainPostId)+"\t0\t0\t0\t0\t"+str(ProposedHitRank_val)+"\t0\t"+str(t_diff.microseconds/1000000)+"\t"+str(testTrue))
                text_sim, commonCount = computeTextSimCommonWord_WordDic(
                    Counter(words), Counter(train_words), len(words),
                    len(train_words))

                trainDateTime = datetime.strptime(trainItem[5].split("t")[0],
                                                  "%Y-%m-%d")  #datetime.now()
                date_diff = trainDateTime - testDateTime
                date_diff = date_diff.days

                print(
                    str(testpostId) + "\t" + str(trainPostId) + "\t" +
                    str(text_sim) + "\t" + str(ProposedHitRank_val) + "\t" +
                    str(t_diff.microseconds / float(microDivide)) + "\t" +
                    str(testTrue) + "\t" + ' '.join(words) + "\t" +
                    ' '.join(train_words) + "\t" + str(trainDateTime) + "\t" +
                    str(testDateTime) + "\t" + str(date_diff))
                flag = True
                break

            if pathCount > max_hitindex:
                break

        if flag == False:
            '''grams=generateGramsConsucetive(words, min_gram, max_gram)		 
      sortedGrams = list(sorted(grams, key = len, reverse=True))

      flag=False  
      largestGram='' 
      ProposedHitRank=0  
       
      train_Items=aggregateTextItems(sortedGrams, dic_ngram__textItems)
      #print("len(train_Items)", len(train_Items) ) 
      for train_item in train_Items:
        ProposedHitRank+=1	
        
	  
        trainTruelabel=train_item[1]
        train_words=train_item[2]
        trainPostId=train_item[4]	

	  
        if str(trainTruelabel)==str(testTrue):
     
          t12=datetime.now()	  
          t_diff = t12-t11 	
	  
          text_sim, commonCount = computeTextSimCommonWord_WordDic(Counter(words), Counter(train_words), len(words), len(train_words) )	  
          ProposedHitRank_val=int(max(1,math.floor(ProposedHitRank/len(sortedGrams))))	  
      	
          trainDateTime= datetime.strptime(train_item[5].split("t")[0] ,"%Y-%m-%d") #datetime.now()
          date_diff=trainDateTime-testDateTime
          date_diff=date_diff.days      	  
	  
          print(str(testpostId)+"\t"+str(trainPostId)+"\t"+str(text_sim)+"\t"+str(ProposedHitRank_val)+"\t"+str(t_diff.microseconds/float(microDivide))+"\t"+str(testTrue)+"\t"+' '.join(words)+"\t"+' '.join(train_words)+"\t"+str(trainDateTime)+"\t"+str(testDateTime)+"\t"+str(date_diff)) 		
          flag=True		
          break 

        if ProposedHitRank > max_hitindex:
          break'''

            if flag == False:
                #print('not found', 'testTrue', testTrue, 'testwords', words,'postId', postId, 'pathCount', pathCount, 'len(targetClusterIds)', len(targetClusterIds))
                t12 = datetime.now()
                t_diff = t12 - t11
                #print(str(testpostId)+"\t"+"-100"+"\t0\t0\t0\t0\t-100"+"\t0\t"+str(t_diff.microseconds/1000000)+"\t"+str(testTrue))
                print(
                    str(testpostId) + "\t" + "-100" + "\t0\t" + str(-100) +
                    "\t" + str(t_diff.microseconds / float(microDivide)) +
                    "\t" + str(testTrue) + "\t" + ' '.join(words) + "\t" + "" +
                    "\t" + "" + "\t" + "" + "\t" + "")
Example No. 16
def test_cluster_bitermMapping(testList_pred_true_words_index_postid,
                               c_bitermsFreqs={},
                               c_totalBiterms={},
                               c_wordsFreqs={},
                               c_totalWords={},
                               c_txtIds={},
                               c_clusterVecs={},
                               txtId_txt={},
                               last_txtId=0,
                               max_c_id=0,
                               wordVectorsDic={},
                               dic_clus__id={},
                               dic_biterm__clusterIds={},
                               dic_word__clusterIds={},
                               dicTrain_pred__trues={}):
    #print("test_cluster_bitermMapping")

    eval_pred_true_txt = []

    line_count = 0

    print("testpostId" + "\t" + "trainPostId" +
          "\tTitleSim\tBodySim\tTagSim\tLuceneHitRank\t" + "ProposedHitRank" +
          "\tlucene_hit_duration\t" + "Proposed_hit_duration_micro" + "\t" +
          "Proposed_TestTrueLabel")

    for item in testList_pred_true_words_index_postid:
        t11 = datetime.now()
        pred = item[0]
        testTrue = int(item[1])
        words = item[2]
        postId = item[4]
        bi_terms = construct_biterms(words)
        #bi_terms=generateGramsConsucetive(words, minGSize, maxGSize)
        print(words, bi_terms, pred)

        current_txt_id = int(postId)

        line_count += 1

        txtBitermsFreqs = Counter(bi_terms)
        bi_terms_len = len(bi_terms)

        txtWordsFreqs = Counter(words)
        words_len = len(words)

        text_Vec = [0] * embedDim
        if isSemantic == True:
            X = generate_sent_vecs_toktextdata([words], wordVectorsDic,
                                               embedDim)
            text_Vec = X[0]

        #text->biterms
        #biterms->targetClusterIds
        #targetClusterIds->txtIds  by c_txtIds
        #txtIds->textItems	by txtId_txt

        targetClusterIds = findTargetClusters(txtBitermsFreqs,
                                              dic_biterm__clusterIds)

        print('len(targetClusterIds)', len(targetClusterIds))
        textIds = findTextIds(targetClusterIds, c_txtIds)
        print('len(textIds)', len(textIds))
        pathCount = 0
        flag = False
        for textId in textIds:
            trainItem = txtId_txt[textId]
            trainTrue = int(trainItem[1])
            trainPostId = trainItem[3]
            pathCount += 1

            if str(testTrue) == str(trainTrue):
                #print('found found', 'testTrue', testTrue, 'testwords', words,'postId', postId, 'pathCount', pathCount, 'len(targetClusterIds)', len(targetClusterIds))
                t12 = datetime.now()
                t_diff = t12 - t11
                print(
                    str(postId) + "\t" + str(trainPostId) + "\t0\t0\t0\t0\t" +
                    str(len(targetClusterIds)) + "\t0\t" +
                    str(t_diff.microseconds) + "\t" + str(testTrue))
                flag = True
                break

            if pathCount > max_hitindex:
                break

        if flag == False:
            '''targetClusterIds=findTargetClusters(txtWordsFreqs, dic_word__clusterIds)
            textIds=findTextIds(targetClusterIds, c_txtIds)
            pathCount=0
            flag=False
            for textId in textIds:
                trainItem = txtId_txt[textId]
                trainTrue=int(trainItem[1])
                trainPostId=trainItem[3]
                pathCount+=1
                if str(testTrue) == str(trainTrue):
                    #print('found found', 'testTrue', testTrue, 'testwords', words,'postId', postId, 'pathCount', pathCount, 'len(targetClusterIds)', len(targetClusterIds))
                    t12=datetime.now()
                    t_diff = t12-t11
                    print(str(postId)+"\t"+str(trainPostId)+"\t0\t0\t0\t0\t"+str(len(targetClusterIds))+"\t0\t"+str(t_diff.microseconds)+"\t"+str(testTrue))
                    flag=True
                    break'''

            if flag == False:
                #print('not found', 'testTrue', testTrue, 'testwords', words,'postId', postId, 'pathCount', pathCount, 'len(targetClusterIds)', len(targetClusterIds))
                t12 = datetime.now()
                t_diff = t12 - t11
                print(
                    str(postId) + "\t" + "-100" + "\t0\t0\t0\t0\t-100" +
                    "\t0\t" + str(t_diff.microseconds) + "\t" + str(testTrue))
def clusteringDCT(pred_true_txt_ind_prevPreds, wordVectorsDic, batchDocs,
                  maxPredLabel):
    print("#m-stream-cleaned")
    Evaluate(pred_true_txt_ind_prevPreds)

    pred_true_text_ind_prevPreds_to_cluster, pred_true_text_ind_prevPreds_to_not_cluster = extrcatLargeClusterItems(
        pred_true_txt_ind_prevPreds)
    print("3 rd=" + str(pred_true_text_ind_prevPreds_to_cluster[0][3]))
    print("4 rd=" + str(pred_true_text_ind_prevPreds_to_cluster[0][4]))
    '''minPredToC, maxPredToC, minTrueToC, maxTrueToC=findMinMaxLabel(pred_true_text_ind_prevPreds_to_cluster)
  print("minPred, maxPred, minTrue, maxTrue=(pred_true_text_ind_prevPreds_to_cluster)") 
  print(minPredToC, maxPredToC, minTrueToC, maxTrueToC)
  
  minPredToNC, maxPredToNC, minTrueToNC, maxTrueToNC=findMinMaxLabel(pred_true_text_ind_prevPreds_to_not_cluster)
  print("minPred, maxPred, minTrue, maxTrue=(pred_true_text_ind_prevPreds_to_not_cluster)") 
  print(minPredToNC, maxPredToNC, minTrueToNC, maxTrueToNC)'''

    all_pred_clusters = len(groupTxtByClass(pred_true_txt_ind_prevPreds,
                                            False))
    pred_clusters = len(
        groupTxtByClass(pred_true_text_ind_prevPreds_to_cluster, False))
    non_pred_clusters = len(
        groupTxtByClass(pred_true_text_ind_prevPreds_to_not_cluster, False))

    print("#clusters=" + str(pred_clusters))
    print("#not clusters=" + str(non_pred_clusters))
    print("this clustering with embedding DCT")
    pred_clusters = non_pred_clusters - pred_clusters
    print("#update clusters=" + str(pred_clusters))

    nparr = np.array(pred_true_text_ind_prevPreds_to_cluster)
    print("3 rd=" + str(pred_true_text_ind_prevPreds_to_cluster[0][3]))
    print("4 rd=" + str(pred_true_text_ind_prevPreds_to_cluster[0][4]))
    preds = list(nparr[:, 0])
    trues = list(nparr[:, 1])
    texts = list(nparr[:, 2])
    inds = list(nparr[:, 3])
    prevPreds = list(nparr[:, 4])

    skStopWords = getScikitLearn_StopWords()
    texts = processTextsRemoveStopWordTokenized(texts, skStopWords)
    '''dicDocFreq=getDocFreq(texts)
  dctCoffs=1
  X=generate_sent_vecs_toktextdata_DCT(texts, wordVectorsDic, 300,dctCoffs)  
  #vectorizer = TfidfVectorizer(tokenizer=stem_text,max_df=0.5,min_df=1)
  #vectorizer = TfidfVectorizer(max_df=0.5,min_df=2, stop_words='english')
  #X = vectorizer.fit_transform(texts)'''
    '''svd = TruncatedSVD(50)
  #svd = PCA(n_components=50)	
  normalizer = Normalizer(copy=False)
  lsa = make_pipeline(svd, normalizer)
  #X=X.toarray()	
  X = lsa.fit_transform(X)'''
    '''km = KMeans(n_clusters=pred_clusters, init='k-means++', max_iter=100,random_state=0)	
  km.fit(X)
  list_km_pred_true_text=combine_pred_true_txt_from_list(km.labels_, trues, texts)
  print("#k-means")	
  Evaluate(list_km_pred_true_text)'''
    '''ward = AgglomerativeClustering(n_clusters=pred_clusters, linkage='ward').fit(X)
  list_hr_pred_true_text=combine_pred_true_txt_from_list(ward.labels_, trues, texts)
  print("#hr-ward-DCT")
  print(min(ward.labels_), max(ward.labels_))
  pred_true_text_ind_prevPreds_to_not_cluster_hr=change_pred_label(pred_true_text_ind_prevPreds_to_not_cluster, pred_clusters+1)  
  Evaluate(list_hr_pred_true_text)
  Evaluate(list_hr_pred_true_text+pred_true_text_ind_prevPreds_to_not_cluster_hr)
  '''

    X = generate_sent_vecs_toktextdata(texts, wordVectorsDic, 300)
    ward = AgglomerativeClustering(n_clusters=pred_clusters,
                                   linkage='ward').fit(X)
    list_hr_pred_true_text_ind_prevPred = np.column_stack(
        (ward.labels_, trues, texts, inds, prevPreds)).tolist()
    print("#hr-ward-AVG")
    pred_true_text_ind_prevPreds_to_not_cluster_hr = change_pred_label(
        pred_true_text_ind_prevPreds_to_not_cluster, pred_clusters + 1)
    Evaluate(list_hr_pred_true_text_ind_prevPred)
    Evaluate(list_hr_pred_true_text_ind_prevPred +
             pred_true_text_ind_prevPreds_to_not_cluster_hr)
    #print_by_group(list_hr_pred_true_text+pred_true_text_ind_prevPreds_to_not_cluster_hr)

    print("#spectral-avg")
    clustering = SpectralClustering(n_clusters=pred_clusters,
                                    assign_labels="discretize",
                                    random_state=0).fit(X)
    list_sp_pred_true_text_ind_prevPred = np.column_stack(
        (clustering.labels_, trues, texts, inds, prevPreds)).tolist()
    pred_true_text_ind_prevPreds_to_not_cluster_spec = change_pred_label(
        pred_true_text_ind_prevPreds_to_not_cluster, pred_clusters + 1)
    Evaluate(list_sp_pred_true_text_ind_prevPred)
    Evaluate(list_sp_pred_true_text_ind_prevPred +
             pred_true_text_ind_prevPreds_to_not_cluster_spec)
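
clusteringDCT re-clusters only the items routed to large clusters and then shifts the predicted labels of the untouched items with change_pred_label(..., pred_clusters + 1), presumably so the two label spaces do not collide in the joint Evaluate call. A minimal sketch of that merge step, assuming change_pred_label simply adds an offset to the predicted label (an assumption, not the original helper), is:

import numpy as np
from sklearn.cluster import AgglomerativeClustering

def change_pred_label_sketch(items, offset):
    # shift the predicted label (first column) of each untouched item by offset
    return [[int(it[0]) + offset] + list(it[1:]) for it in items]

X = np.random.rand(6, 300)                    # stand-in for averaged word vectors
trues = [0, 0, 1, 1, 2, 2]
texts = ['t0', 't1', 't2', 't3', 't4', 't5']
k = 3
ward = AgglomerativeClustering(n_clusters=k, linkage='ward').fit(X)
reclustered = np.column_stack((ward.labels_, trues, texts)).tolist()
untouched = [['0', 7, 'small cluster item'], ['1', 8, 'small cluster item']]
merged = reclustered + change_pred_label_sketch(untouched, k + 1)
print(merged)   # ward labels stay in 0..k-1, untouched items start at k+1
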
Exemplo n.º 18
0
def test_cluster_bitermMapping_buffer_framework(
        list_CPost_test, c_CFVector, dic_txtId__CPost,
        dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds,
        dic_bitermBody__clusterIds, dic_ngram__txtIds, min_gram, max_gram,
        max_hitindex, oCSimilarityFlgas, wordVectorsDic):
    eval_pred_true_txt = []

    line_count = 0

    fileWrite = open(outfileName, 'w')

    fileWrite.write("testpostId" + "\t" + "trainPostId" + "\t" + "similarity" +
                    "\t" + "Proposed_hitrank" + "\t" +
                    "Proposed_hit_duration_micro" + "\t" +
                    "Proposed_TestTrueLabel" + "\t" + "testText" + "\t" +
                    "trainText" + "\t" + "testCreateTime" + "\t" +
                    "TrainCreateTime" + "\t" + "DaysDiff" + "\t" +
                    "OriginalRank" + "\n")
    print("testpostId" + "\t" + "trainPostId" + "\t" + "similarity" + "\t" +
          "Proposed_hitrank" + "\t" + "Proposed_hit_duration_micro" + "\t" +
          "Proposed_TestTrueLabel" + "\t" + "testText" + "\t" + "trainText" +
          "\t" + "testCreateTime" + "\t" + "TrainCreateTime" + "\t" +
          "DaysDiff" + "\t" + "OriginalRank")

    for oCPost in list_CPost_test:
        t11 = datetime.now()

        testTrue = oCPost.trueLabel
        tagWords = oCPost.tagWords
        titleWords = oCPost.titleWords
        bodyWords = oCPost.bodyWords
        # id = oCPost.id  # may not be useful for test
        testpostId = oCPost.soPostId
        testCreatetime = oCPost.createtime

        testWords = tagWords  # this can be changed

        txtBitermsFreqs_Tag = None
        bi_terms_len_Tag = 0
        grams_Tag = None

        txtBitermsFreqs_Title = None
        bi_terms_len_Title = 0
        grams_Title = None

        txtBitermsFreqs_Body = None
        bi_terms_len_Body = 0
        grams_Body = None

        text_VecTag = None
        text_VecTitle = None
        text_VecBody = None
        targetClusterIds = []
        grams = []

        line_count += 1

        # text->biterms
        # biterms->targetClusterIds
        # targetClusterIds->txtIds  by c_txtIds
        # txtIds->textItems	by txtId_txt

        if oCSimilarityFlgas.isTagSim:
            bi_termsTag = construct_biterms(tagWords)
            grams_Tag = generateGramsConsucetive(tagWords, min_gram, max_gram)
            grams.extend(grams_Tag)

            txtBitermsFreqs_Tag = Counter(bi_termsTag)
            bi_terms_len_Tag = len(bi_termsTag)
            tCIds = findTargetClusters(txtBitermsFreqs_Tag,
                                       dic_bitermTag__clusterIds)
            targetClusterIds.extend(tCIds)

            if isSemantic:
                X = generate_sent_vecs_toktextdata([tagWords], wordVectorsDic,
                                                   embedDim)
                text_VecTag = X[0]

        if oCSimilarityFlgas.isTitleSim:
            bi_termsTitle = construct_biterms(titleWords)
            grams_Title = generateGramsConsucetive(titleWords, min_gram,
                                                   max_gram)
            grams.extend(grams_Title)

            txtBitermsFreqs_Title = Counter(bi_termsTitle)
            bi_terms_len_Title = len(bi_termsTitle)
            tCIds = findTargetClusters(txtBitermsFreqs_Title,
                                       dic_bitermTitle__clusterIds)
            targetClusterIds.extend(tCIds)

            if isSemantic:
                X = generate_sent_vecs_toktextdata([titleWords],
                                                   wordVectorsDic, embedDim)
                text_VecTitle = X[0]

        if oCSimilarityFlgas.isBodySim:
            bi_termsBody = construct_biterms(bodyWords)
            grams_Body = generateGramsConsucetive(bodyWords, min_gram,
                                                  max_gram)
            grams.extend(grams_Body)

            txtBitermsFreqs_Body = Counter(bi_termsBody)
            bi_terms_len_Body = len(bi_termsBody)
            tCIds = findTargetClusters(txtBitermsFreqs_Body,
                                       dic_bitermBody__clusterIds)
            targetClusterIds.extend(tCIds)

            if isSemantic:
                X = generate_sent_vecs_toktextdata([bodyWords], wordVectorsDic,
                                                   embedDim)
                text_VecBody = X[0]

        oCPostProcessed = CPostProcessed(txtBitermsFreqs_Tag, bi_terms_len_Tag,
                                         txtBitermsFreqs_Title,
                                         bi_terms_len_Title,
                                         txtBitermsFreqs_Body,
                                         bi_terms_len_Body, text_VecTag,
                                         text_VecTitle, text_VecBody)

        targetClusterIds = set(targetClusterIds)
        closeClusterIds = findCloseClustersIds_framework(
            oCPostProcessed, targetClusterIds, c_CFVector, oCSimilarityFlgas)
        train_cluster_CPosts = findTextItems_framework(closeClusterIds,
                                                       c_CFVector,
                                                       dic_txtId__CPost)
        # train_cluster_CPosts = filterTextItems_framework(train_cluster_CPosts, oCSimilarityFlgas, oCPostProcessed)

        sortedGrams = list(sorted(grams, key=len, reverse=True))
        train_gram_CPosts = aggregateTextItems_framework(
            sortedGrams, dic_ngram__txtIds, dic_txtId__CPost)

        train_gram_CPosts.extend(train_cluster_CPosts)

        # train_Items.extend(trainItems)

        # print('len(train_gram_CPosts)', len(train_gram_CPosts), 'len(targetClusterIds)', len(targetClusterIds))
        pathCount = 0
        flag = False
        for trainCPost in train_gram_CPosts:
            trainTrue = int(str(trainCPost.trueLabel))
            train_words = trainCPost.tagWords  # this can be changed
            trainPostId = trainCPost.soPostId
            trainCreateTime = trainCPost.createtime

            pathCount += 1

            if str(testTrue) == str(trainTrue):
                ProposedHitRank_val = int(
                    max(1, math.floor(pathCount / len(sortedGrams))))

                t12 = datetime.now()
                t_diff = t12 - t11
                text_sim, commonCount = computeTextSimCommonWord_WordDic(
                    Counter(testWords), Counter(train_words), len(testWords),
                    len(train_words))

                date_diff = trainCreateTime - testCreatetime
                date_diff = date_diff.days

                # "testpostId" + "\t" + "trainPostId" + "\t" + "simtype" + "\t" + "hitranktype" + "\t" + "Proposed_hit_duration_micro" + "\t" + "Proposed_TestTrueLabel" + "\t" + "testText" + "\t" + "trainText" + "\t" + "testCreateTime" + "\t" + "TrainCreateTime" + "\t" + "DaysDiff" + "\t" + "OriginalRank"
                print(
                    str(testpostId) + "\t" + str(trainPostId) + "\t" +
                    str(text_sim) + "\t" + str(ProposedHitRank_val) + "\t" +
                    str(t_diff.microseconds / float(microDivide)) + "\t" +
                    str(testTrue) + "\t" + ' '.join(testWords) + "\t" +
                    ' '.join(train_words) + "\t" + str(testCreatetime) + "\t" +
                    str(trainCreateTime) + "\t" + str(date_diff) + "\t" +
                    str(pathCount))

                fileWrite.write(
                    str(testpostId) + "\t" + str(trainPostId) + "\t" +
                    str(text_sim) + "\t" + str(ProposedHitRank_val) + "\t" +
                    str(t_diff.microseconds / float(microDivide)) + "\t" +
                    str(testTrue) + "\t" + ' '.join(testWords) + "\t" +
                    ' '.join(train_words) + "\t" + str(testCreatetime) + "\t" +
                    str(trainCreateTime) + "\t" + str(date_diff) + "\t" +
                    str(pathCount) + "\n")

                flag = True
                break

            if pathCount > max_hitindex:
                break

        if not flag:
            t12 = datetime.now()
            t_diff = t12 - t11
            print(
                str(testpostId) + "\t" + "-100" + "\t0\t" + str(-100) + "\t" +
                str(t_diff.microseconds / float(microDivide)) + "\t" +
                str(testTrue) + "\t" + ' '.join(testWords) + "\t" + "" + "\t" +
                "" + "\t" + "" + "\t" + "" + "\t" + "")

            fileWrite.write(
                str(testpostId) + "\t" + "-100" + "\t0\t" + str(-100) + "\t" +
                str(t_diff.microseconds / float(microDivide)) + "\t" +
                str(testTrue) + "\t" + ' '.join(testWords) + "\t" + "" + "\t" +
                "" + "\t" + "" + "\t" + "" + "\t" + "" + "\n")

    fileWrite.close()
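
Two small pieces of the loop above are easy to exercise in isolation: the hit-rank normalisation int(max(1, math.floor(pathCount / len(sortedGrams)))) and the common-word text similarity. The sketch below uses stand-in names; common_word_sim is only an assumed approximation of computeTextSimCommonWord_WordDic, not its actual definition.

import math
from collections import Counter

def proposed_hit_rank(path_count, num_grams):
    # scan position divided by the number of n-grams tried, floored, clamped to 1
    return int(max(1, math.floor(path_count / num_grams)))

def common_word_sim(words_a, words_b):
    # overlap of the two bags of words, normalised by the longer text
    common = sum((Counter(words_a) & Counter(words_b)).values())
    return common / float(max(len(words_a), len(words_b), 1)), common

print(proposed_hit_rank(1, 4))                                  # 1 (clamped)
print(proposed_hit_rank(9, 4))                                  # 2
print(common_word_sim(['a', 'b', 'c'], ['a', 'c', 'd', 'e']))   # (0.5, 2)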