def cluster_gram_freq(list_docs, minGSize, maxGSize):
    """Cluster one batch of documents through the n-grams they contain.

    Each document's ``text`` is expanded with
    ``generateGramsConsucetive(text, minGSize, maxGSize)`` and an inverted
    index ``gram -> [documents]`` is built from the result.  The index is
    then handed, in order, to ``filterGrams`` (threshold: the second value
    reported by ``populateNgramStatistics``), ``removeCommonDocs`` and
    ``evaluateByGramUsingDic``.

    Progress figures are printed after each stage.

    Returns:
        Whatever ``evaluateByGramUsingDic`` returns for the final index
        (bound to ``dic_docId__cluster`` in the original code).
    """
    gram_index = {}
    for doc in list_docs:
        for ngram in generateGramsConsucetive(doc.text, minGSize, maxGSize):
            if ngram in gram_index:
                gram_index[ngram].append(doc)
            else:
                gram_index[ngram] = [doc]

    gram_std, gram_mean, gram_max, gram_min = populateNgramStatistics(
        gram_index, 1)
    print('gram_std, gram_mean, gram_max, gram_min', gram_std, gram_mean,
          gram_max, gram_min, 'before len(dic_ngram__docs)', len(gram_index))

    # The std-dev weight is currently zero, so the threshold is just the mean;
    # the term is kept so the weight stays easy to tune.
    size_threshold = gram_mean + 0 * gram_std

    kept_grams = filterGrams(gram_index, size_threshold)
    print('after len(dic_filtered_ngram__docs)', len(kept_grams))

    deduplicated = removeCommonDocs(kept_grams)
    print('after dic_removed_common__docs', len(deduplicated))
    print('###total docs in batch=', len(list_docs))

    return evaluateByGramUsingDic(deduplicated)
# Exemplo n.º 2
# 0
def buildNGramIndex(list_pred_true_words_index_postid_createtime):
    """Add a batch of text items to the module-level n-gram indexes.

    For each item, ``item[2]`` is the word list and ``item[3]`` the text id.
    The function updates these module-level dictionaries in place:

    * ``dic_txtId__text``   : text id -> full item
    * ``dic_ngram__txtIds`` : gram -> list of text ids containing it
    * ``dic_txtId__vec``    : text id -> sentence vector (semantic mode only)
    * ``dic_ngram__center`` : gram -> element-wise sum of the vectors of all
      texts containing the gram (semantic mode only)

    Returns nothing.
    """
    for record in list_pred_true_words_index_postid_createtime:
        tokens, text_key = record[2], record[3]
        use_vectors = isSemantic == True  # noqa: E712 - same test as before

        sentence_vec = None
        if use_vectors:
            if text_key in dic_txtId__vec:
                # Reuse the cached vector for a text id seen earlier.
                sentence_vec = dic_txtId__vec[text_key]
            else:
                sentence_vec = generate_sent_vecs_toktextdata(
                    [tokens], wordVectorsDic, embedDim)[0]
            dic_txtId__vec[text_key] = sentence_vec

        dic_txtId__text[text_key] = record

        for ngram in generateGramsConsucetive(tokens, min_gram, max_gram):
            dic_ngram__txtIds.setdefault(ngram, []).append(text_key)

            if not use_vectors:
                continue

            if ngram not in dic_ngram__center:
                dic_ngram__center[ngram] = sentence_vec
            else:
                # Accumulate into a new list so the stored sentence vector
                # itself is never modified.
                running_sum = dic_ngram__center[ngram]
                dic_ngram__center[ngram] = list(
                    map(add, running_sum, sentence_vec))
def buildNGramIndex(list_pred_true_words_index_postid):
    """Register a batch of text items in the module-level n-gram indexes.

    ``item[2]`` is read as the word list and ``item[3]`` as the text id.
    ``dic_txtId__text`` receives ``text id -> item`` and
    ``dic_ngram__txtIds`` receives ``gram -> [text ids]``; both are
    module-level dictionaries updated in place.  Returns nothing.
    """
    for record in list_pred_true_words_index_postid:
        tokens, text_key = record[2], record[3]
        dic_txtId__text[text_key] = record

        for ngram in generateGramsConsucetive(tokens, min_gram, max_gram):
            if ngram in dic_ngram__txtIds:
                dic_ngram__txtIds[ngram].append(text_key)
            else:
                dic_ngram__txtIds[ngram] = [text_key]
def buildNGramIndex(list_pred_true_words_index_postid_createtime):
    """Build fresh n-gram lookup tables for a batch of text items.

    ``item[2]`` is read as the word list and ``item[3]`` as the text id
    (index).  Grams come from
    ``generateGramsConsucetive(words, min_gram, max_gram)`` using the
    module-level ``min_gram`` / ``max_gram``.

    Returns:
        ``[gram_to_ids, id_to_item]`` where ``gram_to_ids`` maps each gram to
        the list of text ids that produced it (in input order, duplicates
        kept) and ``id_to_item`` maps each text id to its full item.
    """
    gram_to_ids = {}
    id_to_item = {}

    for record in list_pred_true_words_index_postid_createtime:
        tokens, text_key = record[2], record[3]
        id_to_item[text_key] = record

        for ngram in generateGramsConsucetive(tokens, min_gram, max_gram):
            if ngram in gram_to_ids:
                gram_to_ids[ngram].append(text_key)
            else:
                gram_to_ids[ngram] = [text_key]

    return [gram_to_ids, id_to_item]
def gramClusterToFeatures(dic_gram__txtIds, dic_txtId__text):
    """Summarise each gram-keyed cluster by the grams of its member texts.

    Args:
        dic_gram__txtIds: cluster id (a gram) -> iterable of member text ids.
        dic_txtId__text: text id -> item; ``item[2]`` is read as the words.

    Returns:
        ``[term_to_clusters, cluster_features, cluster_sizes]``:

        * ``term_to_clusters``: feature gram -> list of cluster ids, with one
          entry appended per occurrence (duplicates are kept).
        * ``cluster_features``: cluster id -> ``Counter`` of feature grams
          pooled over all member texts.
        * ``cluster_sizes``: cluster id -> total number of pooled grams.
    """
    term_to_clusters = {}
    cluster_features = {}
    cluster_sizes = {}

    for cluster_key, member_ids in dic_gram__txtIds.items():
        pooled = []
        for text_key in member_ids:
            tokens = dic_txtId__text[text_key][2]
            text_ftrs = generateGramsConsucetive(tokens, min_gram, max_gram)
            pooled.extend(text_ftrs)
            for ftr in text_ftrs:
                if ftr in term_to_clusters:
                    term_to_clusters[ftr].append(cluster_key)
                else:
                    term_to_clusters[ftr] = [cluster_key]

        cluster_features[cluster_key] = Counter(pooled)
        cluster_sizes[cluster_key] = len(pooled)

    return [term_to_clusters, cluster_features, cluster_sizes]
# Script fragment: score every test post against the gram clusters it shares
# at least one gram with.  It relies on names defined elsewhere in the file
# (lang, isStopWord, textType, tagIgnore, min_gram, max_gram,
# dic_term_clusterGramIds, dic_cluster_ftrs, dic_cluster_size).
testfile = 'test_stackoverflow_' + lang + '_true_id_title_tags_body_createtime'
testList_pred_true_words_index_postid_createtime = readStackOverflowDataSetBody(
    testfile, isStopWord, 6, textType, tagIgnore)
count = 0  # number of test items processed so far
for item in testList_pred_true_words_index_postid_createtime:
    # Item layout as used here: [1] true label, [2] words, [4] post id,
    # [5] creation time.
    testTruelabel = item[1]
    words = item[2]
    testpostId = item[4]
    testCreateTime = item[5]

    # Keeps only the text before the first lowercase "t" and parses it as a
    # date.  NOTE(review): presumably the timestamps are lower-cased ISO
    # strings ("2015-01-02t10:00:00") -- confirm against the data reader.
    testDateTime = datetime.strptime(str(item[5]).split("t")[0], "%Y-%m-%d")

    # Start of the per-item timer (not read again in the visible code).
    t11 = datetime.now()

    test_grams = generateGramsConsucetive(words, min_gram,
                                          max_gram)  # len(words))
    test_term_dict = Counter(test_grams)  # gram -> frequency in this test text
    test_term_size = len(test_grams)

    # Candidate clusters are looked up from the test grams through the
    # gram -> cluster-id index.
    tagetGramClusterIds = getTagetGramClusterIds(test_grams,
                                                 dic_term_clusterGramIds)
    count += 1
    print('count', count, 'len(tagetGramClusterIds)', len(tagetGramClusterIds))

    # NOTE(review): dict_cluster_sims is never filled and sim / commCount are
    # never used in the visible part of this loop -- the snippet looks cut off.
    dict_cluster_sims = {}
    for gramClusterId in tagetGramClusterIds:
        # print('clusterId', clusterId, 'len(dic_tupple_class[clusterId])', len(dic_tupple_class[clusterId]))
        sim, commCount = computeTextSimCommonWord_WordDic(
            test_term_dict, dic_cluster_ftrs[gramClusterId], test_term_size,
            dic_cluster_size[gramClusterId])
def _prune_biterm_clusters(line_count, c_bitermsFreqs, c_totalBiterms,
                           c_txtIds, c_wordsFreqs, c_totalWords,
                           c_clusterVecs, dic_clus__id):
    """Drop clusters whose size or id is an outlier; mutates the dicts in place.

    A cluster is dropped when its size is <= 1, or lies at least one standard
    deviation away from the mean size on either side, or when its id fails
    the analogous test against the statistics of ``dic_clus__id`` values.

    NOTE(review): the statistics stay at 0 when there are fewer than three
    clusters, in which case the size test ``c_size >= 0`` matches every
    cluster.  This mirrors the original behaviour -- confirm it is intended.
    """
    print(len(dic_clus__id))

    list_c_sizes = [len(txtIds) for txtIds in c_txtIds.values()]
    list_c_ids = [dic_clus__id[c_id] for c_id in c_txtIds]

    mean_c_size = 0
    std_c_size = 0
    if len(list_c_sizes) > 2:
        mean_c_size = statistics.mean(list_c_sizes)
        std_c_size = statistics.stdev(list_c_sizes)

    mean_c_id = 0
    std_c_id = 0
    if len(list_c_ids) > 2:
        mean_c_id = statistics.mean(list_c_ids)
        std_c_id = statistics.stdev(list_c_ids)

    print('preocess', line_count, 'texts', 'mean_c_size', mean_c_size,
          'std_c_size', std_c_size)
    print('preocess', line_count, 'texts', 'mean_c_id', mean_c_id,
          'std_c_id', std_c_id)

    doomed = set()
    for c_id, txtIds in c_txtIds.items():
        c_size = len(txtIds)
        # Same short-circuit order as before: size tests first, id tests last.
        if (c_size <= 1
                or float(c_size) <= float(abs(mean_c_size - std_c_size))
                or float(c_size) >= mean_c_size + std_c_size
                or float(c_id) <= float(abs(mean_c_id - std_c_id))
                or float(c_id) >= float(abs(mean_c_id + std_c_id))):
            doomed.add(c_id)

    print('#list_del_cids', len(doomed), 'len(c_bitermsFreqs)',
          len(c_bitermsFreqs))

    for c_id in doomed:
        for table in (c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs,
                      c_totalWords, dic_clus__id):
            table.pop(c_id, None)
        if isSemantic == True:  # noqa: E712 - same test as the rest of the file
            # pop() rather than del: a cluster may have no stored vector.
            c_clusterVecs.pop(c_id, None)


def cluster_biterm(f,
                   list_pred_true_words_index_postid_createtime,
                   c_bitermsFreqs=None,
                   c_totalBiterms=None,
                   c_wordsFreqs=None,
                   c_totalWords=None,
                   c_txtIds=None,
                   c_clusterVecs=None,
                   txtId_txt=None,
                   last_txtId=0,
                   max_c_id=0,
                   wordVectorsDic=None,
                   dic_clus__id=None,
                   dic_biterm__clusterId_Freq=None,
                   dic_biterm__allClusterFreq=None,
                   dic_biterm__clusterIds=None,
                   c_textItems=None,
                   dic_ngram__textItems=None,
                   min_gram=1,
                   max_gram=2,
                   isTagSim=True,
                   isTitleSim=False,
                   isBodySim=False):
    """Assign a batch of texts to biterm clusters, one text at a time.

    For every item the function builds biterms and n-grams from the words,
    asks ``findCloseClusterByTargetClusters`` for a cluster id, folds the text
    into that cluster through ``populateClusterFeature`` and writes one
    tab-separated line ``clusterId, trueLabel, words, postId`` to ``f``.
    Every 500 texts outlier clusters are pruned; every 1000 texts
    ``Evaluate`` is called on the predictions so far.

    Args:
        f: writable text stream receiving the prediction lines.
        list_pred_true_words_index_postid_createtime: items where ``[1]`` is
            the true label, ``[2]`` the word list, ``[3]`` a numeric text id
            and ``[4]`` the post id.
        c_bitermsFreqs ... dic_ngram__textItems: clustering state carried
            between batches.  ``None`` (the default) starts from a fresh,
            empty dictionary; passed-in dictionaries are updated in place.
        last_txtId: text id of the last processed item of a previous batch.
        max_c_id: largest cluster id seen so far.
        min_gram, max_gram: n-gram size bounds for the gram -> items index.
        isTagSim, isTitleSim, isBodySim: accepted but not used in this body.

    Returns:
        ``[c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords,
        c_txtIds, c_clusterVecs, txtId_txt, last_txtId, dic_clus__id,
        dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq,
        dic_biterm__clusterIds, c_textItems, dic_ngram__textItems]``.
        For an empty batch the state is returned unchanged.
    """
    # Mutable defaults would be shared by every call that omits the argument,
    # so state would leak between unrelated runs; create fresh dicts instead.
    c_bitermsFreqs = {} if c_bitermsFreqs is None else c_bitermsFreqs
    c_totalBiterms = {} if c_totalBiterms is None else c_totalBiterms
    c_wordsFreqs = {} if c_wordsFreqs is None else c_wordsFreqs
    c_totalWords = {} if c_totalWords is None else c_totalWords
    c_txtIds = {} if c_txtIds is None else c_txtIds
    c_clusterVecs = {} if c_clusterVecs is None else c_clusterVecs
    txtId_txt = {} if txtId_txt is None else txtId_txt
    wordVectorsDic = {} if wordVectorsDic is None else wordVectorsDic
    dic_clus__id = {} if dic_clus__id is None else dic_clus__id
    if dic_biterm__clusterId_Freq is None:
        dic_biterm__clusterId_Freq = {}
    if dic_biterm__allClusterFreq is None:
        dic_biterm__allClusterFreq = {}
    if dic_biterm__clusterIds is None:
        dic_biterm__clusterIds = {}
    c_textItems = {} if c_textItems is None else c_textItems
    if dic_ngram__textItems is None:
        dic_ngram__textItems = {}

    print("cluster_bigram")

    eval_pred_true_txt = []
    line_count = 0
    t11 = datetime.now()

    for item in list_pred_true_words_index_postid_createtime:
        words = item[2]
        current_txt_id = int(item[3])
        postId = item[4]

        bi_terms = construct_biterms(words)
        grams = generateGramsConsucetive(words, min_gram, max_gram)
        for gram in grams:
            dic_ngram__textItems.setdefault(gram, []).append(item)

        line_count += 1

        txtBitermsFreqs = Counter(bi_terms)
        bi_terms_len = len(bi_terms)
        txtWordsFreqs = Counter(words)
        words_len = len(words)

        # Zero vector unless semantic mode supplies a real sentence vector.
        text_Vec = [0] * embedDim
        if isSemantic == True:  # noqa: E712 - same test as the rest of the file
            text_Vec = generate_sent_vecs_toktextdata(
                [words], wordVectorsDic, embedDim)[0]

        targetClusterIds = findTargetClusters(txtBitermsFreqs,
                                              dic_biterm__clusterIds)
        clusterId = findCloseClusterByTargetClusters(
            c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs,
            c_totalWords, c_clusterVecs, txtBitermsFreqs, bi_terms_len,
            txtWordsFreqs, words_len, max_c_id, text_Vec,
            dic_biterm__clusterIds, targetClusterIds)

        c_textItems.setdefault(clusterId, []).append(item)

        max_c_id = max([max_c_id, clusterId, len(c_bitermsFreqs)])
        dic_clus__id[clusterId] = max_c_id
        txtId_txt[current_txt_id] = words
        # Tracked per item so an empty batch keeps the caller's value instead
        # of hitting an unbound local after the loop.
        last_txtId = current_txt_id

        (c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords,
         c_clusterVecs, dic_biterm__clusterId_Freq,
         dic_biterm__allClusterFreq,
         dic_biterm__clusterIds) = populateClusterFeature(
             c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs,
             c_totalWords, c_clusterVecs, txtBitermsFreqs, bi_terms_len,
             txtWordsFreqs, words_len, clusterId, current_txt_id, text_Vec,
             dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq,
             dic_biterm__clusterIds)

        eval_pred_true_txt.append([clusterId, item[1], item[2]])

        # With ignoreMinusOne set, items labelled -1 are left out of the file.
        if ignoreMinusOne != True or str(item[1]) != '-1':  # noqa: E712
            f.write(
                str(clusterId) + "\t" + str(item[1]) + "\t" +
                ' '.join(item[2]) + "\t" + str(postId) + "\n")

        if line_count % 500 == 0:
            _prune_biterm_clusters(line_count, c_bitermsFreqs, c_totalBiterms,
                                   c_txtIds, c_wordsFreqs, c_totalWords,
                                   c_clusterVecs, dic_clus__id)

        if line_count % 1000 == 0:
            print('#######-personal-eval_pred_true_txt',
                  len(eval_pred_true_txt))
            Evaluate(eval_pred_true_txt, ignoreMinusOne)

            t_diff = datetime.now() - t11
            print("total time diff secs=", t_diff.seconds)

    return [
        c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds,
        c_clusterVecs, txtId_txt, last_txtId, dic_clus__id,
        dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq,
        dic_biterm__clusterIds, c_textItems, dic_ngram__textItems
    ]
def _collect_field_features(field_words, post_id, biterm__clusterIds,
                            dic_ngram__txtIds, min_gram, max_gram,
                            wordVectorsDic, targetClusterIds):
    """Process one text field (tag, title or body) of a training post.

    Registers the field's n-grams in ``dic_ngram__txtIds`` (skipping grams
    that already list more than ``max_cposts`` distinct posts), extends
    ``targetClusterIds`` with the clusters found for the field's biterms, and
    returns ``(biterm Counter, number of biterms, sentence vector)``; the
    vector is ``None`` unless ``isSemantic`` is truthy.
    """
    bi_terms = construct_biterms(field_words)

    for gram in generateGramsConsucetive(field_words, min_gram, max_gram):
        if gram in dic_ngram__txtIds and len(
                set(dic_ngram__txtIds[gram])) > max_cposts:
            continue
        dic_ngram__txtIds.setdefault(gram, []).append(post_id)

    biterm_freqs = Counter(bi_terms)
    targetClusterIds.extend(
        findTargetClusters(biterm_freqs, biterm__clusterIds))

    field_vec = None
    if isSemantic:
        field_vec = generate_sent_vecs_toktextdata([field_words],
                                                   wordVectorsDic,
                                                   embedDim)[0]

    return biterm_freqs, len(bi_terms), field_vec


def cluster_biterm_framework(
        f, list_CPost, c_CFVector, max_c_id, dic_txtId__CPost, wordVectorsDic,
        dic_clus__id, dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds,
        dic_bitermBody__clusterIds, dic_ngram__txtIds, min_gram, max_gram,
        oCSimilarityFlgas, c_itemsCount):
    """Cluster a batch of posts using tag, title and/or body biterms.

    For every post the enabled fields (``oCSimilarityFlgas.isTagSim`` /
    ``isTitleSim`` / ``isBodySim``) are converted to biterm counts and, in
    semantic mode, sentence vectors.  A cluster id is chosen by
    ``findCloseClusterByTargetClusters_framework`` and the post is folded
    into the cluster state by ``populateClusterFeature_framework``.  One
    line ``clusterId, trueLabel, tag words, soPostId`` (tab-separated) is
    written to ``f`` per post, except for posts labelled ``-1`` when
    ``ignoreMinusOne`` is truthy.  Old clusters are removed every
    ``DeleteInterval`` posts and ``Evaluate`` is called every 1000 posts.

    Args:
        f: writable text stream for the prediction lines.
        list_CPost: posts exposing ``trueLabel``, ``tagWords``,
            ``titleWords``, ``bodyWords``, ``id`` and ``soPostId``.
        c_CFVector, dic_txtId__CPost, dic_clus__id, dic_biterm*__clusterIds,
        dic_ngram__txtIds, c_itemsCount: clustering state, updated and
            returned.
        max_c_id: largest cluster id seen so far.
        wordVectorsDic: word -> vector table used in semantic mode.
        min_gram, max_gram: n-gram size bounds for the gram index.
        oCSimilarityFlgas: object carrying the three ``is*Sim`` switches.

    Returns:
        ``[c_CFVector, max_c_id, dic_txtId__CPost, dic_clus__id,
        dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds,
        dic_bitermBody__clusterIds, dic_ngram__txtIds, c_itemsCount]``.
    """
    eval_pred_true_txt = []
    line_count = 0

    for oCPost in list_CPost:
        trueLabel = oCPost.trueLabel
        tagWords = oCPost.tagWords
        titleWords = oCPost.titleWords
        bodyWords = oCPost.bodyWords
        post_id = oCPost.id  # local name avoids shadowing the builtin id()
        soPostId = oCPost.soPostId

        print('id', post_id, 'tagWords', tagWords, 'titleWords', titleWords,
              'bodyWords', bodyWords)

        dic_txtId__CPost[post_id] = oCPost

        targetClusterIds = []

        # Disabled fields keep the (None, 0, None) placeholders.
        tag_freqs, tag_len, tag_vec = None, 0, None
        if oCSimilarityFlgas.isTagSim:
            tag_freqs, tag_len, tag_vec = _collect_field_features(
                tagWords, post_id, dic_bitermTag__clusterIds,
                dic_ngram__txtIds, min_gram, max_gram, wordVectorsDic,
                targetClusterIds)

        title_freqs, title_len, title_vec = None, 0, None
        if oCSimilarityFlgas.isTitleSim:
            title_freqs, title_len, title_vec = _collect_field_features(
                titleWords, post_id, dic_bitermTitle__clusterIds,
                dic_ngram__txtIds, min_gram, max_gram, wordVectorsDic,
                targetClusterIds)

        body_freqs, body_len, body_vec = None, 0, None
        if oCSimilarityFlgas.isBodySim:
            body_freqs, body_len, body_vec = _collect_field_features(
                bodyWords, post_id, dic_bitermBody__clusterIds,
                dic_ngram__txtIds, min_gram, max_gram, wordVectorsDic,
                targetClusterIds)

        oCPostProcessed = CPostProcessed(tag_freqs, tag_len, title_freqs,
                                         title_len, body_freqs, body_len,
                                         tag_vec, title_vec, body_vec)

        clusterId = findCloseClusterByTargetClusters_framework(
            c_CFVector, oCPostProcessed, set(targetClusterIds), max_c_id,
            oCSimilarityFlgas)

        # With ignoreMinusOne set, posts labelled -1 are left out of the file.
        if not ignoreMinusOne or str(trueLabel) != '-1':
            f.write(
                str(clusterId) + "\t" + str(trueLabel) + "\t" +
                ' '.join(tagWords) + "\t" + str(soPostId) + "\n")

        eval_pred_true_txt.append([clusterId, trueLabel, tagWords])

        c_itemsCount[clusterId] = c_itemsCount.get(clusterId, 0) + 1

        max_c_id = max([max_c_id, clusterId, len(c_CFVector)])
        dic_clus__id[clusterId] = max_c_id

        (c_CFVector, dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds,
         dic_bitermBody__clusterIds) = populateClusterFeature_framework(
             c_CFVector, oCPostProcessed, dic_bitermTag__clusterIds,
             dic_bitermTitle__clusterIds, dic_bitermBody__clusterIds,
             clusterId, post_id, oCSimilarityFlgas)

        line_count += 1

        if line_count % DeleteInterval == 0:
            c_CFVector, c_itemsCount = deleteOldClusters_framework(
                c_CFVector, c_itemsCount, dic_clus__id)

        if line_count % 1000 == 0:
            Evaluate(eval_pred_true_txt, ignoreMinusOne)

    return [
        c_CFVector, max_c_id, dic_txtId__CPost, dic_clus__id,
        dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds,
        dic_bitermBody__clusterIds, dic_ngram__txtIds, c_itemsCount
    ]
# Exemplo n.º 9
# 0
def test_cluster_bitermMapping_buffer(
        testList_pred_true_words_index_postid_createtime,
        c_bitermsFreqs=None,
        c_totalBiterms=None,
        c_wordsFreqs=None,
        c_totalWords=None,
        c_txtIds=None,
        c_clusterVecs=None,
        txtId_txt=None,
        last_txtId=0,
        max_c_id=0,
        wordVectorsDic=None,
        dic_clus__id=None,
        dic_biterm__clusterIds=None,
        c_textItems=None,
        dic_ngram__textItems=None,
        min_gram=1,
        max_gram=2,
        max_hitindex=10000):
    """Print, for each test text, the first stored text with the same label.

    Candidate training items are gathered from the clusters that share a
    biterm with the test text (``findTargetClusters`` / ``findTextItems``)
    followed by the items that share an n-gram with it
    (``aggregateTextItems``).  The candidates are scanned in that order and
    one tab-separated row is printed per test text: a hit row for the first
    candidate whose true label matches, otherwise a ``-100`` placeholder row.
    The scan stops once more than ``max_hitindex`` candidates were examined.

    Args:
        testList_pred_true_words_index_postid_createtime: items where ``[1]``
            is the (integer-like) true label, ``[2]`` the word list, ``[4]``
            the post id and ``[5]`` the creation-time string.
        dic_biterm__clusterIds, c_textItems, dic_ngram__textItems: lookup
            tables produced by training; ``None`` means empty.
        min_gram, max_gram: n-gram size bounds.
        max_hitindex: maximum number of candidates examined per test text.
        Remaining parameters are accepted for call compatibility and are not
        used by this function.

    Returns:
        None; the result rows are written to standard output.
    """
    # Avoid mutable default arguments: only the three tables read below need
    # a real container.
    if dic_biterm__clusterIds is None:
        dic_biterm__clusterIds = {}
    if c_textItems is None:
        c_textItems = {}
    if dic_ngram__textItems is None:
        dic_ngram__textItems = {}

    def elapsed_field(started):
        # timedelta.microseconds is only the sub-second component; fold days
        # and seconds in so look-ups longer than one second are not truncated.
        spent = datetime.now() - started
        micros = ((spent.days * 86400 + spent.seconds) * 10**6 +
                  spent.microseconds)
        return str(micros / float(microDivide))

    print("\t".join([
        "testpostId", "trainPostId", "simtype", "hitranktype",
        "Proposed_hit_duration_micro", "Proposed_TestTrueLabel", "testText",
        "trainText", "testCreateTime", "TrainCreateTime", "DaysDiff"
    ]))

    for item in testList_pred_true_words_index_postid_createtime:
        t11 = datetime.now()
        testTrue = int(item[1])
        words = item[2]
        testpostId = item[4]
        # Only the part before the first lowercase "t" is parsed.
        # NOTE(review): presumably lower-cased ISO timestamps -- confirm.
        testDateTime = datetime.strptime(item[5].split("t")[0], "%Y-%m-%d")

        txtBitermsFreqs = Counter(construct_biterms(words))

        # text -> biterms -> target clusters -> training items of the clusters
        targetClusterIds = findTargetClusters(txtBitermsFreqs,
                                              dic_biterm__clusterIds)
        trainItems = findTextItems(targetClusterIds, c_textItems)

        # ... followed by the training items that share an n-gram, longest
        # grams first.
        grams = generateGramsConsucetive(words, min_gram, max_gram)
        sortedGrams = sorted(grams, key=len, reverse=True)
        trainItems.extend(aggregateTextItems(sortedGrams,
                                             dic_ngram__textItems))

        found = False
        pathCount = 0
        for trainItem in trainItems:
            trainTrue = int(trainItem[1])
            train_words = trainItem[2]
            trainPostId = trainItem[4]

            pathCount += 1

            if str(testTrue) == str(trainTrue):
                ProposedHitRank_val = int(
                    max(1, math.floor(pathCount / len(sortedGrams))))

                # Duration is taken before the similarity is computed.
                duration = elapsed_field(t11)
                text_sim, commonCount = computeTextSimCommonWord_WordDic(
                    Counter(words), Counter(train_words), len(words),
                    len(train_words))

                trainDateTime = datetime.strptime(trainItem[5].split("t")[0],
                                                  "%Y-%m-%d")
                date_diff = (trainDateTime - testDateTime).days

                # Column order follows the header: test date, then train date.
                print("\t".join([
                    str(testpostId), str(trainPostId), str(text_sim),
                    str(ProposedHitRank_val), duration, str(testTrue),
                    ' '.join(words), ' '.join(train_words),
                    str(testDateTime), str(trainDateTime), str(date_diff)
                ]))
                found = True
                break

            if pathCount > max_hitindex:
                break

        if not found:
            # Placeholder row: -100 marks "no matching training item".
            print("\t".join([
                str(testpostId), "-100", "0", str(-100), elapsed_field(t11),
                str(testTrue), ' '.join(words), "", "", "", ""
            ]))
# Exemplo n.º 10
# 0
def _prepare_test_field(field_words, biterm__clusterIds, min_gram, max_gram,
                        wordVectorsDic, grams, targetClusterIds):
    """Process one text field (tag, title or body) of a test post.

    Extends ``grams`` with the field's n-grams and ``targetClusterIds`` with
    the clusters found for its biterms, then returns
    ``(biterm Counter, number of biterms, sentence vector)``; the vector is
    ``None`` unless ``isSemantic`` is truthy.
    """
    bi_terms = construct_biterms(field_words)
    grams.extend(generateGramsConsucetive(field_words, min_gram, max_gram))

    biterm_freqs = Counter(bi_terms)
    targetClusterIds.extend(
        findTargetClusters(biterm_freqs, biterm__clusterIds))

    field_vec = None
    if isSemantic:
        field_vec = generate_sent_vecs_toktextdata([field_words],
                                                   wordVectorsDic,
                                                   embedDim)[0]

    return biterm_freqs, len(bi_terms), field_vec


def test_cluster_bitermMapping_buffer_framework(
        list_CPost_test, c_CFVector, dic_txtId__CPost,
        dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds,
        dic_bitermBody__clusterIds, dic_ngram__txtIds, min_gram, max_gram,
        max_hitindex, oCSimilarityFlgas, wordVectorsDic):
    """Report, for each test post, the first stored post with the same label.

    Candidates are the training posts sharing an n-gram with the test post
    (``aggregateTextItems_framework``) followed by the posts of the closest
    clusters (``findCloseClustersIds_framework`` /
    ``findTextItems_framework``).  They are scanned in that order; for each
    test post one tab-separated row is printed and written to the file named
    by the module-level ``outfileName``: a hit row for the first candidate
    whose true label matches, otherwise a ``-100`` placeholder row.  The scan
    stops once more than ``max_hitindex`` candidates were examined.

    Args:
        list_CPost_test: test posts exposing ``trueLabel``, ``tagWords``,
            ``titleWords``, ``bodyWords``, ``soPostId`` and ``createtime``.
        c_CFVector, dic_txtId__CPost, dic_biterm*__clusterIds,
        dic_ngram__txtIds: lookup tables produced by training.
        min_gram, max_gram: n-gram size bounds.
        max_hitindex: maximum number of candidates examined per test post.
        oCSimilarityFlgas: object carrying ``isTagSim`` / ``isTitleSim`` /
            ``isBodySim``.
        wordVectorsDic: word -> vector table used in semantic mode.

    Returns:
        None.
    """

    def elapsed_field(started):
        # timedelta.microseconds is only the sub-second component; fold days
        # and seconds in so look-ups longer than one second are not truncated.
        spent = datetime.now() - started
        micros = ((spent.days * 86400 + spent.seconds) * 10**6 +
                  spent.microseconds)
        return str(micros / float(microDivide))

    header = "\t".join([
        "testpostId", "trainPostId", "similarity", "Proposed_hitrank",
        "Proposed_hit_duration_micro", "Proposed_TestTrueLabel", "testText",
        "trainText", "testCreateTime", "TrainCreateTime", "DaysDiff",
        "OriginalRank"
    ])

    # The context manager closes the report even if a lookup raises midway.
    with open(outfileName, 'w') as fileWrite:
        fileWrite.write(header + "\n")
        print(header)

        for oCPost in list_CPost_test:
            t11 = datetime.now()

            testTrue = oCPost.trueLabel
            tagWords = oCPost.tagWords
            titleWords = oCPost.titleWords
            bodyWords = oCPost.bodyWords
            testpostId = oCPost.soPostId
            testCreatetime = oCPost.createtime

            testWords = tagWords  # field shown and compared in the report

            targetClusterIds = []
            grams = []

            # Disabled fields keep the (None, 0, None) placeholders.
            tag_freqs, tag_len, tag_vec = None, 0, None
            if oCSimilarityFlgas.isTagSim:
                tag_freqs, tag_len, tag_vec = _prepare_test_field(
                    tagWords, dic_bitermTag__clusterIds, min_gram, max_gram,
                    wordVectorsDic, grams, targetClusterIds)

            title_freqs, title_len, title_vec = None, 0, None
            if oCSimilarityFlgas.isTitleSim:
                title_freqs, title_len, title_vec = _prepare_test_field(
                    titleWords, dic_bitermTitle__clusterIds, min_gram,
                    max_gram, wordVectorsDic, grams, targetClusterIds)

            body_freqs, body_len, body_vec = None, 0, None
            if oCSimilarityFlgas.isBodySim:
                body_freqs, body_len, body_vec = _prepare_test_field(
                    bodyWords, dic_bitermBody__clusterIds, min_gram, max_gram,
                    wordVectorsDic, grams, targetClusterIds)

            oCPostProcessed = CPostProcessed(tag_freqs, tag_len, title_freqs,
                                             title_len, body_freqs, body_len,
                                             tag_vec, title_vec, body_vec)

            closeClusterIds = findCloseClustersIds_framework(
                oCPostProcessed, set(targetClusterIds), c_CFVector,
                oCSimilarityFlgas)
            train_cluster_CPosts = findTextItems_framework(
                closeClusterIds, c_CFVector, dic_txtId__CPost)

            # Gram-based candidates first (longest grams first), then the
            # posts of the closest clusters.
            sortedGrams = sorted(grams, key=len, reverse=True)
            candidates = aggregateTextItems_framework(
                sortedGrams, dic_ngram__txtIds, dic_txtId__CPost)
            candidates.extend(train_cluster_CPosts)

            found = False
            pathCount = 0
            for trainCPost in candidates:
                trainTrue = int(str(trainCPost.trueLabel))
                train_words = trainCPost.tagWords  # same field as testWords
                trainPostId = trainCPost.soPostId
                trainCreateTime = trainCPost.createtime

                pathCount += 1

                if str(testTrue) == str(trainTrue):
                    ProposedHitRank_val = int(
                        max(1, math.floor(pathCount / len(sortedGrams))))

                    # Duration is taken before the similarity is computed.
                    duration = elapsed_field(t11)
                    text_sim, commonCount = computeTextSimCommonWord_WordDic(
                        Counter(testWords), Counter(train_words),
                        len(testWords), len(train_words))

                    date_diff = (trainCreateTime - testCreatetime).days

                    row = "\t".join([
                        str(testpostId), str(trainPostId), str(text_sim),
                        str(ProposedHitRank_val), duration, str(testTrue),
                        ' '.join(testWords), ' '.join(train_words),
                        str(testCreatetime), str(trainCreateTime),
                        str(date_diff), str(pathCount)
                    ])
                    print(row)
                    fileWrite.write(row + "\n")

                    found = True
                    break

                if pathCount > max_hitindex:
                    break

            if not found:
                # Placeholder row: -100 marks "no matching training post".
                row = "\t".join([
                    str(testpostId), "-100", "0", str(-100),
                    elapsed_field(t11), str(testTrue), ' '.join(testWords),
                    "", "", "", "", ""
                ])
                print(row)
                fileWrite.write(row + "\n")