def writeQwordsDfByDayToMatrix(qwordsDfByDayFile, queriesDict, maxDay, matrixFile):
    """Dump per-query-word document-frequency-by-day counts as a TSV matrix.

    qwordsDfByDayFile -- pickle file holding a dict 'qid_qword' -> {day: df}
    queriesDict       -- {qid: query string}; each query is whitespace-split
    maxDay            -- last day index to emit; columns cover days 0..maxDay
    matrixFile        -- output path; one row per 'qid_qword' key found in the
                         pickle, tab-separated, 0 filled in for missing days

    Fixes vs. original: uses `key in dict` / `dict.get` instead of the
    Python-3-removed `has_key`, `sorted()` instead of list-mutating `.sort()`
    on `dict.keys()` (a list only in Python 2), and a `with` block so the
    output file is closed even if a lookup raises.
    """
    qwordsDfByDay = getPickleData(qwordsDfByDayFile)
    with open(matrixFile, 'w') as handle:
        handle.write('qid_qword, qwordDfByDayBeforeQuery \n')
        # Header row: the day indices 0..maxDay.
        handle.write('day' + '\t')
        for day in range(0, maxDay + 1):
            handle.write(str(day) + '\t')
        handle.write('\n')
        # Rows in sorted qid order so the file layout is deterministic.
        for qid in sorted(queriesDict.keys()):
            for qword in queriesDict[qid].split():
                key = qid + '_' + qword
                if key not in qwordsDfByDay:
                    continue  # qword never indexed: no row (same as original)
                handle.write(key + '\t')
                dfByDays = qwordsDfByDay[key]
                for day in range(0, maxDay + 1):
                    # Days with no recorded df become explicit zeros.
                    handle.write(str(dfByDays.get(day, 0)) + '\t')
                handle.write('\n')
def getMaxDay(qwordsDfByDayFile):
    """Return the largest day index appearing in any qword's {day: df} dict.

    qwordsDfByDayFile -- pickle file holding a dict 'qid_qword' -> {day: df}

    Raises ValueError (from max on an empty set) if no day entries exist.

    Fixes vs. original: removed the unreachable `plt.title(qid)` statement
    that followed the return (stray paste referencing an undefined name), and
    replaced the dict-as-set of days with a real set union.
    """
    qwordsDfByDay = getPickleData(qwordsDfByDayFile)
    # Union of day keys across every qword's per-day dict.
    days = set()
    for dfByDays in qwordsDfByDay.values():
        days.update(dfByDays.keys())
    return max(days)
    


if __name__=='__main__':
    # Script: for each query, plot the relevant-tweet day-offset histogram
    # together with the KDE probability-density curve trained on the top-N
    # pseudo-relevance-feedback results, saving one PNG per query id.
    year = '2012'
    topN = 100
    maxDay = 16   # 2011, 2012: 16; 2013, 2014: 58 (collection time spans)
    daysList = [i for i in range(0, maxDay + 1)]   # x-axis day offsets 0..maxDay
    
    # Input files (Windows absolute paths; pickles built by earlier pipeline steps).
    queryTimeFile = 'E:\\eclipse\\QueryExpansion\\data\\QueryTime\\' + year + '.MBid_query_time.txt'
    tweetsEpochFile = 'E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\tweetsEpoch\\tweetsEpoch_'+ year + '.pkl'
    qrelFile = 'E:\\eclipse\\QueryExpansion\\data\\qrels\\' + 'qrels.microblog' + year + '_new.txt'
    kdePrfTimeFile ='E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\KDE\\' + year + '\\kde_prf' + str(topN) +'_' + year + '.pkl' 
    
    kdeDict = getPickleData(kdePrfTimeFile)                  # qid -> fitted KDE
    queriesEpoch = getQueriesEpoch(queryTimeFile, year)      # qid -> query issue time
    tweetsEpoch = getPickleData(tweetsEpochFile)             # tweet id -> epoch time
    relevantResults = relevantGet(qrelFile)                  # qrels: qid -> relevant docs
    relevantTimeSpan = getResultsTimeSpan(relevantResults, tweetsEpoch, queriesEpoch)
    
    # NOTE(review): np.float is removed in NumPy >= 1.24; np.float64 would be
    # the safe spelling — confirm the pinned NumPy version.
    x1 = np.array(daysList, dtype=np.float)
    for qid in kdeDict.keys():
        probDens = prediction(kdeDict[qid], x1)   # KDE density at each day offset
        y1 = probDens
        drawHistLine(relevantTimeSpan[qid], x1, y1, maxDay, qid)
        # NOTE(review): single backslashes below are literal only because
        # \e, \T, \d are not escape sequences — fragile; prefer \\ or r''.
        figPath = 'E:\eclipse\TemporalRetrieval\data\img\\' + qid + '.png'
        plt.savefig(figPath)
        plt.close()
        print  'draw for ' + qid 
        
Пример #4
0
        print qid + ', best bandwidth: ' +  str(bandwidth)        
    return (bandwidthDict, kdeDict)


  


if __name__=='__main__' :
    # Script: for each feedback depth topN in {50, 100, ..., 500}, fit per-query
    # KDE estimators over the timestamps of the top-N BM25 results
    # (pseudo-relevance feedback) and pickle both the chosen bandwidths and
    # the fitted estimators.
    year = '2012'
    topNList = [i for i in range(50, 501, 50)]
    
    for topN in topNList:
        # Inputs: query issue times, tweet timestamps, baseline BM25 run.
        queryTimeFile = 'E:\\eclipse\\QueryExpansion\\data\\QueryTime\\' + year + '.MBid_query_time.txt'
        tweetsEpochFile = 'E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\tweetsEpoch\\tweetsEpoch_'+ year + '.pkl'
        resultFile = 'E:\\eclipse\\QueryExpansion\\data\\BM25\\BM25_' + year + '.txt'
        # Outputs: one bandwidth pickle and one KDE pickle per topN value.
        bandwidthPrfTimeFile = 'E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\KDE\\' + year + '\\band_prf' + str(topN) +'_' + year + '.pkl' 
        kdePrfTimeFile ='E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\KDE\\' + year + '\\kde_prf' + str(topN) +'_' + year + '.pkl' 
        
        # Train the KDE estimator on the top-N result timestamps.
        topNResults = getTopNResults(resultFile, topN)
        queriesEpoch = getQueriesEpoch(queryTimeFile, year)
        tweetsEpoch = getPickleData(tweetsEpochFile)
        (bandwidthDict, kdeDict) = prfTimeKDE(topNResults, queriesEpoch, tweetsEpoch)
        writePickleData(bandwidthDict, bandwidthPrfTimeFile)
        writePickleData(kdeDict, kdePrfTimeFile)
    



Пример #5
0
        if term != '':
            termPos = getTermPos(term, classicPosDict, jiebaPosDict)
            segPosList.append([term, termPos])
    return segPosList
        
        


            



if __name__=='__main__':
    # Demo/driver: segment a classical-Chinese sentence with jieba, then
    # POS-tag the segments against the merged classic dictionary and the
    # jieba-augmented dictionary, writing (term, pos) pairs to a test file.
    termPosPickleFile = '../data/result/mergedPosDictionary.pkl'   # classic pos dictionary
    addedJiebaDictFile = '../data/result/addedJiebaDict.pkl'
    classicPosDict = getPickleData(termPosPickleFile) 
    jiebaPosDict = getPickleData(addedJiebaDictFile) 
#     termPosFile = '../data/result/mergedPosDictionary.txt'
#     termPosDict = getTermPosDict(termPosFile)
  
    
    # s1: classical-Chinese test sentence; s2: modern-Chinese sentence (unused here).
    s1 = ('一牒  。十二月庚子朔丙寅,偏將軍')
    s2 = "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿"
    seg_list = jieba.cut(s1)
    print("/ ".join(seg_list))
    
    segPosList = getSegPos(s1, classicPosDict, jiebaPosDict)
    filePath = '../data/result/test.txt'
    handle = open(filePath, 'w')
    for segPos in segPosList:
        term = segPos[0]
        # NOTE(review): the loop body is truncated at this chunk boundary —
        # the remaining write logic (and handle.close()) is not visible here.
Пример #6
0
            
if __name__=='__main__':
    # Script: fit per-query-word KDE estimators over the timestamps of the
    # top-N BM25 results containing each query word, then pickle the chosen
    # bandwidths and the fitted estimators.
    year = '2011'
    topN = 100
    
    # Inputs: stopword list, inverted index, query times, tweet times, BM25 run.
    stopFilePath = 'E:\\eclipse\\QueryExpansion\\data\\english.stop'
    indexedFile = 'E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\index\\' + 'tweet_index_' + year + '.pkl'
    queryTimeFile = 'E:\\eclipse\\QueryExpansion\\data\\QueryTime\\' + year + '.MBid_query_time.txt'
    tweetsEpochFile = 'E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\tweetsEpoch\\tweetsEpoch_'+ year + '.pkl'
    resultFile = 'E:\\eclipse\\QueryExpansion\\data\\BM25\\BM25_' + year + '.txt'
    # Outputs (relative paths, unlike the absolute inputs above).
    bandwidthQwordTimeFile = '../data/pickle_data/KDE/' + year + '/qword_time/band_qword' + str(topN) + '_' + year + '.pkl'
    kdeQwordTimeFile = '../data/pickle_data/KDE/' + year + '/qword_time/kde_qword' + str(topN) + '_' + year + '.pkl'
    
    topNResults = getTopNResults(resultFile, topN)
    queriesEpoch = getQueriesEpoch(queryTimeFile, year)
    tweetsEpoch = getPickleData(tweetsEpochFile)
    stopWords = stopWordsGet(stopFilePath)
    queriesDict = getProcessedQueries(queryTimeFile, stopWords)   # qid -> cleaned query
    wordsIndex = getPickleData(indexedFile)                       # inverted index pickle
    (bandwidthDict, kdeDict) = qwordTimeKDE(topNResults, queriesEpoch, tweetsEpoch, queriesDict, wordsIndex)
    writePickleData(bandwidthDict, bandwidthQwordTimeFile)
    writePickleData(kdeDict, kdeQwordTimeFile)
    
    

       
            
    
    
    
        
Пример #7
0
def normalizePosCount(posDict):
    """Turn each term's raw POS counts into relative frequencies, in place.

    posDict maps term -> {pos_tag: count}.  After the call every inner value
    equals count / total_count_for_that_term.  The same (mutated) dict is
    returned for convenience.
    """
    for posCounts in posDict.values():
        total = sum(posCounts.values())
        for tag in posCounts.keys():
            # 1.0 * forces float division under Python 2 semantics as well.
            posCounts[tag] = 1.0 * posCounts[tag] / total
    return posDict


if __name__=='__main__':
    # Script: build POS tag-sets from two monosyllabic-word dictionaries,
    # then load (the previously pickled) union of POS tags and print it.
    #----------- Qin bamboo-slip corpus online (monosyllabic-word POS source 1) -----------
    qinJianDanYinFilePath = '../data/自动标注材料/词典/秦简库上网(单音词词性标注1)/单音词词性.txt'
    qinJianDanYinDict = getQinJianDanYinTerms(qinJianDanYinFilePath)
    qinJianPosDict = getPosSet(qinJianDanYinDict)
    
    #----------- Dictionary of common classical-Chinese characters (monosyllabic source 2) -----------
    hanDanYinFilePath = '../data/自动标注材料/词典/古汉语常用字字典(单音词词性标注2)/单字词性表.txt'
    hanDanYinDict = getHanDanYinTerms(hanDanYinFilePath) 
    hanDanYinDictU = unifyPosNames(hanDanYinDict, posNamesMap)   # map to unified tag names
    hanDanYinPosDict = getPosSet(hanDanYinDictU) 
    
    posDictList = [qinJianPosDict, hanDanYinPosDict]
#     posUnion = getPosUnion(posDictList)
    posUnionFile = '../data/result/posUnionDict.pkl'
#     writePickleData(posUnion, posUnionFile) 
    # Union was computed and pickled in a previous run; now just reload it.
    posUnion = getPickleData(posUnionFile)
    for pos in posUnion.keys():
        print pos
    
    
Пример #8
0
    # NOTE(review): this chunk starts mid-block — the enclosing
    # `if __name__ == '__main__':` header lies outside the visible range.
    # Script body: rank tweets with the time-aware BM25T scorer (per-query-word
    # KDEs as the temporal prior) and write the top-N results to a run file.
    year = '2011'
    topN = 1000      # results kept per query in the output run
    kdeN = 100       # feedback depth the query-word KDEs were trained with
    tag = 'myBM25T'  # run tag written into the result file
    k1 = 0.3         # BM25 term-frequency saturation parameter
    b = 0.05         # BM25 length-normalization parameter
    
    # Inputs: stopword list, inverted index, query/tweet times, processed docs.
    stopFilePath = 'E:\\eclipse\\QueryExpansion\\data\\english.stop'
    indexedFile = 'E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\index\\' + 'tweet_index_' + year + '.pkl'
    queryTimeFile = 'E:\\eclipse\\QueryExpansion\\data\\QueryTime\\' + year + '.MBid_query_time.txt'
    tweetsEpochFile = 'E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\tweetsEpoch\\tweetsEpoch_'+ year + '.pkl'
    docsFilePath = 'E:\\eclipse\\QueryExpansion\\dataset\\processed\\tweet' + year + '_processed.txt'
    resultFilePath = '../data/rank_BM25/' + year + '/' + tag +'_k'+ str(k1) + '_b' + str(b)  + '.txt'
    kdeQwordTimeFile = '../data/pickle_data/KDE/' + year + '/qword_time/kde_qword' + str(kdeN) + '_' + year + '.pkl'   
      
    stopWords = stopWordsGet(stopFilePath)
    queriesDict = getProcessedQueries(queryTimeFile, stopWords)
    wordsIndex = getPickleData(indexedFile)
    docsLength = getDocsLength(docsFilePath)
    queriesEpoch = getQueriesEpoch(queryTimeFile, year)
    tweetsEpoch = getPickleData(tweetsEpochFile)
    kdeQwordDict = getPickleData(kdeQwordTimeFile)   # qword -> fitted KDE
    scores = scoreBM25T(queriesEpoch, tweetsEpoch, kdeQwordDict, queriesDict, wordsIndex, docsLength, k1, b)
    topNResults = getTopNResults(scores, topN)
    writeTopNResults(topNResults, resultFilePath, tag)
    
    



    
    


if __name__=='__main__':
    # Script: for each query, plot the relevant-tweet time-span histogram
    # against BOTH density curves — the PRF-time KDE and the query-word-time
    # KDE — saving one comparison PNG per query.
    year = '2011'
    topN = 100
    maxDay = 16   # 2011, 2012: 16; 2013, 2014: 58 (collection time spans)
    
    queryTimeFile = 'E:\\eclipse\\QueryExpansion\\data\\QueryTime\\' + year + '.MBid_query_time.txt'
    tweetsEpochFile = 'E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\tweetsEpoch\\tweetsEpoch_'+ year + '.pkl'
    qrelFile = 'E:\\eclipse\\QueryExpansion\\data\\qrels\\' + 'qrels.microblog' + year + '_new.txt'
    kdePrfTimeFile ='E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\KDE\\' + year + '\\prf_time\\kde_prf' + str(topN) +'_' + year + '.pkl' 
    kdeQwordTimeFile = '../data/pickle_data/KDE/' + year + '/qword_time/kde_qword' + str(topN) + '_' + year + '.pkl'
    
    queriesEpoch = getQueriesEpoch(queryTimeFile, year)
    tweetsEpoch = getPickleData(tweetsEpochFile)
    relevantResults = relevantGet(qrelFile)
    relevantTimeSpans = getResultsTimeSpan(relevantResults, tweetsEpoch, queriesEpoch)
    kdePrfDict = getPickleData(kdePrfTimeFile)
    prfProbDens = predictDaysProbDens(kdePrfDict, maxDay)      # PRF-KDE density per day
    kdeQwordDict = getPickleData(kdeQwordTimeFile)
    qwordsProbDens = predictDaysProbDens(kdeQwordDict, maxDay) # qword-KDE density per day
    keyList = kdePrfDict.keys()
#     keyList = ['MB1']
    for qid in keyList:
        drawHistLine(relevantTimeSpans, prfProbDens, qwordsProbDens, maxDay, qid)
        # NOTE(review): single backslashes below are literal only because
        # \e, \T, \d are not escape sequences — fragile; prefer \\ or r''.
        figPath = 'E:\eclipse\TemporalRetrieval\data\img\\rel_prf_qword\\' + qid + '.png'
        plt.savefig(figPath)
        plt.close()
        print  'draw for ' + qid 
        
Пример #10
0
                    scores[qid].setdefault(docId, 0)
                    scores[qid][docId] += s
    return scores
                
            

if __name__=='__main__':
    # Script: rank tweets with the plain (non-temporal) BM25 scorer and write
    # the top-N results to a run file.
    year = '2012'
    topN = 1000    # results kept per query in the output run
    tag = 'myBM25' # run tag written into the result file
    k1 = 0.3       # BM25 term-frequency saturation parameter
    b = 0.02       # BM25 length-normalization parameter
    
    # Inputs: stopword list, inverted index, query times, processed documents.
    stopFilePath = 'E:\\eclipse\\QueryExpansion\\data\\english.stop'
    indexedFile = 'E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\index\\' + 'tweet_index_' + year + '.pkl'
    queryTimeFile = 'E:\\eclipse\\QueryExpansion\\data\\QueryTime\\' + year + '.MBid_query_time.txt'
    docsFilePath = 'E:\\eclipse\\QueryExpansion\\dataset\\processed\\tweet' + year + '_processed.txt'
    resultFilePath = '../data/rank_BM25/' + year + '/myBM25_k'+ str(k1) + '_b' + str(b)  + '.txt'   
      
    stopWords = stopWordsGet(stopFilePath)
    queriesDict = getProcessedQueries(queryTimeFile, stopWords)
    wordsIndex = getPickleData(indexedFile)
    docsLength = getDocsLength(docsFilePath)
    scores = scoreBM25(queriesDict, wordsIndex, docsLength, k1, b)
    topNResults = getTopNResults(scores, topN)
    writeTopNResults(topNResults, resultFilePath, tag)
    
    
    
    
    
    # NOTE(review): this indented run continues the preceding __main__ suite
    # but is clearly an unrelated fragment fused in by the scrape — it merges
    # several Chinese POS dictionaries into one combined dictionary.
    #----------------------------- Han-dynasty classified lexicon -----------------------------
    hanClassifiedFileDir = '../data/自动标注材料/词典/汉代分类词库/'
    # Category files and their POS labels: place names, era names, person
    # names, function words, official titles, sexagenary-cycle terms.
    hanClassifiedFileNames = ['表_地名.txt', '表_年号.txt', '表_人名.txt', '表_虚词.txt', '表_职官.txt', '干支.txt']
    labelsList = ['地名', '年号', '人名', '虚词', '职官', '干支']
    hanClassifiedDictList = list()
    for i in range(0, len(hanClassifiedFileNames)):
        filePath = hanClassifiedFileDir + hanClassifiedFileNames[i]
        termsDict = getClassifiedTerms(filePath, labelsList[i])
        hanClassifiedDictList.append(termsDict)
    #----------------------------- Hanyu Da Cidian (polysyllabic-word dictionary) -----------------------------
#     hanFuYinFilePath = '../data/自动标注材料/词典/汉语大词典(复音词词典)/汉语大词典(第一版)词头.txt'
#     label = '复音词'
#     hanFuYinDict =  getHanFuYinTerms(hanFuYinFilePath, label) 
    
    # Use the model-predicted POS dict for polysyllabic words instead.
    hanFuYinDictPredictFile = '../data/result/hanFuYin_PosDict_predict.pkl'
    hanFuYinDict = getPickleData(hanFuYinDictPredictFile)
    
    #----------------------------- Qin bamboo-slip corpus online (monosyllabic source 1) -----------------------------
    qinJianDanYinFilePath = '../data/自动标注材料/词典/秦简库上网(单音词词性标注1)/单音词词性.txt'
    qinJianDanYinDict = getQinJianDanYinTerms(qinJianDanYinFilePath)
    
    #----------------------------- Dictionary of common classical-Chinese characters (monosyllabic source 2) -----------------------------
    hanDanYinFilePath = '../data/自动标注材料/词典/古汉语常用字字典(单音词词性标注2)/单字词性表.txt'
    hanDanYinDict = getHanDanYinTerms(hanDanYinFilePath) 
    hanDanYinDictU = unifyPosNames(hanDanYinDict, posNamesMap)   # map to unified tag names
     
    # Merge the two monosyllabic sources, then fold in the classified lists
    # and the polysyllabic dictionary into one combined POS dictionary.
    mergedDanYinPosDict = merge_qinJianDanYin_hanDanYin(qinJianDanYinDict, hanDanYinDictU)
    mergedPosDict = mergeAllPosDictionary(hanClassifiedDictList, hanFuYinDict, mergedDanYinPosDict)
#     termPosFile = '../data/result/mergedPosDictionary.txt'
#     writeTermPosDict(mergedPosDict, termPosFile)