def writeQwordsDfByDayToMatrix(qwordsDfByDayFile, queriesDict, maxDay, matrixFile):
    """Write a tab-separated matrix of per-query-word document frequencies by day.

    Row 1 is a header, row 2 lists the day indices 0..maxDay, and every
    following row is '<qid>_<qword>' followed by that word's document
    frequency on each day (0 when the day is absent from the pickle).

    qwordsDfByDayFile -- pickle path mapping '<qid>_<qword>' -> {day: df}
                         (loaded via the module-level getPickleData helper)
    queriesDict       -- mapping qid -> query string
    maxDay            -- last day index to emit (inclusive)
    matrixFile        -- output path, overwritten
    """
    # Load first so a bad pickle no longer truncates an existing output file.
    qwordsDfByDay = getPickleData(qwordsDfByDayFile)
    # 'with' guarantees the handle is closed even if a write fails
    # (the original leaked it on any exception before handle.close()).
    with open(matrixFile, 'w') as handle:
        handle.write('qid_qword, qwordDfByDayBeforeQuery \n')
        handle.write('day' + '\t')
        for day in range(0, maxDay + 1):
            handle.write(str(day) + '\t')
        handle.write('\n')
        # Deterministic row order: sorted query ids (replaces keys();sort()).
        for qid in sorted(queriesDict.keys()):
            for qword in queriesDict[qid].split():
                key = qid + '_' + qword
                if key in qwordsDfByDay:  # 'in' replaces deprecated has_key()
                    handle.write(key + '\t')
                    dfByDays = qwordsDfByDay[key]
                    for day in range(0, maxDay + 1):
                        # get() collapses the present/absent branches;
                        # str(0) == '0' matches the original output exactly.
                        handle.write(str(dfByDays.get(day, 0)) + '\t')
                    handle.write('\n')
def getMaxDay(qwordsDfByDayFile):
    """Return the largest day index present in the pickled df-by-day data.

    qwordsDfByDayFile -- pickle path mapping '<qid>_<qword>' -> {day: df}

    Raises ValueError when the pickle contains no day at all, exactly like
    the original max() over an empty list.
    """
    qwordsDfByDay = getPickleData(qwordsDfByDayFile)
    # A generator over all inner day keys replaces the original intermediate
    # dedup dict: max() does not need the values to be unique.
    return max(day
               for qwordDfByDay in qwordsDfByDay.values()
               for day in qwordDfByDay.keys())
# Tail of the plotting helper whose definition precedes this chunk.
plt.title(qid)  # set the figure title to the query id

if __name__ == '__main__':
    # For each query, plot the relevant-tweet time histogram against the
    # PRF-time KDE density curve and save one PNG per query.
    year = '2012'
    topN = 100
    maxDay = 16  # 2011,2012: 16 ; 2013, 2014: 58
    daysList = [i for i in range(0, maxDay + 1)]

    # Input locations (Windows absolute paths, backslashes escaped).
    queryTimeFile = 'E:\\eclipse\\QueryExpansion\\data\\QueryTime\\' + year + '.MBid_query_time.txt'
    tweetsEpochFile = 'E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\tweetsEpoch\\tweetsEpoch_'+ year + '.pkl'
    qrelFile = 'E:\\eclipse\\QueryExpansion\\data\\qrels\\' + 'qrels.microblog' + year + '_new.txt'
    kdePrfTimeFile = 'E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\KDE\\' + year + '\\kde_prf' + str(topN) + '_' + year + '.pkl'

    kdeDict = getPickleData(kdePrfTimeFile)
    queriesEpoch = getQueriesEpoch(queryTimeFile, year)
    tweetsEpoch = getPickleData(tweetsEpochFile)
    relevantResults = relevantGet(qrelFile)
    relevantTimeSpan = getResultsTimeSpan(relevantResults, tweetsEpoch, queriesEpoch)

    # np.float was merely a deprecated alias of the builtin; use float.
    x1 = np.array(daysList, dtype=float)
    for qid in kdeDict.keys():
        probDens = prediction(kdeDict[qid], x1)
        y1 = probDens
        drawHistLine(relevantTimeSpan[qid], x1, y1, maxDay, qid)
        # FIX: escape every backslash.  The original single backslashes only
        # produced the right path because \e, \T, \d, \i happen not to be
        # recognized escape sequences; the value is unchanged.
        figPath = 'E:\\eclipse\\TemporalRetrieval\\data\\img\\' + qid + '.png'
        plt.savefig(figPath)
        plt.close()
        print('draw for ' + qid)  # parenthesized: same output in Python 2
# Tail of prfTimeKDE(), whose definition precedes this chunk: report the
# bandwidth selected for each query, then hand back both per-query dicts.
print qid + ', best bandwidth: ' + str(bandwidth)
return (bandwidthDict, kdeDict)

if __name__=='__main__':
    year = '2012'
    # Sweep pseudo-relevance-feedback depths 50, 100, ..., 500.
    topNList = [i for i in range(50, 501, 50)]
    for topN in topNList:
        # Input/output locations (Windows absolute paths), one pair of
        # output pickles per feedback depth.
        queryTimeFile = 'E:\\eclipse\\QueryExpansion\\data\\QueryTime\\' + year + '.MBid_query_time.txt'
        tweetsEpochFile = 'E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\tweetsEpoch\\tweetsEpoch_'+ year + '.pkl'
        resultFile = 'E:\\eclipse\\QueryExpansion\\data\\BM25\\BM25_' + year + '.txt'
        bandwidthPrfTimeFile = 'E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\KDE\\' + year + '\\band_prf' + str(topN) +'_' + year + '.pkl'
        kdePrfTimeFile ='E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\KDE\\' + year + '\\kde_prf' + str(topN) +'_' + year + '.pkl'
        # train the kde estimator
        topNResults = getTopNResults(resultFile, topN)
        queriesEpoch = getQueriesEpoch(queryTimeFile, year)
        tweetsEpoch = getPickleData(tweetsEpochFile)
        (bandwidthDict, kdeDict) = prfTimeKDE(topNResults, queriesEpoch, tweetsEpoch)
        # Persist both the chosen bandwidths and the fitted KDE models.
        writePickleData(bandwidthDict, bandwidthPrfTimeFile)
        writePickleData(kdeDict, kdePrfTimeFile)
# Tail of getSegPos(), whose definition precedes this chunk: look up the
# POS of each non-empty segmented term and collect [term, pos] pairs.
if term != '':
    termPos = getTermPos(term, classicPosDict, jiebaPosDict)
    segPosList.append([term, termPos])
return segPosList

if __name__=='__main__':
    termPosPickleFile = '../data/result/mergedPosDictionary.pkl' # classic pos dictionary
    addedJiebaDictFile = '../data/result/addedJiebaDict.pkl'
    classicPosDict = getPickleData(termPosPickleFile)
    jiebaPosDict = getPickleData(addedJiebaDictFile)
    # termPosFile = '../data/result/mergedPosDictionary.txt'
    # termPosDict = getTermPosDict(termPosFile)
    s1 = ('一牒 。十二月庚子朔丙寅,偏將軍')  # classical-Chinese sample sentence
    s2 = "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿"  # modern-Chinese sample (not used below)
    # Show jieba's raw segmentation, then the POS-tagged segmentation.
    seg_list = jieba.cut(s1)
    print("/ ".join(seg_list))
    segPosList = getSegPos(s1, classicPosDict, jiebaPosDict)
    filePath = '../data/result/test.txt'
    handle = open(filePath, 'w')
    # Write one tagged term per line; the loop body continues past this chunk.
    for segPos in segPosList:
        term = segPos[0]
if __name__ == '__main__':
    # Fit one temporal KDE per query word over the top-N BM25 results,
    # then pickle both the selected bandwidths and the fitted estimators.
    year = '2011'
    topN = 100

    # --- input locations (Windows absolute paths) ---
    stopwordsPath = 'E:\\eclipse\\QueryExpansion\\data\\english.stop'
    indexPicklePath = 'E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\index\\' + 'tweet_index_' + year + '.pkl'
    queryTimePath = 'E:\\eclipse\\QueryExpansion\\data\\QueryTime\\' + year + '.MBid_query_time.txt'
    tweetsEpochPath = 'E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\tweetsEpoch\\tweetsEpoch_'+ year + '.pkl'
    bm25ResultPath = 'E:\\eclipse\\QueryExpansion\\data\\BM25\\BM25_' + year + '.txt'

    # --- output locations (relative, one pair per year/depth) ---
    bandwidthOutPath = '../data/pickle_data/KDE/' + year + '/qword_time/band_qword' + str(topN) + '_' + year + '.pkl'
    kdeOutPath = '../data/pickle_data/KDE/' + year + '/qword_time/kde_qword' + str(topN) + '_' + year + '.pkl'

    # Load inputs in the same order as before (each helper does its own I/O).
    bm25TopResults = getTopNResults(bm25ResultPath, topN)
    queryEpochs = getQueriesEpoch(queryTimePath, year)
    tweetEpochs = getPickleData(tweetsEpochPath)
    stopwordSet = stopWordsGet(stopwordsPath)
    cleanQueries = getProcessedQueries(queryTimePath, stopwordSet)
    invertedIndex = getPickleData(indexPicklePath)

    qwordBandwidths, qwordKdes = qwordTimeKDE(bm25TopResults, queryEpochs, tweetEpochs, cleanQueries, invertedIndex)
    writePickleData(qwordBandwidths, bandwidthOutPath)
    writePickleData(qwordKdes, kdeOutPath)
def normalizePosCount(posDict):
    """Normalize every term's POS counts to relative frequencies, in place.

    posDict -- mapping term -> {pos: count}; each count is replaced by
               count / total-count-for-that-term (true division via 1.0*).

    Terms whose counts sum to zero (e.g. an empty inner dict) are left
    untouched instead of raising ZeroDivisionError as the original did.

    Returns the same (mutated) dict for call-chaining convenience.
    """
    for term in posDict.keys():
        totalCount = sum(posDict[term].values())
        if totalCount == 0:
            continue  # nothing to normalize; avoid division by zero
        for pos in posDict[term].keys():
            posDict[term][pos] = 1.0 * posDict[term][pos] / totalCount
    return posDict

if __name__ == '__main__':
    # ----- Qin bamboo-slip database (monosyllabic POS tagging, source 1) -----
    qinJianDanYinFilePath = '../data/自动标注材料/词典/秦简库上网(单音词词性标注1)/单音词词性.txt'
    qinJianDanYinDict = getQinJianDanYinTerms(qinJianDanYinFilePath)
    qinJianPosDict = getPosSet(qinJianDanYinDict)
    # ----- Dictionary of Common Classical Chinese Characters (monosyllabic, source 2) -----
    hanDanYinFilePath = '../data/自动标注材料/词典/古汉语常用字字典(单音词词性标注2)/单字词性表.txt'
    hanDanYinDict = getHanDanYinTerms(hanDanYinFilePath)
    hanDanYinDictU = unifyPosNames(hanDanYinDict, posNamesMap)
    hanDanYinPosDict = getPosSet(hanDanYinDictU)
    posDictList = [qinJianPosDict, hanDanYinPosDict]
    # posUnion = getPosUnion(posDictList)
    posUnionFile = '../data/result/posUnionDict.pkl'
    # writePickleData(posUnion, posUnionFile)
    # The POS union is loaded from a previously computed pickle instead of
    # being rebuilt from posDictList (see the commented-out lines above).
    posUnion = getPickleData(posUnionFile)
    for pos in posUnion.keys():
        print(pos)  # parenthesized print: identical output in Python 2 and 3
# Score the collection with the temporal BM25T variant and write the top-N
# ranking to disk.  NOTE(review): no __main__ guard is visible in this chunk,
# so these statements appear to run at import time — confirm against the
# full file before wrapping them.
year = '2011'
topN = 1000  # depth of the ranking written out
kdeN = 100   # PRF depth used when the per-query-word KDEs were trained
tag = 'myBM25T'
k1 = 0.3     # BM25 term-frequency saturation parameter
b = 0.05     # BM25 length-normalization parameter
# Input locations (Windows absolute paths) and relative output/KDE paths.
stopFilePath = 'E:\\eclipse\\QueryExpansion\\data\\english.stop'
indexedFile = 'E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\index\\' + 'tweet_index_' + year + '.pkl'
queryTimeFile = 'E:\\eclipse\\QueryExpansion\\data\\QueryTime\\' + year + '.MBid_query_time.txt'
tweetsEpochFile = 'E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\tweetsEpoch\\tweetsEpoch_'+ year + '.pkl'
docsFilePath = 'E:\\eclipse\\QueryExpansion\\dataset\\processed\\tweet' + year + '_processed.txt'
resultFilePath = '../data/rank_BM25/' + year + '/' + tag +'_k'+ str(k1) + '_b' + str(b) + '.txt'
kdeQwordTimeFile = '../data/pickle_data/KDE/' + year + '/qword_time/kde_qword' + str(kdeN) + '_' + year + '.pkl'
# Load queries, inverted index, document lengths and the temporal data.
stopWords = stopWordsGet(stopFilePath)
queriesDict = getProcessedQueries(queryTimeFile, stopWords)
wordsIndex = getPickleData(indexedFile)
docsLength = getDocsLength(docsFilePath)
queriesEpoch = getQueriesEpoch(queryTimeFile, year)
tweetsEpoch = getPickleData(tweetsEpochFile)
kdeQwordDict = getPickleData(kdeQwordTimeFile)
# Rank with the time-aware BM25T scorer, then persist the top-N results.
scores = scoreBM25T(queriesEpoch, tweetsEpoch, kdeQwordDict, queriesDict, wordsIndex, docsLength, k1, b)
topNResults = getTopNResults(scores, topN)
writeTopNResults(topNResults, resultFilePath, tag)
if __name__ == '__main__':
    # For every query, plot the relevant-tweet time histogram together with
    # the PRF-time and query-word-time KDE density curves, one PNG each.
    year = '2011'
    topN = 100
    maxDay = 16  # 2011,2012: 16 ; 2013, 2014: 58

    # Input locations (Windows absolute paths + relative qword-KDE pickle).
    queryTimeFile = 'E:\\eclipse\\QueryExpansion\\data\\QueryTime\\' + year + '.MBid_query_time.txt'
    tweetsEpochFile = 'E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\tweetsEpoch\\tweetsEpoch_'+ year + '.pkl'
    qrelFile = 'E:\\eclipse\\QueryExpansion\\data\\qrels\\' + 'qrels.microblog' + year + '_new.txt'
    kdePrfTimeFile = 'E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\KDE\\' + year + '\\prf_time\\kde_prf' + str(topN) + '_' + year + '.pkl'
    kdeQwordTimeFile = '../data/pickle_data/KDE/' + year + '/qword_time/kde_qword' + str(topN) + '_' + year + '.pkl'

    queriesEpoch = getQueriesEpoch(queryTimeFile, year)
    tweetsEpoch = getPickleData(tweetsEpochFile)
    relevantResults = relevantGet(qrelFile)
    relevantTimeSpans = getResultsTimeSpan(relevantResults, tweetsEpoch, queriesEpoch)
    kdePrfDict = getPickleData(kdePrfTimeFile)
    prfProbDens = predictDaysProbDens(kdePrfDict, maxDay)
    kdeQwordDict = getPickleData(kdeQwordTimeFile)
    qwordsProbDens = predictDaysProbDens(kdeQwordDict, maxDay)

    keyList = kdePrfDict.keys()
    # keyList = ['MB1']  # uncomment to render a single query
    for qid in keyList:
        drawHistLine(relevantTimeSpans, prfProbDens, qwordsProbDens, maxDay, qid)
        # FIX: escape every backslash.  The original single backslashes only
        # produced the right path because \e, \T, \d, \i happen not to be
        # recognized escape sequences; the value is unchanged.
        figPath = 'E:\\eclipse\\TemporalRetrieval\\data\\img\\rel_prf_qword\\' + qid + '.png'
        plt.savefig(figPath)
        plt.close()
        print('draw for ' + qid)  # parenthesized: same output in Python 2
scores[qid].setdefault(docId, 0) scores[qid][docId] += s return scores if __name__=='__main__': year = '2012' topN = 1000 tag = 'myBM25' k1 = 0.3 b = 0.02 stopFilePath = 'E:\\eclipse\\QueryExpansion\\data\\english.stop' indexedFile = 'E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\index\\' + 'tweet_index_' + year + '.pkl' queryTimeFile = 'E:\\eclipse\\QueryExpansion\\data\\QueryTime\\' + year + '.MBid_query_time.txt' docsFilePath = 'E:\\eclipse\\QueryExpansion\\dataset\\processed\\tweet' + year + '_processed.txt' resultFilePath = '../data/rank_BM25/' + year + '/myBM25_k'+ str(k1) + '_b' + str(b) + '.txt' stopWords = stopWordsGet(stopFilePath) queriesDict = getProcessedQueries(queryTimeFile, stopWords) wordsIndex = getPickleData(indexedFile) docsLength = getDocsLength(docsFilePath) scores = scoreBM25(queriesDict, wordsIndex, docsLength, k1, b) topNResults = getTopNResults(scores, topN) writeTopNResults(topNResults, resultFilePath, tag)
#-----------------------------------------汉代分类词库------------------------------------- hanClassifiedFileDir = '../data/自动标注材料/词典/汉代分类词库/' hanClassifiedFileNames = ['表_地名.txt', '表_年号.txt', '表_人名.txt', '表_虚词.txt', '表_职官.txt', '干支.txt'] labelsList = ['地名', '年号', '人名', '虚词', '职官', '干支'] hanClassifiedDictList = list() for i in range(0, len(hanClassifiedFileNames)): filePath = hanClassifiedFileDir + hanClassifiedFileNames[i] termsDict = getClassifiedTerms(filePath, labelsList[i]) hanClassifiedDictList.append(termsDict) #-----------------------------------------汉语大词典(复音词词典)------------------------------------- # hanFuYinFilePath = '../data/自动标注材料/词典/汉语大词典(复音词词典)/汉语大词典(第一版)词头.txt' # label = '复音词' # hanFuYinDict = getHanFuYinTerms(hanFuYinFilePath, label) hanFuYinDictPredictFile = '../data/result/hanFuYin_PosDict_predict.pkl' hanFuYinDict = getPickleData(hanFuYinDictPredictFile) #-----------------------------------------秦简库上网(单音1)------------------------------------- qinJianDanYinFilePath = '../data/自动标注材料/词典/秦简库上网(单音词词性标注1)/单音词词性.txt' qinJianDanYinDict = getQinJianDanYinTerms(qinJianDanYinFilePath) #-----------------------------------------古汉语常用字字典(单音2)------------------------------------- hanDanYinFilePath = '../data/自动标注材料/词典/古汉语常用字字典(单音词词性标注2)/单字词性表.txt' hanDanYinDict = getHanDanYinTerms(hanDanYinFilePath) hanDanYinDictU = unifyPosNames(hanDanYinDict, posNamesMap) mergedDanYinPosDict = merge_qinJianDanYin_hanDanYin(qinJianDanYinDict, hanDanYinDictU) mergedPosDict = mergeAllPosDictionary(hanClassifiedDictList, hanFuYinDict, mergedDanYinPosDict) # termPosFile = '../data/result/mergedPosDictionary.txt' # writeTermPosDict(mergedPosDict, termPosFile)