        # NOTE(review): tail of `invertedIndexing` (called from __main__ below);
        # the `def` line and the loop over documents binding `entry`, `docId`
        # and `i` are above this view -- indentation reconstructed, TODO confirm.
        text = entry[1]  # assumes entry[1] holds the document text -- verify upstream
        words = text.split()
        # Inverted index layout: word -> {docId: term frequency in that document}.
        for word in words:
            wordsIndex.setdefault(word, {})
            wordsIndex[word].setdefault(docId, 0)
            wordsIndex[word][docId] += 1
        i += 1
        # Progress marker every 10,000 documents (Python 2 print statement).
        if i % 10000 == 0:
            print i
    print 'total documents indexed: ' + str(i)
    wordsCount = len(wordsIndex.keys())
    print 'total words: ' + str(wordsCount)
    return wordsIndex


if __name__=='__main__':
    year = '2012'
    # filePath = 'E:\\eclipse\\QueryExpansion\\dataset\\processed\\test.txt'
    filePath = 'E:\\eclipse\\QueryExpansion\\dataset\\processed\\' + 'tweet' + year + '_processed.txt'
    indexedFile = 'E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\index\\' + 'tweet_index_' + year + '.pkl'
    # Build the inverted index over the processed tweet file, then pickle it.
    wordsIndex = invertedIndexing(filePath)
    writePickleData(wordsIndex, indexedFile)
        # NOTE(review): tail of `prfTimeKDE` (called from __main__ below); the
        # `def` line and the per-query loop binding `qid` and `bandwidth` are
        # above this view -- indentation reconstructed, TODO confirm.
        print qid + ', best bandwidth: ' + str(bandwidth)
    # One selected bandwidth and one fitted KDE per query id.
    return (bandwidthDict, kdeDict)


if __name__=='__main__' :
    year = '2012'
    # Sweep the pseudo-relevance-feedback depth: 50, 100, ..., 500.
    topNList = [i for i in range(50, 501, 50)]
    for topN in topNList:
        queryTimeFile = 'E:\\eclipse\\QueryExpansion\\data\\QueryTime\\' + year + '.MBid_query_time.txt'
        tweetsEpochFile = 'E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\tweetsEpoch\\tweetsEpoch_'+ year + '.pkl'
        resultFile = 'E:\\eclipse\\QueryExpansion\\data\\BM25\\BM25_' + year + '.txt'
        # Output paths for the per-topN bandwidths and fitted KDE estimators.
        bandwidthPrfTimeFile = 'E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\KDE\\' + year + '\\band_prf' + str(topN) +'_' + year + '.pkl'
        kdePrfTimeFile ='E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\KDE\\' + year + '\\kde_prf' + str(topN) +'_' + year + '.pkl'
        # train the kde estimator
        topNResults = getTopNResults(resultFile, topN)
        queriesEpoch = getQueriesEpoch(queryTimeFile, year)
        tweetsEpoch = getPickleData(tweetsEpochFile)
        (bandwidthDict, kdeDict) = prfTimeKDE(topNResults, queriesEpoch, tweetsEpoch)
        writePickleData(bandwidthDict, bandwidthPrfTimeFile)
        writePickleData(kdeDict, kdePrfTimeFile)
        # NOTE(review): fragment of a __main__ driver script; the guard line and
        # the loop/assignments binding `topN` and `year` are above this view --
        # indentation reconstructed on that assumption, TODO confirm.
        stopFilePath = 'E:\\eclipse\\QueryExpansion\\data\\english.stop'
        indexedFile = 'E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\index\\' + 'tweet_index_' + year + '.pkl'
        queryTimeFile = 'E:\\eclipse\\QueryExpansion\\data\\QueryTime\\' + year + '.MBid_query_time.txt'
        tweetsEpochFile = 'E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\tweetsEpoch\\tweetsEpoch_'+ year + '.pkl'
        resultFile = 'E:\\eclipse\\QueryExpansion\\data\\BM25\\BM25_' + year + '.txt'
        # Output paths for the per-(query word, time) bandwidths and KDE models.
        bandwidthQwordTimeFile = '../data/pickle_data/KDE/' + year + '/qword_time/band_qword' + str(topN) + '_' + year + '.pkl'
        kdeQwordTimeFile = '../data/pickle_data/KDE/' + year + '/qword_time/kde_qword' + str(topN) + '_' + year + '.pkl'
        topNResults = getTopNResults(resultFile, topN)
        queriesEpoch = getQueriesEpoch(queryTimeFile, year)
        tweetsEpoch = getPickleData(tweetsEpochFile)
        stopWords = stopWordsGet(stopFilePath)
        queriesDict = getProcessedQueries(queryTimeFile, stopWords)
        wordsIndex = getPickleData(indexedFile)
        # Fit the query-word/time KDEs, then pickle bandwidths and estimators.
        (bandwidthDict, kdeDict) = qwordTimeKDE(topNResults, queriesEpoch, tweetsEpoch, queriesDict, wordsIndex)
        writePickleData(bandwidthDict, bandwidthQwordTimeFile)
        writePickleData(kdeDict, kdeQwordTimeFile)
            # NOTE(review): tail of `predictHanFunYinPos` (called from __main__
            # below); the `def` line and the loop binding `term` / `termJian`
            # are above this view -- indentation reconstructed, TODO confirm.
            hanFuYinDict[term] = jiebaPosDict[termJian]
            predictedCount += 1
    print 'pos predicted count for FuYin terms: ' + str(predictedCount)
    print 'total count of FuYin terms: ' + str(len(hanFuYinDict.keys()))
    # Fraction of polysyllabic terms whose POS was copied from the jieba dict.
    percentage = 1.0 * predictedCount / len(hanFuYinDict.keys())
    print 'predicted percentage: ' + str(percentage)
    return hanFuYinDict


if __name__=='__main__':
    # --------- Hanyu Da Cidian (dictionary of polysyllabic words) ---------
    hanFuYinFilePath = '../data/自动标注材料/词典/汉语大词典(复音词词典)/汉语大词典(第一版)词头.txt'
    label = '复音词'
    hanFuYinDict = getHanFuYinTerms(hanFuYinFilePath, label)
    jiebaDictFile = '../data/result/jieba/dict.txt'
    jiebaPosDict = getJiebaPosDict(jiebaDictFile)
    # Predict POS tags for polysyllabic terms from jieba's dictionary, pickle result.
    hanFuYinDict = predictHanFunYinPos(hanFuYinDict, jiebaPosDict)
    hanFuYinDictPredictFile = '../data/result/hanFuYin_PosDict_predict.pkl'
    writePickleData(hanFuYinDict, hanFuYinDictPredictFile)
# label = '复音词' # hanFuYinDict = getHanFuYinTerms(hanFuYinFilePath, label) hanFuYinDictPredictFile = '../data/result/hanFuYin_PosDict_predict.pkl' hanFuYinDict = getPickleData(hanFuYinDictPredictFile) #-----------------------------------------秦简库上网(单音1)------------------------------------- qinJianDanYinFilePath = '../data/自动标注材料/词典/秦简库上网(单音词词性标注1)/单音词词性.txt' qinJianDanYinDict = getQinJianDanYinTerms(qinJianDanYinFilePath) #-----------------------------------------古汉语常用字字典(单音2)------------------------------------- hanDanYinFilePath = '../data/自动标注材料/词典/古汉语常用字字典(单音词词性标注2)/单字词性表.txt' hanDanYinDict = getHanDanYinTerms(hanDanYinFilePath) hanDanYinDictU = unifyPosNames(hanDanYinDict, posNamesMap) mergedDanYinPosDict = merge_qinJianDanYin_hanDanYin(qinJianDanYinDict, hanDanYinDictU) mergedPosDict = mergeAllPosDictionary(hanClassifiedDictList, hanFuYinDict, mergedDanYinPosDict) # termPosFile = '../data/result/mergedPosDictionary.txt' # writeTermPosDict(mergedPosDict, termPosFile) termPosPickleFile = '../data/result/mergedPosDictionary.pkl' writePickleData(mergedPosDict, termPosPickleFile) # data = getPickleData(termPosPickleFile) # term = '癸未' # for pos in data[term].keys(): # print pos + ' ' + str(data[term][pos])