Пример #1
0
        text = entry[1]
        words = text.split()
        for word in words:
            wordsIndex.setdefault(word, {})
            wordsIndex[word].setdefault(docId, 0)
            wordsIndex[word][docId] += 1
        i += 1
        if i % 10000 == 0:
            print i
    
    print 'total documents indexed: ' + str(i)        
    wordsCount = len(wordsIndex.keys()) 
    print 'total words: ' + str(wordsCount)     
    return wordsIndex



if __name__=='__main__':
    # Driver: build the inverted index for one year's processed tweet file
    # and persist it as a pickle for later retrieval experiments.
    year = '2012'
#     filePath = 'E:\\eclipse\\QueryExpansion\\dataset\\processed\\test.txt'
    processedDir = 'E:\\eclipse\\QueryExpansion\\dataset\\processed\\'
    indexDir = 'E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\index\\'
    filePath = processedDir + 'tweet' + year + '_processed.txt'
    indexedFile = indexDir + 'tweet_index_' + year + '.pkl'

    wordsIndex = invertedIndexing(filePath)
    writePickleData(wordsIndex, indexedFile)
    
    



            
Пример #2
0
        print qid + ', best bandwidth: ' +  str(bandwidth)        
    return (bandwidthDict, kdeDict)


  


if __name__=='__main__' :
    # Driver: for each pseudo-relevance-feedback depth topN, fit a temporal
    # KDE over the top-N BM25 results per query and pickle the fitted
    # bandwidths and estimators.
    year = '2012'
    # Feedback depths 50, 100, ..., 500 (the comprehension in the original
    # was a no-op identity copy of range()).
    topNList = list(range(50, 501, 50))

    # None of these inputs depend on topN, so build the paths and load the
    # data once instead of re-reading the same files on every iteration.
    queryTimeFile = 'E:\\eclipse\\QueryExpansion\\data\\QueryTime\\' + year + '.MBid_query_time.txt'
    tweetsEpochFile = 'E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\tweetsEpoch\\tweetsEpoch_'+ year + '.pkl'
    resultFile = 'E:\\eclipse\\QueryExpansion\\data\\BM25\\BM25_' + year + '.txt'
    queriesEpoch = getQueriesEpoch(queryTimeFile, year)
    tweetsEpoch = getPickleData(tweetsEpochFile)

    for topN in topNList:
        # Per-depth output paths for the fitted bandwidths / KDE models.
        bandwidthPrfTimeFile = 'E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\KDE\\' + year + '\\band_prf' + str(topN) +'_' + year + '.pkl' 
        kdePrfTimeFile ='E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\KDE\\' + year + '\\kde_prf' + str(topN) +'_' + year + '.pkl' 

        # Train the KDE estimator on the top-N pseudo-relevant results.
        topNResults = getTopNResults(resultFile, topN)
        (bandwidthDict, kdeDict) = prfTimeKDE(topNResults, queriesEpoch, tweetsEpoch)
        writePickleData(bandwidthDict, bandwidthPrfTimeFile)
        writePickleData(kdeDict, kdePrfTimeFile)
    
    



Пример #3
0
    stopFilePath = 'E:\\eclipse\\QueryExpansion\\data\\english.stop'
    indexedFile = 'E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\index\\' + 'tweet_index_' + year + '.pkl'
    queryTimeFile = 'E:\\eclipse\\QueryExpansion\\data\\QueryTime\\' + year + '.MBid_query_time.txt'
    tweetsEpochFile = 'E:\\eclipse\\TemporalRetrieval\\data\\pickle_data\\tweetsEpoch\\tweetsEpoch_'+ year + '.pkl'
    resultFile = 'E:\\eclipse\\QueryExpansion\\data\\BM25\\BM25_' + year + '.txt'
    bandwidthQwordTimeFile = '../data/pickle_data/KDE/' + year + '/qword_time/band_qword' + str(topN) + '_' + year + '.pkl'
    kdeQwordTimeFile = '../data/pickle_data/KDE/' + year + '/qword_time/kde_qword' + str(topN) + '_' + year + '.pkl'
    
    topNResults = getTopNResults(resultFile, topN)
    queriesEpoch = getQueriesEpoch(queryTimeFile, year)
    tweetsEpoch = getPickleData(tweetsEpochFile)
    stopWords = stopWordsGet(stopFilePath)
    queriesDict = getProcessedQueries(queryTimeFile, stopWords)
    wordsIndex = getPickleData(indexedFile)
    (bandwidthDict, kdeDict) = qwordTimeKDE(topNResults, queriesEpoch, tweetsEpoch, queriesDict, wordsIndex)
    writePickleData(bandwidthDict, bandwidthQwordTimeFile)
    writePickleData(kdeDict, kdeQwordTimeFile)
    
    

       
            
    
    
    
        




            hanFuYinDict[term] = jiebaPosDict[termJian]
            predictedCount += 1
    print 'pos predicted count for FuYin terms: ' + str(predictedCount)
    print 'total count of FuYin terms: ' + str(len(hanFuYinDict.keys()))
    percentage = 1.0 * predictedCount / len(hanFuYinDict.keys())
    print 'predicted percentage: ' + str(percentage)
    return hanFuYinDict




if __name__=='__main__':  
    # Build a merged part-of-speech dictionary for classical Chinese terms:
    # 1) predict POS tags for polysyllabic (FuYin) terms via the jieba dict,
    # 2) load two monosyllabic (DanYin) POS sources and unify their tag names,
    # 3) merge everything and persist the result as a pickle.

    #----------------------------------------- Hanyu Da Cidian (polysyllabic-word dictionary) ------------------------------------- 
    hanFuYinFilePath = '../data/自动标注材料/词典/汉语大词典(复音词词典)/汉语大词典(第一版)词头.txt'
    # '复音词' means "polysyllabic word"; the literal must stay in Chinese —
    # it is matched against labels inside the dictionary file itself.
    label = '复音词'
    hanFuYinDict =  getHanFuYinTerms(hanFuYinFilePath, label)
    jiebaDictFile = '../data/result/jieba/dict.txt'
    jiebaPosDict = getJiebaPosDict(jiebaDictFile) 
    # NOTE(review): helper name spells "FunYin" (likely a typo for "FuYin");
    # it is defined elsewhere in the project, so the call is left unchanged.
    hanFuYinDict = predictHanFunYinPos(hanFuYinDict, jiebaPosDict)
    
    hanFuYinDictPredictFile = '../data/result/hanFuYin_PosDict_predict.pkl'
    writePickleData(hanFuYinDict, hanFuYinDictPredictFile) 
    
    
    
    
    
    
    

#     label = '复音词'
#     hanFuYinDict =  getHanFuYinTerms(hanFuYinFilePath, label) 
    
    # NOTE(review): the predicted dict written just above is immediately
    # re-read here; presumably the two halves of this script were once run
    # separately — confirm before simplifying.
    hanFuYinDictPredictFile = '../data/result/hanFuYin_PosDict_predict.pkl'
    hanFuYinDict = getPickleData(hanFuYinDictPredictFile)
    
    #----------------------------------------- Qin bamboo-slip corpus (monosyllabic POS source 1) -------------------------------------  
    qinJianDanYinFilePath = '../data/自动标注材料/词典/秦简库上网(单音词词性标注1)/单音词词性.txt'
    qinJianDanYinDict = getQinJianDanYinTerms(qinJianDanYinFilePath)
    
    #----------------------------------------- Dictionary of Common Classical Chinese Characters (monosyllabic POS source 2) -------------------------------------  
    hanDanYinFilePath = '../data/自动标注材料/词典/古汉语常用字字典(单音词词性标注2)/单字词性表.txt'
    hanDanYinDict = getHanDanYinTerms(hanDanYinFilePath) 
    # Unify this source's POS tag names with the project-wide naming scheme
    # before merging (posNamesMap is defined elsewhere in this file).
    hanDanYinDictU = unifyPosNames(hanDanYinDict, posNamesMap)
     
    mergedDanYinPosDict = merge_qinJianDanYin_hanDanYin(qinJianDanYinDict, hanDanYinDictU)
    mergedPosDict = mergeAllPosDictionary(hanClassifiedDictList, hanFuYinDict, mergedDanYinPosDict)
#     termPosFile = '../data/result/mergedPosDictionary.txt'
#     writeTermPosDict(mergedPosDict, termPosFile)
    
    termPosPickleFile = '../data/result/mergedPosDictionary.pkl' 
    writePickleData(mergedPosDict, termPosPickleFile)  
#     data = getPickleData(termPosPickleFile) 
#     term = '癸未'
#     for pos in data[term].keys():      
#         print pos + ' ' + str(data[term][pos])