Exemplo n.º 1
0
 def daatQuery(self, words):
     queryResult = []
     urlTable = URLTable()
     num = len(words)
     print "num of lists:", num
     lp = []
     openListStart = time.clock()
     for i in range(num):
         lp.append(self.openList(words[i]))
     print "time to openlist:", str(time.clock() - openListStart)
     sortStart = time.clock()
     bm25Time = 0
     lp.sort(key=attrgetter('size'))
     # for i in range(1,len(lp)):
     #     if
     did = 0
     print "time sorting list:", str(time.clock() - sortStart)
     nextGEQTime=0
     while did < config.MAXDID:
         nGEQStart=time.clock()
         did = self.nextGEQ(lp[0], did)
         nextGEQTime+=time.clock()-nGEQStart
         if did == config.MAXDID:
             break
         d = None
         for i in range(1, num):
             nGEQStart=time.clock()
             d = self.nextGEQ(lp[i], did)
             nextGEQTime+=time.clock()-nGEQStart
             if d != did:
                 break
         if d is not None and d > did:
             did = d
         else:
             resultItem = ResultItem()
             resultItem.docID = did
             resultItem.url = urlTable[did].url
             score = 0
             bm25Start = time.clock()
             for i in range(num):
                 freq = self.getFreq(lp[i], did)
                 score += bm25.getBM25(freq, lp[i].size, urlTable.N, urlTable[did].dl, urlTable.avgdl)
                 pos = self.getPos(lp[i], did)
                 resultItem.pos.append(pos)
             bm25Time += time.clock() - bm25Start
             resultItem.bm25 = score
             resultItem.score = resultItem.bm25
             queryResult.append(resultItem)
             did += 1
     print "total time nextGEQ:",nextGEQTime
     print "total time bm25: ", bm25Time
     return queryResult
Exemplo n.º 2
0
	def queryWords(self, query, start, limit):		
		lexiconTable = LexiconTable()
		urlTable = URLTable()
		words = self.parseQuery(query)
		begin = time.clock()
		sets = []
		indexMap = {}
		for word in words:
			index = self.getIndex(word)
			docset = set([])
			if index is not None:
				docset = set(index.keys())
			indexMap[word] = index
			sets.append(docset)
		resultset = set.intersection(*sets)
		print "get result set time:",str(time.clock()-begin)
		#pagerank = PageRank()
		resultSize = len(resultset)
		print "result size:",resultSize
		begin = time.clock()
		if start > resultSize-1:
			return []
		queryResult = []
		for docID in resultset:
			resultItem = ResultItem()
			resultItem.docID = docID
			resultItem.url = urlTable[docID].url
			bm25Score = 0
			for word in words:
				bm25Score += bm25.getBM25(indexMap[word][docID].occurence, lexiconTable[word.lower()].occurence, urlTable.N, urlTable[docID].pagesize, urlTable.avgdl)
			resultItem.bm25 = bm25Score
			resultItem.score = resultItem.bm25
			queryResult.append(resultItem)
		print "BM25 time:",str(time.clock()-begin)
		begin = time.clock()
		queryResult = sorted(queryResult, key=attrgetter('score'), reverse=True)
		print "sort BM25 time:",str(time.clock()-begin)
		begin = time.clock()
		startIndex = 0 if start < 0 else start
		endIndex = startIndex + limit
		endIndex = resultSize if endIndex > resultSize else endIndex
		print "start index & end index:", startIndex, endIndex
		queryResult = queryResult[startIndex:endIndex]
		for item in queryResult:
			item.snippet = cgi.escape(self.getSnippet(item.docID, item.url, words))
		print "snippet time:",str(time.clock()-begin)
		return queryResult, resultSize