class TestSimpleDBIndex(unittest.TestCase):
    """Unit tests for SimpleDBIndex.

    Uses mockito-style stubs (when/verify/any) for the SimpleDB client for
    the pure-logic tests; the add/get tests hit a real (test-prefixed)
    SimpleDB domain and are destructive to it.
    """

    def setUp(self):
        # Dedicated test prefix so destructive tests never touch production domains.
        self.sdbindex = SimpleDBIndex(domainprefix="atbroxtest")

    def testGetTermLine(self):
        # An offset in the 13th item chunk must yield a term suffixed with "<sep>13".
        i = 1 + 13 * self.sdbindex.MAXITEMSIZE
        origterm = "gettermline"
        term, termLine = self.sdbindex._getTermLine(origterm, i)
        self.assertEqual(origterm + self.sdbindex.ITEMINDEXSEPARATOR + str(13), term)

    def testTermLineSize(self):
        # Size is key length + value length: len("termlinesize") + len("456") = 12 + 3 = 15.
        termLine = {"termlinesize": "456"}
        self.assertEqual(15, self.sdbindex._termLineSize(termLine))

    def testFlushCache(self):
        when(self.sdbindex.sdb).batch_put_attributes(any(str), any()).thenReturn("")
        cacheentry = ("testflushcache", {"1": "abc", "2": "def"})
        self.sdbindex.batchcache.append(cacheentry)
        self.sdbindex._flushcache()
        # One cached entry -> exactly one batched write to SimpleDB.
        self.assertEqual(None, verify(self.sdbindex.sdb, times=1).batch_put_attributes(any(str), any()))

    def testStore(self):
        when(self.sdbindex.sdb).batch_put_attributes(any(str), any()).thenReturn("")
        term = "teststoreterm"
        # A term line well below MAXITEMSIZE, so only the explicit flush triggers a write.
        termLine = {"0": "a" * (self.sdbindex.MAXITEMSIZE / 10)}
        self.sdbindex._store(term, termLine, flush=True)
        # flush=True forces exactly one flush (the original comment claimed
        # "twice" but the code has always verified times=1).
        self.assertEqual(None, verify(self.sdbindex.sdb, times=1).batch_put_attributes(any(str), any()))

    def testAddInvertedFileEntry(self):
        # WARNING: destructive -- wipes and recreates the test SimpleDB domains.
        self.sdbindex._warningthisdeletesallsimpledbdomains()
        self.sdbindex._createsimpledbdomains()
        term = "termtoput"
        vector = "thistermvectorshouldbeputandstored"
        self.sdbindex.addInvertedFileEntry(term, vector)
        self.assertTrue("termtoput" in self.sdbindex.domain.keys())
        self.assertEqual(1, len(self.sdbindex.domain.keys()))

    def testGetInvertedFileEntry(self):
        self.sdbindex._warningthisdeletesallsimpledbdomains()
        self.sdbindex._createsimpledbdomains()
        term = "termtoget"
        putvector = "valuetoget"
        self.sdbindex.addInvertedFileEntry(term, putvector)
        self.sdbindex._flushcache()
        # SimpleDB is eventually consistent; give the write time to propagate.
        time.sleep(4)
        getvector = self.sdbindex.getInvertedFileEntry(term)
        self.assertEqual(getvector, putvector)

    def testAddAndHashUrl(self):
        self.sdbindex._warningthisdeletesallsimpledbdomains()
        self.sdbindex._createsimpledbdomains()
        url = "http://atbrox.com"
        urlhash = self.sdbindex.addAndHashUrl(url)
        # The returned hash is the stringified builtin hash of the url.
        self.assertEqual(str(url.__hash__()), urlhash)

    def testGetUrl(self):
        self.sdbindex._warningthisdeletesallsimpledbdomains()
        self.sdbindex._createsimpledbdomains()
        url = "http://www.atbrox.com"
        urlhash = self.sdbindex.addAndHashUrl(url)
        # Round trip: the hash stored by addAndHashUrl resolves back to the url.
        self.assertEqual(url, self.sdbindex.getUrl(urlhash))
class SimpleDBSearch:
    """Tiny positional inverted-file search engine backed by SimpleDBIndex.

    A term's inverted-file entry is a string of the form
    "&hash|p1|p2&hash|p1..." -- page entries separated by PAGESEPARATOR,
    with the url hash and its term positions separated by POSITIONSEPARATOR.
    """

    def __init__(self):
        self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        self.termindex = {}
        self.sdbindex = SimpleDBIndex()
        self.PAGESEPARATOR = """&"""  # safe since only numbers used
        self.POSITIONSEPARATOR = """|"""  # safe since only numbers separating
        # Strips (, ), commas, colons and semicolons from tokens before indexing.
        self.CLEANREGEXP = re.compile(r"(\(|\)|\,|\:|\;)")

    def _getSentences(self, document):
        """Split document into sentences with the punkt tokenizer."""
        return self.sent_detector.tokenize(document)

    def _getAllTermsInOrder(self, document):
        """Return all whitespace-separated tokens of document, in order."""
        allterms = []
        sentences = self._getSentences(document)
        for sentence in sentences:
            allterms += sentence.split()
        return allterms

    def _getTermsWithPositions(self, document):
        """Map each cleaned, lowercased term to its list of positions (as strings)."""
        allterms = self._getAllTermsInOrder(document)
        termswithpos = {}
        for pos, term in enumerate(allterms):
            indexterm = self.CLEANREGEXP.sub("", term).lower().strip()
            # (removed a no-op "indexterm.lower().strip()" expression statement)
            if indexterm != "":
                termswithpos[indexterm] = termswithpos.get(indexterm, []) + [str(pos)]
        return termswithpos

    def index(self, url, document):
        """Add document (identified by url) to the in-memory term index.

        Returns the updated termindex mapping term -> inverted-file string.
        """
        urlhash = self.sdbindex.addAndHashUrl(url)
        termswithpositions = self._getTermsWithPositions(document)
        for term in termswithpositions:
            # Append one page entry: first item is the hash of the url,
            # the rest are the term's positions within the document.
            self.termindex[term] = self.termindex.get(term, "")
            self.termindex[term] += self.PAGESEPARATOR
            self.termindex[term] += self.POSITIONSEPARATOR.join([urlhash] + termswithpositions[term])
        return self.termindex

    def writeIndexToSDB(self):
        """Persist the in-memory term index to SimpleDB and flush the write cache."""
        i = 0
        numterms = len(self.termindex.keys())
        for term in self.termindex:
            # Single-argument print(...) works identically on Python 2 and 3.
            print("%d of %d adding tv for term: '%s' to SDB" % (i, numterms, term))
            self.sdbindex.addInvertedFileEntry(term, self.termindex[term])
            i += 1
        print("flushing cache")
        self.sdbindex._flushcache()

    def extractUrlHashListFromInvertedFileEntry(self, invertedFileEntry):
        """Return the sorted list of url hashes found in an inverted-file entry."""
        pages = invertedFileEntry.split(self.PAGESEPARATOR)
        # TODO: extract positions
        urlhashlist = []
        for page in pages:
            if page == "":  # entries start with PAGESEPARATOR, so skip the empty head
                continue
            urlhashlist += [page.split(self.POSITIONSEPARATOR)[0]]
        urlhashlist.sort()
        return urlhashlist

    def query(self, query):
        """Return urls matching the query, ranked by number of matching terms.

        Bug fixes vs. the original: the (matches, urlhash) results were
        clobbered with an empty list before being iterated (query always
        returned []), and the url lookup referenced a misspelled attribute
        ("sdbindx").
        """
        urlhashforterms = {}
        terms = query.split()
        for term in terms:
            t0 = time.time()
            invertedFileEntry = self.sdbindex.getInvertedFileEntry(term)
            t1 = time.time()
            print("fetchtime|%f" % (t1 - t0))
            urlhashes = self.extractUrlHashListFromInvertedFileEntry(invertedFileEntry)
            for urlhash in urlhashes:
                urlhashforterms[urlhash] = urlhashforterms.get(urlhash, []) + [term]
        # Rank url hashes by how many query terms matched them, best first.
        ranked = sorted(((len(urlhashforterms[h]), h) for h in urlhashforterms), reverse=True)
        results = []
        for matches, urlhash in ranked:
            results.append(self.sdbindex.getUrl(urlhash))
        return results