def _updateScores(self, cursor, db_document_id, text):
    # insert or update in table document_score
    db_scores = self._getScoresDict(cursor, db_document_id)
    doc_scores = {}
    # We update the document_score table only for the first
    # occurrence of the word in the document
    for match in WORDS_RGX.finditer(normalizeText(text)):
        word = match.group(0)
        if word in doc_scores:
            continue
        doc_scores[word] = 0
        position = match.start()
        if word in db_scores:
            if db_scores[word].position != position:
                db_scores[word].position = position
                db_scores[word].commit(cursor, update=True)
        else:
            # insert a row in the Word table if required
            self._ensureWordInDatabase(cursor, word)
            db_score = DocumentScore(db_document_id=db_document_id,
                                     word=word,
                                     position=position,
                                     download_count=0.,
                                     relevance=0.,
                                     popularity=0.)
            db_score.commit(cursor, update=False)

def testParseHtmlFileWithEncoding(self):
    filename = join(DATADIR, 'encoded.html')
    title, text, links, offset = self.parser.parseFile(filename, 'encoded.html',
                                                       'iso-8859-1')
    self.assertEquals(title, 'maille Maay')
    self.assertEquals(normalizeText(text),
                      'hello ete world this is a link and this is another link')
    self.assertEquals(links, ['something.com', 'somethingelse.com'])

def _selectContainingQuery(cls, words):
    words = [normalizeText(unicode(w)) for w in words
             if WORD_MIN_LEN <= len(w) <= WORD_MAX_LEN]
    if not words:
        return ''
    # Question: what is the HAVING clause supposed to do?
    # Answer: we select all documents containing one of the words
    # that we are looking for, group them by their identifier, and
    # only keep those identifiers which appeared once for each word
    # we were looking for.
    query = ("SELECT D.db_document_id, "
             "D.document_id, "
             "D.title, "
             "D.size, "
             "D.text, "
             "D.url, "
             "D.mime_type "
             "FROM documents D, document_scores DS "
             "WHERE DS.db_document_id=D.db_document_id "
             "AND DS.word IN (%s) "
             "GROUP BY DS.db_document_id "
             "HAVING count(DS.db_document_id) = %%s"
             % (', '.join(['%s'] * len(words))))
    return query, words + [len(words)]

def testParseRaw(self):
    html = '<body>%s</body>' % RAW_TEXT
    title, text, links, offset = self.parser.parseString(html)
    # parseString() should return an empty title when none is available in the HTML
    self.assertEquals(title, '')
    self.assertEquals(normalizeText(text), RAW_TEXT.replace(u'é', 'e'))
    self.assertEquals(links, [])

def testParseSimpleHtml(self):
    title, text, links, offset = self.parser.parseString(SIMPLE_HTML)
    self.assertEquals(title, 'maille Maay')
    self.assertEquals(normalizeText(text),
                      'hello ete world this is a link and this is another link')
    self.assertEquals(links, ['something.com', 'somethingelse.com'])

def render_prevset_url(self, context, data):
    words = WORDS_RGX.findall(
        normalizeText(unicode(context.arg('words'), 'utf-8')))
    # step back one result page (a page holds 15 hits, cf. the LIMIT 15
    # clause in _selectContainingQuery)
    offset = int(context.arg('offset', 0))
    if offset:
        offset -= 15
    return 'search?words=%s&offset=%s' % ('+'.join(words), offset)

def testTitleGuess(self):
    """Make sure the title is the filename when we treat a text file
    or no title could be found
    """
    title, text, links, offset = self.parser.parseFile(
        join(DATADIR, "notitle.html"), 'notitle.html')
    self.assertEquals(title, 'notitle.html')
    self.assertEquals(normalizeText(text), "maille maay")
    self.assertEquals(links, [])

def findDocuments(self, query):
    """Find all indexed documents matching the query"""
    words = WORDS_RGX.findall(normalizeText(query))
    self._updateQueryStatistics(words)
    # open the cursor outside the try block so that cursor.close() in the
    # finally clause cannot hit an unbound name if cursor() itself fails
    cursor = self._cnx.cursor()
    try:
        return Document.selectContaining(cursor, words)
    finally:
        cursor.close()

def findDocuments(self, query):
    """Find all indexed documents matching the query"""
    # TODO: order results using document_scores information
    words = WORDS_RGX.findall(normalizeText(unicode(query.words)))
    self._updateQueryStatistics(words)
    cursor = self._cnx.cursor()
    try:
        return Document.selectContaining(cursor, words, query.filetype,
                                         query.offset, self.searchInPrivate)
    finally:
        cursor.close()

def notifyDownload(self, db_document_id, query):
    words = WORDS_RGX.findall(normalizeText(query))
    try:
        cursor = self._cnx.cursor()
        try:
            doc = Document.selectWhere(cursor,
                                       db_document_id=db_document_id)[0]
        finally:
            cursor.close()
        self._updateDownloadStatistics(doc, words)
        return doc.url
    except IndexError:
        return ''

def testTitleGuess(self):
    # XXX: complete this with PDF/PS files before commit time!
    """Make sure the title is the filename when we treat a text file
    or no title could be found
    """
    title, text, links, offset = self.parser.parseFile(
        join(DATADIR, 'latin1.txt'), 'latin1.txt', 'ISO-8859-1')
    self.assertEquals(title, 'latin1.txt')
    self.assertEquals(normalizeText(text), "c'est l'ete")
    self.assertEquals(links, [])
    # Now, the PS file
    title, text, links, offset = self.parser.parseFile(
        join(DATADIR, 'utf8.ps'), 'utf8.ps', 'UTF-8')
    self.assertEquals(title, 'utf8.ps')
    self.assertEquals(links, [])
    # The PDF (yes, it's important to test this too)
    title, text, links, offset = self.parser.parseFile(
        join(DATADIR, 'utf8.pdf'), 'utf8.pdf', 'UTF-8')
    self.assertEquals(title, 'utf8.pdf')
    self.assertEquals(links, [])

def _selectContainingQuery(cls, words, mimetype=None, offset=0, allowPrivate=False):
    words = [normalizeText(unicode(w)) for w in words
             if WORD_MIN_LEN <= len(w) <= WORD_MAX_LEN]
    # XXX mimetype handling is a HACK. It needs to be integrated
    # nicely in order to handle any kind of restriction easily
    if mimetype is not None:
        restriction = " AND D.mime_type=%s "
        restrictionParams = [unicode(mimetype)]
    else:
        restriction = ""
        restrictionParams = []
    if not allowPrivate:
        restriction += " AND D.state!=%s "
        restrictionParams.append(cls.PRIVATE_STATE)
    # Question: what is the HAVING clause supposed to do?
    # Answer: we select all documents containing one of the words
    # that we are looking for, group them by their identifier, and
    # only keep those identifiers which appeared once for each word
    # we were looking for.
    # XXX: LIMIT clause should be optional
    query = ("SELECT D.db_document_id, "
             "D.document_id, "
             "D.title, "
             "D.size, "
             "D.text, "
             "D.url, "
             "D.mime_type, "
             "D.publication_time "
             "FROM documents D, document_scores DS "
             "WHERE DS.db_document_id=D.db_document_id "
             "AND DS.word IN (%s) "
             " %s "
             "GROUP BY DS.db_document_id "
             "HAVING count(DS.db_document_id) = %%s "
             "ORDER BY D.publication_time DESC "
             "LIMIT 15 OFFSET %s"
             % (', '.join(['%s'] * len(words)), restriction, offset))
    return query, words + restrictionParams + [len(words)]

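# Usage sketch (an assumption, not taken from the project sources): the pair
# returned by _selectContainingQuery is a SQL string with DB-API '%s'
# placeholders plus the matching parameter list, so the wrapping classmethod
# presumably hands both straight to cursor.execute(). The body below is only
# illustrative; the selectContaining() signature is the one seen in
# findDocuments() above, and fromRow() is a hypothetical helper.
def selectContaining(cls, cursor, words, mimetype=None, offset=0, allowPrivate=False):
    query, args = cls._selectContainingQuery(words, mimetype, offset, allowPrivate)
    cursor.execute(query, args)
    # build one Document instance per returned row
    return [cls.fromRow(row) for row in cursor.fetchall()]
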
def testNormalizeText(self):
    text = u"À Paris,\t\x02l'été \nsera chaud"
    norm = normalizeText(text)
    self.assertEquals(u"a paris, l'ete sera chaud", norm)
    self.assertEquals(unicode, type(norm))

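# The test above pins down what normalizeText() is expected to do: strip
# accents, lowercase, and collapse control characters and whitespace runs
# while keeping punctuation, always returning unicode. A minimal sketch
# consistent with that behaviour (an assumption, not the project's actual
# implementation) could look like this:
import unicodedata

def normalizeText(text):
    # decompose accented characters and drop the combining marks
    decomposed = unicodedata.normalize('NFKD', unicode(text))
    stripped = u''.join(c for c in decomposed if not unicodedata.combining(c))
    # turn control characters (tab, newline, \x02, ...) into spaces
    stripped = u''.join(c if not unicodedata.category(c).startswith('C') else u' '
                        for c in stripped)
    # lowercase and collapse runs of whitespace into single spaces
    return u' '.join(stripped.lower().split())
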
def render_nextset_url(self, context, data):
    words = WORDS_RGX.findall(
        normalizeText(unicode(context.arg('words'), 'utf-8')))
    # step forward one result page of 15 hits
    offset = int(context.arg('offset', 0)) + 15
    return 'search?words=%s&offset=%s' % ('+'.join(words), offset)