def _updateScores(self, cursor, db_document_id, text):
    """Insert or update the document_score rows for the words of *text*.

    Only the first occurrence of each word in the normalized text is
    considered. A word that already has a score row gets its position
    refreshed (and committed) when it changed; a word without a score
    row gets a new one with zeroed statistics.
    """
    known_scores = self._getScoresDict(cursor, db_document_id)
    seen_words = set()
    for match in WORDS_RGX.finditer(normalizeText(text)):
        word = match.group(0)
        if word in seen_words:
            # later occurrences of a word are ignored
            continue
        seen_words.add(word)
        position = match.start()

        if word not in known_scores:
            # insert a row in the Word table if required
            self._ensureWordInDatabase(cursor, word)
            new_score = DocumentScore(db_document_id=db_document_id,
                                      word=word,
                                      position=position,
                                      download_count=0.,
                                      relevance=0.,
                                      popularity=0.)
            new_score.commit(cursor, update=False)
            continue

        stored = known_scores[word]
        if stored.position != position:
            stored.position = position
            stored.commit(cursor, update=True)
def _updateScores(self, cursor, db_document_id, text):
    """Insert or update the document_score rows for the words of *text*.

    Only the first occurrence of each word in the normalized text is
    considered. A word that already has a score row gets its position
    refreshed (and committed) when it changed; a word without a score
    row gets a new one with zeroed statistics.
    """
    known_scores = self._getScoresDict(cursor, db_document_id)
    seen_words = set()
    for match in WORDS_RGX.finditer(normalizeText(text)):
        word = match.group(0)
        if word in seen_words:
            # later occurrences of a word are ignored
            continue
        seen_words.add(word)
        position = match.start()

        if word not in known_scores:
            # insert a row in the Word table if required
            self._ensureWordInDatabase(cursor, word)
            new_score = DocumentScore(db_document_id=db_document_id,
                                      word=word,
                                      position=position,
                                      download_count=0.,
                                      relevance=0.,
                                      popularity=0.)
            new_score.commit(cursor, update=False)
            continue

        stored = known_scores[word]
        if stored.position != position:
            stored.position = position
            stored.commit(cursor, update=True)
def _updateDownloadStatistics(self, document, words):
    """Record one download of *document* obtained through *words*.

    The document's download count is incremented by one, and each word
    in *words* receives an equal fractional share (1 / len(words)) of
    that download, both in its Word row and in its DocumentScore row
    for this document. Popularity and relevance of every score are
    then recomputed as a download ratio minus
    ``hoeffding_deviation`` of the denominator.

    All changes are committed on the connection once every row has
    been written. An empty *words* only updates the document.
    """
    cursor = self._cnx.cursor()
    try:
        document.download_count = max(0, document.download_count) + 1
        document.commit(cursor, update=True)
        db_document_id = document.db_document_id

        # Use a float share: with plain ``1 / len(words)`` Python 2
        # integer division yields 0 as soon as there are two words, so
        # the counters would never grow and the ratios below could
        # divide by zero.
        word_count = len(words)
        share = 1.0 / word_count if word_count else 0.0

        scores = {}
        wordInfo = {}
        for word in words:
            scores[word] = DocumentScore.selectOrInsertWhere(
                cursor, db_document_id=db_document_id, word=word)[0]
            wordInfo[word] = Word.selectOrInsertWhere(cursor, word=word)[0]

        for winfo in wordInfo.values():
            winfo.download_count += share
            winfo.commit(cursor, update=True)

        for word, score in scores.items():
            score.download_count = max(0, score.download_count) + share
            winfo_downloads = wordInfo[word].download_count
            score.popularity = score.download_count / winfo_downloads
            score.popularity -= hoeffding_deviation(winfo_downloads)
            score.relevance = score.download_count / document.download_count
            score.relevance -= hoeffding_deviation(document.download_count)
            score.commit(cursor, update=True)
    finally:
        # release the cursor even when a commit above fails
        cursor.close()
    self._cnx.commit()
def _updateDownloadStatistics(self, document, words):
    """Record one download of *document* obtained through *words*.

    The document's download count is incremented by one, and each word
    in *words* receives an equal fractional share (1 / len(words)) of
    that download, both in its Word row and in its DocumentScore row
    for this document. Popularity and relevance of every score are
    then recomputed as a download ratio minus
    ``hoeffding_deviation`` of the denominator, and clamped to be at
    least 1e-6.

    All changes are committed on the connection once every row has
    been written. An empty *words* only updates the document.
    """
    cursor = self._cnx.cursor()
    try:
        document.download_count = max(0, document.download_count) + 1
        document.commit(cursor, update=True)
        db_document_id = document.db_document_id

        # One float share used for both the Word and the DocumentScore
        # counters. The word counter previously used ``1 / len(words)``,
        # which Python 2 truncates to 0 for two or more words, leaving
        # the word count behind the score count (and possibly at 0,
        # which breaks the popularity division below).
        word_count = len(words)
        share = 1.0 / word_count if word_count else 0.0

        scores = {}
        wordInfo = {}
        for word in words:
            scores[word] = DocumentScore.selectOrInsertWhere(
                cursor, db_document_id=db_document_id, word=word)[0]
            wordInfo[word] = Word.selectOrInsertWhere(cursor, word=word)[0]

        for winfo in wordInfo.values():
            winfo.download_count += share
            winfo.commit(cursor, update=True)

        for word, score in scores.items():
            score.download_count = max(0, score.download_count) + share
            winfo_downloads = wordInfo[word].download_count

            score.popularity = float(score.download_count) / winfo_downloads
            score.popularity -= hoeffding_deviation(winfo_downloads)
            # keep the stored value strictly positive
            score.popularity = max(1e-6, score.popularity)

            score.relevance = (float(score.download_count)
                               / document.download_count)
            score.relevance -= hoeffding_deviation(document.download_count)
            score.relevance = max(1e-6, score.relevance)

            score.commit(cursor, update=True)
    finally:
        # release the cursor even when a commit above fails
        cursor.close()
    self._cnx.commit()
def _getScoresDict(self, cursor, db_document_id):
    """Return the DocumentScore rows of a document, keyed by word.

    The rows come from ``DocumentScore.selectWhere`` for
    *db_document_id*. The result of that call is consumed from its end,
    so when several rows share a word the one nearest the front of the
    result is the one kept.
    """
    pending = DocumentScore.selectWhere(cursor,
                                        db_document_id=db_document_id)
    scores_by_word = {}
    while pending:
        entry = pending.pop()
        scores_by_word[entry.word] = entry
    return scores_by_word
def _getScoresDict(self, cursor, db_document_id):
    """Return the DocumentScore rows of a document, keyed by word.

    The rows come from ``DocumentScore.selectWhere`` for
    *db_document_id*. The result of that call is consumed from its end,
    so when several rows share a word the one nearest the front of the
    result is the one kept.
    """
    pending = DocumentScore.selectWhere(cursor,
                                        db_document_id=db_document_id)
    scores_by_word = {}
    while pending:
        entry = pending.pop()
        scores_by_word[entry.word] = entry
    return scores_by_word