def recalculateKeywordFrequencies():
    # Rebuild keyword scores from scratch across every stored document.
    spidermodel.resetScoresToZero()
    for docUrl in spidermodel.allDocumentUrls():
        doc = spidermodel.getDocument(docUrl)
        addKeywordScores(doc)
    spidermodel.storeKeywordData()
    print('Keyword frequencies were recalculated.')
import heapq  # needed by unvisitedUrlsWithScores(); harmless if already imported at the top of the module

def unvisitedUrlsWithScores(count):
    # Score every not-yet-fetched URL by the best score among the documents
    # that link to it, damping each document's contribution by how many
    # pages we already hold from the same site.
    urlScores = {}
    allUrls = set(spidermodel.allDocumentUrls())
    counts = siteCounts(allUrls)
    for docUrl in allUrls:
        doc = spidermodel.getDocument(docUrl)
        score = doc.score / (counts[site(docUrl)] + 1)
        for url in doc.links:
            if url in allUrls:
                pass  # already fetched; not a candidate
            elif url in urlScores:
                urlScores[url] = max(urlScores[url], score)
            else:
                urlScores[url] = score
    #return sorted( [ScoredUrl(url, urlScores[url]) for url in urlScores if urlOk(url)] )[0:count]
    return heapq.nsmallest(
        count,  # was hardcoded to 100, which silently ignored the count parameter
        [ScoredUrl(url, urlScores[url]) for url in urlScores if urlOk(url)])
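# The helpers used above are defined elsewhere in this project. For
# heapq.nsmallest() to return the *highest*-scoring URLs, ScoredUrl must
# compare in descending score order (the commented-out sorted() variant
# implies the same ordering). Below is a minimal sketch of plausible
# definitions, kept commented out so it cannot shadow the real ones; the
# bodies are assumptions, not the project's actual code:
#
# from collections import Counter
# from urllib.parse import urlparse
#
# class ScoredUrl:
#     def __init__(self, url, score):
#         self.url = url
#         self.score = score
#     def __lt__(self, other):
#         return self.score > other.score  # higher score sorts first
#
# def site(url):
#     return urlparse(url).netloc  # host part, e.g. 'example.org'
#
# def siteCounts(urls):
#     return Counter(site(url) for url in urls)  # documents held per site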
def linksForDocUrl(allScores, docUrl, minScore):
    doc = spidermodel.getDocument(docUrl)
    return {'url': doc.url, 'score': doc.score, 'links': goodlinks(allScores, doc, minScore)}
def docInfo(url):
    doc = spidermodel.getDocument(url)
    wordCount = sum(doc.wordFreq.values())
    words = [wordInfo(w, doc.wordFreq[w], wordCount) for w in doc.wordFreq]
    return {'url': url, 'status': doc.status, 'words': words, 'wordCount': wordCount}
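# wordInfo() is assumed to package the statistics for a single word; a
# plausible sketch (hypothetical, not the project's actual definition):
#
# def wordInfo(word, count, totalCount):
#     return {'word': word, 'count': count, 'frequency': count / totalCount}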
def recalculateDocumentScores():
    for docUrl in spidermodel.allDocumentUrls():
        doc = spidermodel.getDocument(docUrl)
        recalculateDocScore(doc)
        spidermodel.putDocument(doc)
    print('Document scores were recalculated.')
def getScoredDocUrl(docUrl):
    doc = spidermodel.getDocument(docUrl)
    return ScoredUrl(doc.url, doc.score)
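# Example crawl-maintenance sequence (illustrative only; assumes a
# populated spidermodel store):
#
# recalculateKeywordFrequencies()
# recalculateDocumentScores()
# for scored in unvisitedUrlsWithScores(20):
#     print(scored.url, scored.score)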