示例#1
0
def recalculateKeywordFrequencies():
	"""Rebuild the keyword scores from every stored document.

	Resets the scores through spidermodel, feeds each stored document to
	addKeywordScores, stores the resulting keyword data, and prints a
	confirmation message.
	"""
	spidermodel.resetScoresToZero()
	urls = spidermodel.allDocumentUrls()
	for url in urls:
		addKeywordScores(spidermodel.getDocument(url))
	spidermodel.storeKeywordData()
	print('Keyword frequencies were recalculated.')
示例#2
0
def unvisitedUrlsWithScores(count):
	"""Return at most `count` ScoredUrl entries for links that are not stored documents.

	Every stored document contributes a score of
	doc.score / (counts[site(docUrl)] + 1) to each of its outgoing links.
	Links that are already stored documents are skipped; a link reached from
	several documents keeps the highest contributed score. Links rejected by
	urlOk are dropped.

	count: maximum number of entries to return.
	Returns the `count` smallest ScoredUrl objects according to ScoredUrl's
	own ordering, as a list sorted in that order.
	"""
	urlScores = {}
	allUrls = set(spidermodel.allDocumentUrls())
	counts = siteCounts(allUrls)
	for docUrl in allUrls:
		doc = spidermodel.getDocument(docUrl)
		# NOTE(review): counts[...] is presumably the number of known
		# documents on the same site; the +1 keeps the divisor non-zero.
		score = doc.score / (counts[site(docUrl)] + 1)
		for url in doc.links:
			if url in allUrls:
				# Already a stored document, so it is not "unvisited".
				continue
			urlScores[url] = max(urlScores.get(url, score), score)
	candidates = (ScoredUrl(url, best) for url, best in urlScores.items() if urlOk(url))
	# Honour the caller's limit instead of the previous hard-coded 100.
	return heapq.nsmallest(count, candidates)
示例#3
0
def linksForDocUrl(allScores,docUrl,minScore):
	"""Describe a stored document and its links as a dict.

	Looks the document up by docUrl and returns a dict with the keys 'url'
	and 'score' (taken from the document) and 'links' (the result of
	goodlinks(allScores, doc, minScore)).
	"""
	document = spidermodel.getDocument(docUrl)
	summary = {}
	summary['url'] = document.url
	summary['score'] = document.score
	summary['links'] = goodlinks(allScores, document, minScore)
	return summary
示例#4
0
def docInfo(url):
	"""Return status and per-word information for the document stored at url.

	The result is a dict with the keys 'url', 'status', 'words' (one
	wordInfo(word, frequency, wordCount) entry per word in doc.wordFreq) and
	'wordCount' (the sum of all word frequencies).
	"""
	document = spidermodel.getDocument(url)
	frequencies = document.wordFreq
	total = sum(frequencies.values())
	entries = []
	for word, frequency in frequencies.items():
		entries.append(wordInfo(word, frequency, total))
	return {'url':url,'status':document.status,'words':entries,'wordCount':total}
示例#5
0
def recalculateDocumentScores():
	"""Recompute and store the score of every stored document.

	Each document is loaded, passed to recalculateDocScore, and written back
	with spidermodel.putDocument; a confirmation message is printed at the end.
	"""
	urls = spidermodel.allDocumentUrls()
	for url in urls:
		document = spidermodel.getDocument(url)
		recalculateDocScore(document)
		spidermodel.putDocument(document)
	print('Document scores were recalculated.')
示例#6
0
def getScoredDocUrl(docUrl):
	"""Return a ScoredUrl built from the stored document's url and score."""
	document = spidermodel.getDocument(docUrl)
	url, score = document.url, document.score
	return ScoredUrl(url, score)