Пример #1
0
	def search(searchTerm):
		"""Return the best-scoring half of the articles that contain ``searchTerm``.

		The term is converted with ``str()`` and stripped, then looked up in
		``Corpus.wordIndex``. Every article listed for the term is scored as
		``(count in article / Corpus.articleIndex[article]) * log10(N / df)``
		where ``N`` is ``len(Corpus.articleIndex)`` and ``df`` is the number of
		articles listed for the term.

		Returns:
			A list of ``(article, tfidf)`` tuples sorted by descending score and
			cut to the top half (rounded down). The list is empty when the term
			is not in the index or is listed for fewer than two articles.

		Side effects:
			Prints progress messages and every returned article key, and
			reports the query to ``searchLog``.
		"""
		starttime = time.time()
		print(">>SEARCHARTICLES: Search for top TF-IDF values has started.")

		term = str(searchTerm).strip()
		# term = lemmatise_input_term(term)
		if term not in Corpus.wordIndex:
			print(">>SEARCHARTICLES: '%s' 0 articles. Search terminated." % term)
			return []

		articlehits = Corpus.wordIndex[term]
		# One result is produced per hit, so the "too few" case can be decided
		# before any scoring work is done.
		if len(articlehits) < 2:
			print(">>SEARCHARTICLES: '%s' too few articles. Search terminated." % term)
			return []

		# IDF is shared by every hit; float() keeps the ratio fractional.
		IDF = math.log10(len(Corpus.articleIndex) / float(len(articlehits)))
		results = [
			(article, (articlehits[article] / float(Corpus.articleIndex[article])) * IDF)
			for article in articlehits
		]

		# Sort results on their TFIDF rating, in decreasing order.
		results.sort(key=lambda result: result[1], reverse=True)
		print(">>SEARCHARTICLES: '%s' \t %s articles." % (term, len(results)))

		# Deletes the bottom 50% of our search results; '//' keeps the slice
		# bound an integer regardless of the division semantics in effect.
		results = results[:len(results) // 2]

		for article, _ in results:
			print(article)

		totalTime = round((time.time() - starttime), 3)

		print(">>SEARCHARTICLES: Search has completed in %s seconds." % totalTime)
		searchLog(term, len(results), totalTime)

		return results
Пример #2
0
	def search(self, searchTerm):
		"""Return the best-scoring half of the articles that contain ``searchTerm``.

		The term is converted with ``str()`` and stripped, then looked up in
		``self.wordIndex``. Every article listed for the term is scored as
		``(count in article / self.articleIndex[article][0]) * log10(N / df)``
		where ``N`` is ``len(self.articleIndex)`` and ``df`` is the number of
		articles listed for the term.

		Returns:
			A list of article keys (scores stripped) ordered by descending
			TF-IDF and cut to the top half (rounded down). The list is empty
			when the term is not in the index or is listed for fewer than two
			articles.

		Side effects:
			Sets ``self.sentimentscore`` to the sum of
			``self.articleIndex[article][1]`` over the returned articles (0 when
			nothing is returned), prints progress messages, and reports to
			``searchLog`` and ``sentimentLog`` on a successful search.
		"""
		starttime = time.time()
		# Reset up front so an early return never leaves the previous query's
		# score on the instance.
		self.sentimentscore = 0

		term = str(searchTerm).strip()
		# term = lemmatise_input_term(term)
		if term not in self.wordIndex:
			print(">>SEARCHARTICLES: '%s' has 0 articles. Search terminated." % term)
			return []

		articlehits = self.wordIndex[term]
		# One result is produced per hit, so the "too few" case can be decided
		# before any scoring work is done.
		if len(articlehits) < 2:
			print(">>SEARCHARTICLES: '%s' has %s article(s). Search terminated." % (term, len(articlehits)))
			return []

		# IDF is shared by every hit; float() keeps the ratio fractional.
		IDF = math.log10(len(self.articleIndex) / float(len(articlehits)))
		scored = [
			(article, (articlehits[article] / float(self.articleIndex[article][0])) * IDF)
			for article in articlehits
		]

		# Sort results on their TFIDF rating, in decreasing order.
		scored.sort(key=lambda result: result[1], reverse=True)
		keep = len(scored) // 2
		print(">>SEARCHARTICLES: '%s' has %s articles (%s returned)." % (term, len(scored), keep))

		# Drop the bottom 50% and strip the TFIDF values from what remains.
		results = [article for article, _ in scored[:keep]]
		self.sentimentscore = sum(self.articleIndex[article][1] for article in results)

		totalTime = round((time.time() - starttime), 3)

		searchLog(term, len(results), totalTime)
		sentimentLog(term, self.sentimentscore)
		return results
Пример #3
0
	def search(self, searchTerm):
		"""Return the best-scoring half of the articles that contain ``searchTerm``.

		The term is converted with ``str()`` and stripped, then looked up in
		``self.wordIndex``. Every article listed for the term is scored as
		``(count in article / self.articleIndex[article][0]) * log10(N / df)``
		where ``N`` is ``len(self.articleIndex)`` and ``df`` is the number of
		articles listed for the term.

		Returns:
			A list of article keys (scores stripped) ordered by descending
			TF-IDF and cut to the top half (rounded down). The list is empty
			when the term is not in the index or is listed for fewer than two
			articles.

		Side effects:
			Sets ``self.sentimentscore`` to the sum of
			``self.articleIndex[article][1]`` over the returned articles (0 when
			nothing is returned), prints progress messages, and reports to
			``searchLog`` and ``sentimentArticleLog`` on a successful search.
		"""
		starttime = time.time() # logging purposes
		# Reset up front so an early return never leaves the previous query's
		# score on the instance.
		self.sentimentscore = 0

		term = str(searchTerm).strip()
		if term not in self.wordIndex:
			print(">>SEARCHARTICLES: '%s' has 0 articles. Search terminated." % term)
			return []

		articlehits = self.wordIndex[term]
		# One entry is produced per hit, so the "too few" case can be decided
		# before any scoring work is done.
		if len(articlehits) < 2:
			print(">>SEARCHARTICLES: '%s' has %s article(s). Search terminated." % (term, len(articlehits)))
			return []

		# IDF is shared by every hit; float() keeps the ratio fractional.
		IDF = math.log10(len(self.articleIndex) / float(len(articlehits)))
		scored = [
			(article, (articlehits[article] / float(self.articleIndex[article][0])) * IDF)
			for article in articlehits
		]

		# Sort subset on their TFIDF rating, in decreasing order.
		scored.sort(key=lambda result: result[1], reverse=True)
		keep = len(scored) // 2
		print(">>SEARCHARTICLES: '%s' has %s articles (%s returned)." % (term, len(scored), keep))

		# Drop the bottom 50% and strip the TFIDF values from what remains.
		subset = [article for article, _ in scored[:keep]]
		self.sentimentscore = sum(self.articleIndex[article][1] for article in subset)

		totalTime = round((time.time() - starttime), 3) # logging purposes

		print(">>SEARCHARTICLES: Search has completed in %s seconds." % totalTime)
		searchLog(term, len(subset), totalTime) # logging purposes
		sentimentArticleLog(term, self.sentimentscore) # logging purposes
		return subset