Пример #1
0
	def __init__ (self, accountKey, precision, query):
		"""
		Store the search parameters and set up empty per-search state

		accountKey -- kept unchanged on the instance
		precision  -- kept unchanged on the instance
		query      -- kept unchanged; a copy with its whitespace-separated terms
		              joined by "+" is kept as internalQuery
		Also reads the whitespace-separated stop words from the file at stopWordsPath.
		"""
		# parameters exactly as supplied by the caller
		self.accountKey = accountKey
		self.precision = precision
		self.query = query

		# query used for URL of Bing search: terms joined by "+"
		queryTerms = query.split()
		self.internalQuery = "+".join(queryTerms)

		# searcher for Bing
		self.searcher = Web_search()

		# state that starts out empty
		self.results = [] # search results (top K), initialized to empty
		self.user_feedback = [] # user responds "Y"/"N"
		self.wordIndex = defaultdict(float) # index for our ranking algorithm
		self.firstIteration = True

		# Load set of stop words
		with open(stopWordsPath, 'r') as stopWordsFile:
			stopWordsText = stopWordsFile.read()
		self.stopWords = frozenset(stopWordsText.split())
Пример #2
0
class User_Interface(object):

	def __init__ (self, accountKey, precision, query):
		"""
		Remember the search parameters and start with empty results, feedback and index

		accountKey -- stored as given
		precision  -- stored as given
		query      -- stored as given; internalQuery holds the same terms joined by "+"
		The stop words are read (whitespace-separated) from the file at stopWordsPath.
		"""
		# values handed in by the caller
		self.accountKey = accountKey
		self.precision = precision
		self.query = query

		# query used for URL of Bing search
		terms = query.split()
		self.internalQuery = "+".join(terms)

		# searcher for Bing
		self.searcher = Web_search()

		# nothing searched, judged or indexed yet
		self.results = [] # search results (top K), initialized to empty
		self.user_feedback = [] # user responds "Y"/"N"
		self.wordIndex = defaultdict(float) # index for our ranking algorithm
		self.firstIteration = True

		# Load set of stop words
		with open(stopWordsPath, 'r') as stopWordsFile:
			rawStopWords = stopWordsFile.read()
		self.stopWords = frozenset(rawStopWords.split())

	def print_search_parameter(self):
		"""
		Print parameters for search
		"""
		print "Parameters:"
		print "Client key  = "+self.accountKey
		print "Query       = "+self.query
		print "Precision   = "+str(self.precision)

	def display_search(self):
		"""
		Search Bing by the query and display search results
		"""
		# call functions in web_query.py for Bing search and XML parse
		xml_content = self.searcher.search_Bing(self.accountKey, topK, self.internalQuery)
		self.results = self.searcher.parse_XML(xml_content)

		# print URL for Bing Search
		print "URL: "+self.searcher.bingUrl
		print "Total no of results : "+str(self.searcher.results_len)
		print "Bing Search Results:"
		print "======================"

		# print each result
		self.user_feedback = []
		index = 0
		for entry in self.results:
			index = index+1

			title = entry[0]
			summary = entry[1]
			url = entry[2]
			print "Result "+str(index)+"\n[\n URL: "+url+"\n Title: "+title+"\n Summary: "+summary+"\n]\n"
			response = raw_input("Relevant (Y/N)?")
			self.user_feedback.append(response.lower())

		print "======================"

	def feedback_summary(self):
		"""
		Compute the precision by retrieved results
		Return True if more search is needed
		otherwise return False (if number of relevant results is 0, or desired precision is reached)
		"""
		print "FEEDBACK SUMMARY"
		print "Query "+self.query

		# get the number of correct results
		correct_num = 0
		for response in self.user_feedback:
			if response == 'y':
				correct_num = correct_num+1
		# get the number of total results
		total_num = len(self.results)

		# check the denominator
		if (total_num <= 0):
			print "No search results returned for the query"
			return False

		# if number of results <10, just terminate
		if (total_num < topK):
			print "Fewer than "+str(topK)+" results returned for the query"
			return False

		# precision of retrieved results
		pre = 1.0*correct_num/total_num
		print "Precision "+str(pre)

		# check if reaching the desired precision
		if self.precision <= pre:
			print "Desired precision reached, done"
			return False

		# still below the desired precision
		print "Still below the desired precision of "+str(self.precision)

		# if precision is 0, stop
		if (pre == 0.0):
			# To keep the output consistent with reference implementation
			print "Indexing results ...."
			print "Indexing results ...."
			print "Augmenting by "
			print "Below desired precision, but can no longer augment the query"
			return False
		return True

	def applyRanking(self, position, word, isTitleWord, isRelevant):
		"""
		Applies our ranking algorithm, based off Rocchio, depending on various factors,
		such as if it is a Title word, capitalized in the Summary etc.
		"""

		positionScore = positionScale[position]
		# Don't position scale the first results because we have no idea about relevance
		if (self.firstIteration):
			positionScore = 1.0
			self.firstIteration = False

		# Since it is a defaultdict, the entry will be created if it doesn't exist
		# Please read our README for details on our algorithm. The specifics are too
		# long to mention here.
		score = self.wordIndex[word.lower()]
		if isTitleWord:
			if isRelevant:
				score = score + rTitleScale * positionScore
			else:
				score = score - nrTitleScale * positionScore
		else:
			if word[0].isupper():
				if isRelevant:
					score = score + rCapSummaryScale * positionScore
				else:
					score = score - nrCapSummaryScale * positionScore
			else:
				if isRelevant:
					score = score + rSummaryScale * positionScore
				else:
					score = score - nrSummaryScale * positionScore
		self.wordIndex[word.lower()] = score
		return

	def reorderQuery(self):
		"""
		Re-order the words in the expanded query
		"""

		# constraints for the order of words, e.g., bill should be on the left of gates
		coOccurDict = defaultdict(int) # dictionary for the count of co-occurance pair
		queryWords = self.query.split()
		queryWordsLower = self.query.lower().split()

		# try all the relevant documents
		for i in range(len(self.results)):
			result = self.results[i]
			title = result[0]
			summary = result[1]

			# do only for relevant docs
			if self.user_feedback[i] != 'y':
				continue

			# Must be relevant results now
			# Remove punctuation and create lists of words
			titleWords = "".join(c for c in title if not unicodedata.category(c).startswith('P')).split()
			summaryWords = "".join(c for c in summary if not unicodedata.category(c).startswith('P')).split()

			# add co-occurance pair in title
			for i in range(len(titleWords)-1):
				currentWord = titleWords[i]
				nextWord = titleWords[i+1]

				# update coOccurDict
				updateCoOccurDict(currentWord, nextWord, coOccurDict, queryWordsLower)

			# add co-occurance pair in summary
			for i in range(len(summaryWords)-1):
				currentWord = summaryWords[i]
				nextWord = summaryWords[i+1]

				# update coOccurDict
				updateCoOccurDict(currentWord, nextWord, coOccurDict, queryWordsLower)


		# sort by the count of co-occurance pairs
		sortedByLargest = sorted(coOccurDict.iteritems(), key=operator.itemgetter(1), reverse=True)

		# add those constraints to the results for re-ordering
		results = [] # valid results for re-ordering

		for (w1, w2), count in sortedByLargest:
			# check if two words are identical
			if w1 == w2:
				continue
			# check if w2 appears in the existing results
			if w2 in results:
				# check if w2 in the beginning of existing results
				if (w2 == results[0]) and (w1 not in results):
					# append w2 in the beginning
					results.insert(0, w1)
				continue
			# check if w1 appears in the existing results
			if w1 in results:
				# check if w1 in the end of existing results
				if (w1 == results[len(results)-1]) and (w2 not in results):
					# append w2 in the end
					results.append(w2)
				continue

			# both w1 and w2 are not in existing results
			results.append(w1)
			results.append(w2)

		# add those words not in results
		for word in queryWordsLower:
			if word not in results:
				results.append(word)

		# re-write query and internal query
		self.query = " ".join(results)
		self.internalQuery = "+".join(results)


	def augmentQuery(self):
		"""
		Adds up to two new words to the query. Returns True if it could else False.
		Also, changes values to alpha*values for next iteration
		"""
		queryWords = frozenset(self.query.lower().split())
		nWordsAdded = 0

		# Sort by score of the word in index
		sortedByLargest = sorted(self.wordIndex.iteritems(), key=operator.itemgetter(1), reverse=True)
		valueOfLargest = 0.0

		# keep track of the new words augmented
		newWords = ""

		# check all the words from highest score
		for k,v in sortedByLargest:
			# filter those already in the query
			if k in queryWords:
				continue

			# Want to only add one word if the first word is overwhelmingly more relevant
			# as we do not want to push the query down a wrong track. We add a small constant
			# to v as we do not want to divide by zero.
			if nWordsAdded == 1:
				if valueOfLargest/(v+0.001) > beta:
					break

			self.query = self.query + " " + k.lower()
			self.internalQuery = self.internalQuery + "+" + k.lower()
			valueOfLargest = v
			nWordsAdded+=1
			newWords = newWords+" "+k.lower()

			if nWordsAdded == 2:
				break

		# Change scores for next iteration
		for w in self.wordIndex.iterkeys():
			self.wordIndex[w] = self.wordIndex[w] * alpha

		print "Augmenting by " + newWords

		# If we did not get a new word, then we have to stop. Very unlikely.
		if nWordsAdded > 0:
			# re-order the words in query and return True
			self.reorderQuery()
			return True
		else:
			return False

	def ranking(self):
		"""
		For each result, removes stopwords, ranks the word, augments the query
		and returns True if successful else False
		"""
		print "Indexing results ...."

		for i in range(len(self.results)):
			result = self.results[i]
			title = result[0]
			summary = result[1]

			# Remove punctuation and create lists of words
			titleWords = "".join(c for c in title if not unicodedata.category(c).startswith('P')).split()
			summaryWords = "".join(c for c in summary if not unicodedata.category(c).startswith('P')).split()

			for tw in titleWords:
				if tw.lower() in self.stopWords:
					continue
				if self.user_feedback[i] == 'y':
					self.applyRanking(i, tw, True, True)
				else:
					self.applyRanking(i, tw, True, False)

			for sw in summaryWords:
				if sw.lower() in self.stopWords:
					continue
				if self.user_feedback[i] == 'y':
					self.applyRanking(i, sw, False, True)
				else:
					self.applyRanking(i, sw, False, False)

		print "Indexing results ...."

		return self.augmentQuery()

	def runIt(self):
		"""
		The main loop for user interface
		"""
		# repeat searching until desired precision reached, or no longer augment the query
		while (True):
			self.print_search_parameter()
			self.display_search()

			# check if 0 precision or reached desired precision
			ifContinue = self.feedback_summary()
			if (ifContinue == False):
				break

			# check if we can no longer augment the query
			ifContinue = self.ranking()
			if (ifContinue == False):
				print "Below desired precision, but can no longer augment the query"
				break