def compareLists(query, relevant_positional_index, possible_document_matches, db):
	"""Group candidate documents by the longest run of query words they contain.

	For every candidate document, each occurrence of each query word is handed
	to ``detectSample`` (defined elsewhere in this module), whose return value
	is treated as the length of the run of consecutive query words starting at
	that occurrence.  The largest such value per document is kept.

	Args:
		query: Sequence of query words, in order.
		relevant_positional_index: Index accessed as
			``index[word]['document_dict'][document]`` and iterated as a
			collection of positions.  A word or document missing from the
			index raises ``KeyError``.
		possible_document_matches: Iterable of candidate document identifiers.
		db: Handle forwarded to ``dBDelegate.getSongTitle`` when a document
			contains the whole query.

	Returns:
		A dict mapping a score (half of the longest run length, a float) to
		the list of documents that reached that score, in iteration order.
		Only documents whose longest run exceeds a quarter of the query length
		are included.

	Side effects:
		Rebinds the module-level ``positional_index`` (presumably read by
		``detectSample`` -- confirm against its definition) and prints the
		title of every document that contains the complete query.
	"""
	global positional_index
	positional_index = relevant_positional_index

	query_length = len(query)

	# score -> documents that contain (part of) the sample passed by the user
	sampled_songs = dict()

	for document in possible_document_matches:

		# Every candidate starts with a run of one word.
		max_substring_length = 1
		for index, word in enumerate(query):

			# A run starting at this word can be at most as long as the rest
			# of the query, so stop once the best run can no longer be beaten.
			if max_substring_length >= query_length - index:
				break

			for position in positional_index[word]['document_dict'][document]:
				# how many query words follow one another from this position
				substring_length_from_n = detectSample(position, query[index+1:], document, 1)

				if substring_length_from_n > max_substring_length:
					max_substring_length = substring_length_from_n
					if max_substring_length == query_length:
						# Single-argument parenthesised print produces the same
						# output on Python 2 and is valid syntax on Python 3.
						print(dBDelegate.getSongTitle(db, document))

		# Keep the document only when more than a quarter of the query matched
		# in sequence; it is filed under half of its longest run length.
		if max_substring_length > query_length * .25:
			sampled_songs.setdefault(max_substring_length * .5, []).append(document)

	return sampled_songs
# Example no. 2 (separator left over from the code-sample listing, followed by a stray "0")
def sortTfidfValues(tfidf_values, top_n=20):
	"""Print the best-scoring songs from a mapping of TF-IDF scores.

	Entries are ordered by descending score; ties are broken by ascending key.
	A banner and a header line are printed first, followed by one
	"<title> <score>" line per song.

	Args:
		tfidf_values: Mapping of song identifier to TF-IDF score.
		top_n: Maximum number of entries to print; must be non-negative.
			Defaults to 20, which reproduces the original output.

	Returns:
		None; the result is written to standard output only.

	NOTE(review): ``db`` is not a parameter here (unlike ``compareLists``); it
	is looked up in module scope, which is outside this view -- confirm it is
	defined before this function is called.
	"""
	print("****************************************************************")
	print("Top %s TFIDF scores" % (top_n,))
	sorted_tfidf = sorted(tfidf_values.items(), key=lambda x: (-x[1], x[0]))

	top_values = itertools.islice(sorted_tfidf, 0, top_n)
	for song, tfidf in top_values:
		# Formatting into one string keeps the "title score" output identical
		# on Python 2 while remaining valid syntax on Python 3.
		print("%s %s" % (dBDelegate.getSongTitle(db, song), tfidf))