def _cleanTokens(tokens):
    """Return the stop-word-filtered, stemmed, lower-cased form of *tokens*.

    Tokens found in STOPS are dropped.  Every remaining token is decoded
    from UTF-8, stemmed with a Porter stemmer, lower-cased and re-encoded
    to UTF-8.  A new list is returned; *tokens* itself is not modified.
    """
    # NOTE(review): the stop-word test runs on the token as typed, before
    # lower-casing, exactly as the original code did -- confirm whether
    # STOPS is expected to match capitalised words as well.
    stemmer = PorterStemmer()  # one stemmer reused for every token
    cleaned = []
    for tok in tokens:
        if tok in STOPS:
            continue
        stemmed = stemmer.stem(tok.decode('utf-8'))
        cleaned.append(stemmed.lower().encode('utf-8'))
    return cleaned


def _printResult(result):
    """Print rank, title, url and snippet of *result*, then a blank line."""
    # Single-argument print(...) produces the same output as the print
    # statement under Python 2 and is also valid Python 3 syntax.
    print(result.rank)
    print(result.title)
    print(result.url)
    print(result.snippet)
    print("")


def _askRelevantRanks(limit=5):
    """Ask the user for up to *limit* distinct result numbers.

    Prompting stops when *limit* distinct numbers have been collected or
    when a negative number is entered.  Duplicates and non-numeric input
    are reported and do not count against the limit.  Returns the list of
    accepted numbers in the order they were entered.
    """
    print("Choose up to %d results that were relevant to your search." % limit)
    print("Enter a negative number to quit.")
    chosen = []
    prompt = "Enter a result number: "
    while len(chosen) < limit:
        # raw_input + int() instead of input(): under Python 2, input()
        # evaluates whatever the user types as a Python expression.
        answer = raw_input(prompt)
        prompt = "Enter a result number ( negative to quit ): "
        try:
            relNum = int(answer)
        except ValueError:
            print("Error: Please enter a whole number")
            continue
        if relNum < 0:
            break
        if relNum in chosen:
            print("Error: You already entered that result")
        else:
            chosen.append(relNum)
    return chosen


def preprocess(result):
    """Tokenize and clean the title and snippet of a search result.

    Punctuation is stripped from ``result.title`` and ``result.snippet``
    with removePunct, the two texts are joined with a space and tokenized
    with ``nltk.word_tokenize``, and the tokens are cleaned by
    ``_cleanTokens``.  The cleaned list is stored in ``result.tokens``.

    Returns the same *result* object.
    """
    words = removePunct(result.title)
    words += " "
    words += removePunct(result.snippet)
    # Store the cleaned list itself: rebinding a loop variable while
    # iterating would leave result.tokens holding the raw tokens.
    result.tokens = _cleanTokens(nltk.word_tokenize(words))
    return result


def searchRank(query):
    """Interactively re-rank search results for *query* by relevance feedback.

    Steps:
      1. Fetch results with two googleSearch calls and print each one.
      2. Ask the user which results (by number) were relevant.
      3. Write title and snippet of the chosen results to ``<query>.txt``
         and their cleaned tokens to ``<query>-clean.txt``.
      4. Score every result against the cleaned relevance tokens with
         calc_cos and jaccard, storing ``r.cosine`` and ``r.jaccard``.
      5. Ask for a sort preference, sort accordingly and print the results.
         An unrecognised choice leaves the results in their fetched order.

    Returns None; all output goes to stdout and the two files above.
    """
    # googleSearch is given resList and its return value is unused, so it is
    # expected to append result objects in place.  The last argument is
    # presumably the 1-based start offset of each page -- TODO confirm.
    resList = []
    googleSearch(query, resList, 1)
    googleSearch(query, resList, 11)

    for r in resList:
        r = preprocess(r)  # fills r.tokens with cleaned tokens
        r.vector = Counter(r.tokens)
        _printResult(r)

    # ask user which results are relevant
    relList = _askRelevantRanks()

    # write relevant data to file
    rawPath = query + '.txt'
    with open(rawPath, 'wb') as outfile:
        for chosenRank in relList:
            for r in resList:
                if chosenRank == r.rank:
                    outfile.write(r.title + ' ')
                    outfile.write(r.snippet + ' ')

    # -------------------- pre-process our relevance test set --------------------
    with open(rawPath, 'rb') as readfile:
        relWords = readfile.read()
    relTokens = _cleanTokens(nltk.word_tokenize(removePunct(relWords)))

    with open(query + '-clean.txt', 'w') as outfile:
        outfile.write(''.join(tok + ' ' for tok in relTokens))

    # -------------------- calculate, sort, and display --------------------
    # The relevance vector is built from the same cleaned tokens that
    # preprocess() stores on each result, so both sides are comparable.
    relevanceVector = Counter(relTokens)
    relevanceSet = set(relTokens)
    print("Calculating relevancy of your search results......")

    # calculate similarity
    for r in resList:
        r.cosine = calc_cos(r.vector, relevanceVector)
        r.jaccard = jaccard(set(r.tokens), relevanceSet)

    print("Select sorting preference:")
    print("[1] Jaccard Coefficient")
    print("[2] Cosine Similarity")
    print("")
    sortChoice = raw_input("Enter choice here: ").lower()
    if sortChoice in ['1', 'j', 'jaccard', 'jaccard coefficient']:
        resList.sort(key=lambda x: x.jaccard, reverse=True)
        print("Showing results based on jaccard coeffecient: ")
    elif sortChoice in ['2', 'c', 'cosine', 'cosine similarity']:
        resList.sort(key=lambda x: x.cosine, reverse=True)
        print("Showing results based on cosine similarity: ")

    for r in resList:
        print("")
        _printResult(r)