def search(query, scorer, index):
    """
    Retrieve documents matching a query using the specified scorer.
        1) Tokenize the query.
        2) Convert the query tokens to a vector, using Index.query_to_vector.
        3) Call the scorer's score function.
        4) Return the list of document ids in descending order of relevance.

    NB: Due to the inconsistency of floating point arithmetic, when sorting,
    round the scores to 6 decimal places (e.g., round(x, 6)). This will
    ensure replicable results.

    Params:
      query....A string representing a search query.
      scorer...A ScoringFunction to retrieve documents.
      index....An Index storing postings lists.

    Returns:
      A list of document ids in descending order of relevance to the query.
    """
    query_tokened = index.tokenize(query)
    query_vector = index.query_to_vector(query_tokened)
    score = scorer.score(query_vector, index)
    # Round to 6 decimal places, as the docstring requires (not 5).
    sortedlist = sorted(score.items(), key=lambda x: round(x[1], 6), reverse=True)
    # Return the document ids unchanged; the score dict is already keyed by doc id.
    return [i[0] for i in sortedlist]
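# For reference, a minimal self-contained sketch of the pipeline that `search`
# wraps (tokenize -> query_to_vector -> score -> sort). ToyIndex and ToyScorer
# are hypothetical stand-ins for the assignment's Index and ScoringFunction
# classes, whose real constructors are not shown in these snippets.
from collections import Counter


class ToyIndex:
    def __init__(self, docs):
        self.docs = [doc.lower().split() for doc in docs]

    def tokenize(self, text):
        return text.lower().split()

    def query_to_vector(self, tokens):
        # Raw term frequencies stand in for the real tf-idf weighting.
        return Counter(tokens)


class ToyScorer:
    def score(self, query_vector, index):
        # Term-overlap score: weighted count of query-term hits per document.
        return {doc_id: float(sum(doc.count(term) * weight
                                  for term, weight in query_vector.items()))
                for doc_id, doc in enumerate(index.docs)}


print(search('quick fox', ToyScorer(),
             ToyIndex(['the quick brown fox', 'a lazy dog', 'fox and dog'])))
# -> [0, 2, 1]: doc 0 matches both query terms, doc 2 matches one, doc 1 none.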
def search(query, scorer, index):
    """
    Retrieve documents matching a query using the specified scorer.
        1) Tokenize the query.
        2) Convert the query tokens to a vector, using Index.query_to_vector.
        3) Call the scorer's score function.
        4) Return the list of document ids in descending order of relevance.

    NB: Due to the inconsistency of floating point arithmetic, when sorting,
    round the scores to 6 decimal places (e.g., round(x, 6)). This will
    ensure replicable results.

    Params:
      query....A string representing a search query.
      scorer...A ScoringFunction to retrieve documents.
      index....An Index storing postings lists.

    Returns:
      A list of document ids in descending order of relevance to the query.
    """
    tokenized = index.tokenize(query)
    vector = index.query_to_vector(tokenized)
    temp_result = scorer.score(vector, index)
    # Sort by score, rounded to 6 decimal places, in descending order.
    temp_result = sorted(temp_result.items(), key=lambda item: round(item[1], 6),
                         reverse=True)
    # Keep only the document ids, discarding the scores.
    return [doc_id for doc_id, _ in temp_result]
import gzip
from glob import glob

import bs4
from spacy.lang.en import English  # assumed import: English() builds the spaCy tokenizer


def run():
    # TinyIndexer, clean, tokenize, and the INDEX_PATH / NUM_PAGES / PAGE_SIZE /
    # CRAWL_GLOB constants are assumed to be defined elsewhere in this module.
    indexer = TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE)
    indexer.create_if_not_exists()
    nlp = English()
    for path in glob(CRAWL_GLOB):
        print("Path", path)
        with gzip.open(path, 'rt') as html_file:
            # Each crawl file stores the URL on its first line, then the raw HTML.
            url = html_file.readline().strip()
            content = html_file.read()
        if indexer.document_indexed(url):
            print("Page exists, skipping", url)
            continue
        cleaned_text = clean(content)
        try:
            title = bs4.BeautifulSoup(content, features="lxml").find('title').string
        except AttributeError:
            # No <title> tag; fall back to the first 80 characters of the text.
            title = cleaned_text[:80]
        tokens = tokenize(nlp, cleaned_text)
        print("URL", url)
        print("Tokens", tokens)
        print("Title", title)
        indexer.index(tokens, url, title)
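# The loop above reads each crawl file as gzip-compressed text whose first line
# is the page URL and whose remainder is the raw HTML. A minimal sketch of
# producing a file in that layout; the file name and page content here are made
# up for illustration.
import gzip

with gzip.open('example-crawl-000.gz', 'wt') as f:
    f.write('https://example.com/page\n')           # first line: the URL
    f.write('<html><title>Example</title></html>')  # remainder: raw HTML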
def search(query, scorer, index):
    """
    Retrieve documents matching a query using the specified scorer.
        1) Tokenize the query.
        2) Convert the query tokens to a vector, using Index.query_to_vector.
        3) Call the scorer's score function.
        4) Return the list of document ids in descending order of relevance.

    NB: Due to the inconsistency of floating point arithmetic, when sorting,
    round the scores to 6 decimal places (e.g., round(x, 6)). This will
    ensure replicable results.

    Params:
      query....A string representing a search query.
      scorer...A ScoringFunction to retrieve documents.
      index....An Index storing postings lists.

    Returns:
      A list of document ids in descending order of relevance to the query.
    """
    tokenized_query = index.tokenize(query)
    q_vector = index.query_to_vector(tokenized_query)
    doc_score = scorer.score(q_vector, index)
    # Round to 6 decimal places before sorting, as the docstring requires,
    # so float noise cannot perturb the ranking.
    return sorted(doc_score, key=lambda k: round(doc_score[k], 6), reverse=True)
from collections import defaultdict


def search(query, scorer, index):
    """
    Retrieve documents matching a query using the specified scorer.
        1) Tokenize the query.
        2) Convert the query tokens to a vector, using Index.query_to_vector.
        3) Call the scorer's score function.
        4) Return the list of document ids in descending order of relevance.

    NB: Due to the inconsistency of floating point arithmetic, when sorting,
    round the scores to 6 decimal places (e.g., round(x, 6)). This will
    ensure replicable results.

    Params:
      query....A string representing a search query.
      scorer...A ScoringFunction to retrieve documents.
      index....An Index storing postings lists.

    Returns:
      A list of document ids in descending order of relevance to the query.
    """
    qry_terms = index.tokenize(query)
    qry_vector = index.query_to_vector(qry_terms)
    document_rank = scorer.score(qry_vector, index)
    # Copy the scores, rounded to 6 decimal places, for a replicable ordering.
    doc_list = defaultdict(lambda: 0)
    for key in document_rank:
        doc_list[key] = round(document_rank[key], 6)
    sorted_list = sorted(doc_list.items(), key=lambda x: x[1], reverse=True)
    return [doc_id for doc_id, _ in sorted_list]
def search(query, scorer, index):
    """
    Retrieve documents matching a query using the specified scorer.
        1) Tokenize the query.
        2) Convert the query tokens to a vector, using Index.query_to_vector.
        3) Call the scorer's score function.
        4) Return the list of document ids in descending order of relevance.

    NB: Due to the inconsistency of floating point arithmetic, when sorting,
    round the scores to 6 decimal places (e.g., round(x, 6)). This will
    ensure replicable results.

    Params:
      query....A string representing a search query.
      scorer...A ScoringFunction to retrieve documents.
      index....An Index storing postings lists.

    Returns:
      A list of document ids in descending order of relevance to the query.
    """
    tokens = index.tokenize(query)
    res_vector = index.query_to_vector(tokens)
    scores = scorer.score(res_vector, index)
    # Sort doc ids by rounded score, descending, per the docstring's NB.
    return sorted(scores, key=lambda k: round(scores[k], 6), reverse=True)
def runPNAClassifier(self, outerLimit, innerLimit, query):
    tagDistances = {}
    topTags = []
    # Calculate the tf-idf vector of the query based on the idf vector in the
    # database. NB: `index` is assumed to be defined at module level; it is
    # not a parameter of this method.
    queryDict = {}
    queryDict['tokens'] = self.tfidf.get_doc_keywords_dict(index.tokenize(query))
    for centroid in self.centroids:
        # Calculate the distance of the query to each centroid and save it
        # into tagDistances, keyed by tag.
        tagDistances[centroid['tag']] = {
            'tag': centroid['tag'],
            'distance': calculateDistance(centroid, queryDict),
        }
    topTags = sorted(tagDistances, key=lambda x: tagDistances[x]['distance'])[:outerLimit]
    print("initial:" + str(topTags))
    # Loop through the top tags (outerLimit is the cutoff for this) and get the
    # KNN neighbors of all the questions inside the outerLimit; this refinement
    # step is currently disabled.
    '''
    loadedQuestions = []
    questionDistances = {}
    for tag in topTags:
        for question in self.corpus:
            if tag in question['tags']:
                questionDict = {}
                questionDict['tokens'] = self.tfidf.get_doc_keywords_dict(question['body'])
                questionDict['tags'] = question['tags']
                loadedQuestions.append(questionDict)
    for question in loadedQuestions:
        distance = calculateDistance(question, queryDict)
        for tag in question['tags']:
            if tag in topTags:
                if tag in questionDistances:
                    questionDistances[tag]['distance'] += distance
                    questionDistances[tag]['count'] += 1
                else:
                    questionDistances[tag] = {}
                    questionDistances[tag]['distance'] = distance
                    questionDistances[tag]['tag'] = tag
                    questionDistances[tag]['count'] = 1
    for tag in questionDistances:
        questionDistances[tag]['distance'] = questionDistances[tag]['distance'] / float(questionDistances[tag]['count'])
    topTags = sorted(questionDistances, key=lambda x: questionDistances[x]['distance'])[:innerLimit]
    print(topTags)
    '''
    return {'tags': [str(tag) for tag in topTags]}
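# `calculateDistance` is used above but not shown. A plausible minimal sketch,
# assuming it computes Euclidean distance between the sparse tf-idf vectors
# stored under each dict's 'tokens' key; this is a hypothetical reconstruction,
# not the project's actual implementation.
import math


def calculateDistance(a, b):
    # Treat terms missing from either vector as having weight 0.
    terms = set(a['tokens']) | set(b['tokens'])
    return math.sqrt(sum((a['tokens'].get(t, 0.0) - b['tokens'].get(t, 0.0)) ** 2
                         for t in terms))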
from collections import defaultdict


def search(query, scorer, index):
    """
    Retrieve documents matching a query using the specified scorer.
        1) Tokenize the query.
        2) Convert the query tokens to a vector, using Index.query_to_vector.
        3) Call the scorer's score function.
        4) Return the list of document ids in descending order of relevance.

    NB: Due to the inconsistency of floating point arithmetic, when sorting,
    round the scores to 6 decimal places (e.g., round(x, 6)). This will
    ensure replicable results.

    Params:
      query....A string representing a search query.
      scorer...A ScoringFunction to retrieve documents.
      index....An Index storing postings lists.

    Returns:
      A list of document ids in descending order of relevance to the query.
    """
    # 1. Tokenize.
    query_tokens = index.tokenize(query)
    # 2. Query tokens --> query vector.
    query_vector = index.query_to_vector(query_tokens)
    # 3. Score every candidate document.
    scores_dict = scorer.score(query_vector, index)
    # 4. Round to 6 decimal places, sort descending, and return the ids.
    temp_dict = defaultdict(lambda: 0.0)
    for key, value in scores_dict.items():
        temp_dict[key] = round(value, 6)
    ranked = sorted(temp_dict.items(), key=lambda kv: kv[1], reverse=True)
    return [int(doc_id) for doc_id, _ in ranked]
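# Every `search` variant above rounds scores to 6 decimal places before
# sorting, per the docstring's NB. A small demonstration of why this makes
# results replicable: rounding collapses float-noise differences into exact
# ties, and Python's stable sort then orders tied documents by insertion order.
scores = {1: 0.3, 2: 0.1 + 0.2}  # 0.1 + 0.2 == 0.30000000000000004, not 0.3
print(sorted(scores, key=lambda k: scores[k], reverse=True))
# -> [2, 1]: float noise alone decides the order.
print(sorted(scores, key=lambda k: round(scores[k], 6), reverse=True))
# -> [1, 2]: the rounded scores tie, so the stable sort keeps insertion order.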