# Boolean query engine: evaluates a query given in reverse Polish notation.
# Dictionary, Postings, Operator and NOTOperator are assumed to come from
# companion modules of this project (not shown here).


class Engine(object):

    def __init__(self, fd, fp):
        self.dictionary = Dictionary(fd, load=True)
        self.postings = Postings(fp, mode='r')

    def _get_postings(self, termInfo):
        # the last element of the term info tuple is the postings offset
        if termInfo[-1] is not None:
            return self.postings.list_at_offset(termInfo[-1])
        return None

    def execute_query(self, reverse_polish):
        args = []
        while reverse_polish:
            token = reverse_polish.popleft()
            if not isinstance(token, Operator):
                # operand: look the term up and push its postings list
                dterm = self.dictionary.term(token)
                postings_list = self._get_postings(dterm)
                args.append(postings_list)
            else:
                # NOT takes the universe postings list as an extra operand
                if isinstance(token, NOTOperator):
                    args.append(self.postings.not_list())
                # print '\nExecuting ', token, ' for args: ', str(args), '\n'
                # treat empty postings lists as missing
                for i in range(len(args)):
                    if args[i] is not None and args[i]._entries_len == 0:
                        args[i] = None
                # pop the operator's operands and push its result
                splitpoint = -1 * token.nargs
                o_args = args[splitpoint:]
                args = args[:splitpoint] + [token.execute(o_args)]
        return args[-1]
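
# A self-contained sketch of the evaluation loop above, using plain Python
# sets in place of postings lists (AND as intersection, OR as union); the
# operand/operator split mirrors execute_query. Every name below is
# illustrative only, not part of the project's API.
from collections import deque


def eval_rpn(tokens, index):
    args = []
    for token in tokens:
        if token in ('AND', 'OR'):
            # operator: pop its two operands and push the result
            b, a = args.pop(), args.pop()
            args.append(a & b if token == 'AND' else a | b)
        else:
            # operand: fetch the term's "postings"
            args.append(index.get(token, set()))
    return args[-1]


index = {'bitcoin': set([1, 2, 5]), 'wallet': set([2, 5, 9])}
print eval_rpn(deque(['bitcoin', 'wallet', 'AND']), index)  # set([2, 5])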

import heapq
import math


class Engine(object):

    NUM_RESULTS = 10

    def __init__(self, fd, fp):
        self.dictionary = Dictionary(fd, load=True)
        self.postings = Postings(fp, mode='r')

    def _get_postings(self, offset):
        return self.postings.list_at_offset(offset)

    def _accumulate_scores(self, scores, postings_list, q_wt):
        for doc_id, d_tf in postings_list:
            scores[doc_id] = scores.get(doc_id, 0) + q_wt * d_tf

    def _normalize(self, scores, q_len):
        for doc_id in scores:
            scores[doc_id] /= (q_len * self.dictionary.doc_length(doc_id))

    def _get_top_n_docs(self, scores, n):
        # negate scores so the min-heap yields the highest-scoring docs first
        scores_heap = [(-v, k) for k, v in scores.items()]
        heapq.heapify(scores_heap)
        return [heapq.heappop(scores_heap)[1]
                for i in xrange(n) if len(scores_heap) > 0]

    def execute_query(self, query_map):
        scores = {}
        for term in query_map:
            q_idf, term_offset = self.dictionary.term(term)
            # unknown term: skip it, it contributes nothing to the score
            if term_offset is None:
                continue
            # accumulate scores for the term's postings list
            query_map[term] = q_wt = tf(query_map[term]) * q_idf
            postings_list = self._get_postings(term_offset)
            self._accumulate_scores(scores, postings_list, q_wt)
        # perform length normalisation (query and document)
        q_len = math.sqrt(sum(x * x for x in query_map.values()))
        self._normalize(scores, q_len)
        # return the ids of the top n documents
        top_n_docs = self._get_top_n_docs(scores, Engine.NUM_RESULTS)
        return " ".join(str(x) for x in top_n_docs)
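
# The tf() helper used in execute_query above is defined elsewhere in the
# project. A minimal sketch, assuming the standard log-frequency weighting
# of the lnc.ltc scheme; the exact formula is an assumption, not confirmed
# by this file.
import math


def tf(freq):
    # hypothetical helper: 1 + log10 of the raw count, 0 for an empty term
    return 1 + math.log10(freq) if freq > 0 else 0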
class Engine(object): """ Search engine that uses a simple vector space model to retrieve patents """ NUM_RESULTS = 500 def __init__(self, fd, fp): self.dictionary = Dictionary(fd, load=True) self.postings = Postings(fp, mode='r') def _get_postings(self, offset): """ This method gets the postings list at an offset """ return self.postings.list_at_offset(offset) def _accumulate_scores(self, scores, postings_list, q_wt): """ This method accumulates scores for a term """ for doc_id, d_tf in postings_list: scores[doc_id] = scores.get(doc_id, 0) + q_wt * d_tf def _normalize(self, scores, q_len): """ This method normalises scores for every document """ for doc_id in scores: scores[doc_id] /= (q_len * self.dictionary.doc_length(doc_id)) def _get_top_n_docs(self, scores, n): """ This method creates a heap of the docs and pick out the top few """ scores_heap = [(-v, k) for k, v in scores.items()] heapq.heapify(scores_heap) return [heapq.heappop(scores_heap)[1] for i in xrange(n) if len(scores_heap) > 0] def execute_query(self, query_map): """ This method is called to execute a query """ scores = {} for term in query_map: q_idf, term_offset = self.dictionary.term(term) # unknown term, skip everything, score 0 if term_offset is None: continue # accumulate scores for postings list query_map[term] = q_wt = tf(query_map[term]) * q_idf postings_list = self._get_postings(term_offset) self._accumulate_scores(scores, postings_list, q_wt) # perform length normalization (query and document) q_len = math.sqrt(sum(x * x for x in query_map.values())) self._normalize(scores, q_len) # find top n # top_n_docs = self._get_top_n_docs(scores, Engine.NUM_RESULTS) # return " ".join(str(x) for x in top_n_docs) return " ".join(str(x) for x in scores.keys())

import copy
import heapq
import math

# tuning constants for pseudo relevance feedback; the original declared
# these with `global` statements inside the class body, which has the same
# effect but is clearer written at module level
NUM_RESULTS = 10
QUERY_WEIGHT = 0.5
P_FEEDBACK_WEIGHT = 0.5


class feedbackEngine(object):
    """ Search engine that uses relevance feedback with a vector space
    model to retrieve patents """

    def __init__(self, fd, fp):
        self.dictionary = Dictionary(fd, load=True)
        self.postings = Postings(fp, mode='r')
        self.feedback = False

    def _get_postings(self, offset):
        """ Return the postings list stored at the given offset """
        return self.postings.list_at_offset(offset)

    def _accumulate_scores(self, scores, postings_list, q_wt):
        """ Accumulate scores for one query term """
        for doc_id, d_tf in postings_list:
            scores[doc_id] = scores.get(doc_id, 0) + q_wt * d_tf

    def _normalize(self, scores, q_len):
        """ Normalise the score of every document by query and
        document length """
        for doc_id in scores:
            scores[doc_id] /= (q_len * self.dictionary.doc_length(doc_id))

    def _get_top_n_docs(self, scores, n):
        """ Build a heap of the scores and pick out the top n documents """
        scores_heap = [(-v, k) for k, v in scores.items()]
        heapq.heapify(scores_heap)
        return [heapq.heappop(scores_heap)[1]
                for i in xrange(n) if len(scores_heap) > 0]

    def relevance_feedback(self, query_map, top_n_docs):
        """ Expand the query via pseudo relevance feedback (Rocchio) """
        self.feedback = True
        vector_sum = {}
        term_dict = self.dictionary._terms
        # construct the summed document vector for the top n docs
        for term in term_dict:
            term_offset = term_dict[term][1]
            # unknown term: skip it, it contributes nothing
            if term_offset is None or term is None:
                continue
            # add up the term frequencies of the documents in top_n_docs
            # (accumulate as float so the averaging below does not truncate)
            postings_list = self._get_postings(term_offset)
            for doc_id, d_tf in postings_list:
                if doc_id in top_n_docs:
                    vector_sum[term] = vector_sum.get(term, 0.0) + d_tf
        # average the summed vector to get the centroid of the top docs,
        # then apply the feedback weight once (the original also multiplied
        # each posting by P_FEEDBACK_WEIGHT above, double-counting it)
        for term in vector_sum:
            vector_sum[term] /= NUM_RESULTS
            vector_sum[term] *= P_FEEDBACK_WEIGHT
        # add the weighted initial query vector on top of the centroid
        for term in query_map:
            vector_sum[term] = (vector_sum.get(term, 0.0) +
                                query_map[term] * QUERY_WEIGHT)
        # execute the query with the expanded query vector
        return self.execute_query(vector_sum)

    def execute_query(self, query_map):
        """ Execute a query; on the first pass, rerun it after pseudo
        relevance feedback """
        scores = {}
        query_map_copy = copy.deepcopy(query_map)
        for term in query_map:
            q_idf, term_offset = self.dictionary.term(term)
            # unknown term: skip it, it scores 0
            if term_offset is None:
                continue
            # accumulate scores for the term's postings list
            query_map[term] = q_wt = tf(query_map[term]) * q_idf
            postings_list = self._get_postings(term_offset)
            self._accumulate_scores(scores, postings_list, q_wt)
        # perform length normalisation (query and document)
        q_len = math.sqrt(sum(x * x for x in query_map.values()))
        self._normalize(scores, q_len)
        # first pass: expand the query and run it again; NUM_RESULTS is used
        # here (the original asked for Engine.NUM_RESULTS docs) so that the
        # feedback set matches the centroid averaging above
        if not self.feedback:
            top_n_docs = self._get_top_n_docs(scores, NUM_RESULTS)
            stringout = self.relevance_feedback(query_map_copy, top_n_docs)
        # second pass, called from within relevance_feedback:
        # return all the scored documents
        else:
            stringout = " ".join(str(x) for x in scores.keys())
        return stringout
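
# relevance_feedback above is one round of pseudo relevance feedback, i.e. a
# Rocchio update: q_new = QUERY_WEIGHT * q + P_FEEDBACK_WEIGHT * centroid of
# the top n document vectors. A self-contained sketch of that update on
# plain dicts; names and numbers are illustrative only.
def rocchio(query_vec, top_doc_vecs, alpha=0.5, beta=0.5):
    centroid = {}
    for doc in top_doc_vecs:
        # average the top document vectors term by term
        for term, wt in doc.items():
            centroid[term] = (centroid.get(term, 0.0) +
                              wt / float(len(top_doc_vecs)))
    new_q = dict((t, beta * w) for t, w in centroid.items())
    for term, wt in query_vec.items():
        # add the weighted original query on top of the weighted centroid
        new_q[term] = new_q.get(term, 0.0) + alpha * wt
    return new_q


print rocchio({'solar': 1.0}, [{'solar': 2.0, 'panel': 1.0}])
# -> {'panel': 0.5, 'solar': 1.5} (key order may vary)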