        logging.debug('Found 0 unique terms, skipping %s' % (
            parse_data['meta']['name']
        ))
        return

    # Add the language and file data to the unique terms list.
    # We don't need to check that these terms are unique because this
    # is the only way they can be contained in this context.
    unique_terms.extend([
        [parse_data['meta']['name'], context_id_map['file']],
        [parse_data['meta']['language'], context_id_map['language']]
    ])

    # Construct our document data structure with its document tree
    # and embed it properly.
    tree_id = TreeModel.insert(tree)
    doc = dict(
        tree_id=tree_id,
        signals=parse_data['tree'].signals,
        meta=parse_data['meta'],
    )
    doc_id = DocumentModel.insert(doc)

    # Record our TermID -> DocID mapping so we can atomically update
    # the database later with these pointers in the final phase.
    for term, context in unique_terms:
        if term not in self.terms:
            self.terms[term] = []
        self.terms[term].append([doc_id, context])
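
# Illustrative only: the rough shape of the term mapping built above.
# The doc_id values are hypothetical placeholders; the real IDs come from
# DocumentModel.insert(). After indexing a single Python file, self.terms
# would look roughly like:
#
#   self.terms == {
#       'foo.py': [[doc_id_1, context_id_map['file']]],
#       'python': [[doc_id_1, context_id_map['language']]],
#   }
#
# Each term maps to a list of [doc_id, context] pairs so one term can
# point at many documents, which is what the final atomic update phase
# described above needs to flush to the database.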
def search(self, query, page=1, results_per_page=10):
    """
    Returns a list of documents with their corresponding scores and
    summaries that are specific to the given query. The data is returned
    in the following format:

        (num_results, [(score, summary, document), ...])

    The input parameters are as follows:

        query
            The query string

        page
            The page number, must be a positive non-zero integer

        results_per_page
            The number of results per page, a positive non-zero integer
    """
    # Take the query string and translate it into the format we require
    # for fast processing, which is [(context_id, term_id), ...]
    tokens = query_translate(query)

    # Retrieve the relevant documents for our given query tokens and
    # their associated classifications.
    start_time = time.time()
    docs = self._get_documents(tokens)
    logging.debug('%.2f seconds to retrieve and sort %d relevant docs' % (
        time.time() - start_time, len(docs)
    ))

    # Remove duplicate documents from our results before we even bother
    # scoring them or doing anything else.
    docs = self._remove_duplicates(docs)

    # Now that we have the matching documents we can calculate the total
    # score matching our query and then sort them from highest to lowest.
    calculator = Calculator(tokens)
    results = sorted([(calculator(doc), doc) for doc in docs],
        key=itemgetter(0), reverse=True
    )

    # Segment our results into the section we will use and then retrieve
    # their document trees for the summarization process. We only want to
    # summarize the range of documents for the specific page we are
    # rendering.
    results = results[(page - 1) * results_per_page:page * results_per_page]
    trees = TreeModel.get([doc['tree_id'] for score, doc in results])
    for score, doc in results:
        doc['tree'] = trees[doc['tree_id']]

    summarize = Summarizer(tokens)
    start_time = time.time()
    results = [
        (score, summarize(document['tree']), document)
        for score, document in results
    ]
    logging.debug('%.4f seconds to summarize and return results' % (
        time.time() - start_time
    ))

    return len(docs), results
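
# A minimal usage sketch, not taken from this codebase: the `Index` class
# name and the query string are assumed for illustration; search() is the
# method defined above. Assuming the database has already been populated
# by the indexing phase:
#
#   index = Index()
#   num_results, results = index.search('python decorators', page=1)
#   print('%d total matches' % num_results)
#   for score, summary, document in results:
#       print('%.3f  %s  %s' % (score, document['meta']['name'], summary))
#
# Note that num_results counts every deduplicated match, while the results
# list holds only the page requested, so pagination UIs can show a total
# without scoring or summarizing more than one page at a time.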