def _get_documents(self, tokens):
    """Return the unique documents matching *tokens*, sorted by score.

    Documents are ordered from highest score to lowest (ordering is
    produced by the retrieval helpers below — TODO confirm against
    ``_retrieve_documents``).

    Args:
        tokens: iterable of ``(context_id, term_id)`` pairs describing
            the query terms and the context each must occur in.

    Returns:
        The document structures produced by ``self._retrieve_documents``
        (presumably a score-sorted collection keyed by doc id — verify
        against that helper).
    """
    # Collapse the (context_id, term_id) pairs into a term_id -> context_id
    # map for the lookups below. NOTE: if the same term_id appears with
    # several contexts, the last pair wins — this mirrors the original
    # overwrite behavior.
    token_map = {term_id: context_id for context_id, term_id in tokens}

    # Retrieve our dictionary of Term -> [(doc, context), ...] from the
    # database.
    start_time = time.time()
    term_doc_map = TermModel.get_term_doc_map(token_map.keys())
    # Lazy %-args: the message is only formatted if DEBUG is enabled.
    logging.debug('Took %.4fs to retrieve data structure for %d terms',
                  time.time() - start_time, len(term_doc_map))

    # Regroup by document: build a map of doc_id -> [[context_id, term_id],
    # ...] listing each term and the context in which it occurs.
    #
    # _organize also drops document ids whose terms do not occur in the
    # contexts the user required (when a context was specified in the
    # token list).
    start_time = time.time()
    doc_term_map = self._organize(token_map, term_doc_map)
    logging.debug('Took %.4fs to rearrange data into structure for %d docs',
                  time.time() - start_time, len(doc_term_map))

    # Fetch the document data for every document relevant to the query,
    # producing the doc_id -> doc_data structure we return.
    start_time = time.time()
    docs = self._retrieve_documents(doc_term_map)
    logging.debug('Took %.4fs to retrieve %d document data from database',
                  time.time() - start_time, len(docs))

    return docs