Example #1
            logging.debug('Found 0 unique terms, skipping %s' % (
                parse_data['meta']['name']
            ))
            return

        # Add the file name and language to the unique terms list. We don't
        # need to check that these terms are unique because this is the only
        # way they can be contained in this context.
        unique_terms.extend([
            [parse_data['meta']['name'], context_id_map['file']],
            [parse_data['meta']['language'], context_id_map['language']]
        ])
        
        # Construct our document data structure with its document tree
        # and embed it properly!
        tree_id = TreeModel.insert(tree)
        doc = dict(
            tree_id=tree_id,
            signals=parse_data['tree'].signals,
            meta=parse_data['meta'],
        )
        doc_id = DocumentModel.insert(doc)

        # Record our TermID -> DocID mapping so we can atomically update
        # the database later with these pointers in the final phase.
        for term, context in unique_terms:
            self.terms.setdefault(term, []).append([doc_id, context])
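
The "final phase" mentioned in the last comment is not shown in this
example. As a rough sketch only, assuming a hypothetical TermModel with a
bulk-append API, the flush could look something like this:

    def flush_terms(self):
        # Hypothetical final phase: write the buffered term -> [doc_id,
        # context] postings to the database in one pass, then reset the
        # in-memory buffer. TermModel.append_postings is an assumed API,
        # not one taken from the example above.
        for term, postings in self.terms.items():
            TermModel.append_postings(term, postings)
        self.terms = {}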
Example #2
    def search(self, query, page=1, results_per_page=10):
        """
        Returns a list of documents with their corresponding scores and
        summaries that are specific to the given query.

        The data is returned in the following format:
            (num_results, [(score, summary, document), ...])

        The input parameters are as follows:
            query               The query string
            page                The page number; must be a positive,
                                non-zero integer
            results_per_page    The number of results per page; must be a
                                positive, non-zero integer
        """

        # Take the query string and translate it into the format we require
        # for fast processing: [(context_id, term_id), ...]
        tokens = query_translate(query)
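        # For illustration only (all IDs hypothetical): a query such as
        # 'language:python parser' might translate to something like
        # [(<language context_id>, <term_id for 'python'>),
        #  (<default context_id>, <term_id for 'parser'>)].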

        # Retrieve the relevant documents for our given query tokens and
        # their associated classifications.
        start_time = time.time()
        docs = self._get_documents(tokens)
        logging.debug('%.2f seconds to retrieve and sort %d relevant docs' % (
            time.time() - start_time,
            len(docs)
        ))
        
        # Remove duplicate documents from our results before we even bother
        # scoring them or doing anything else.
        docs = self._remove_duplicates(docs)

        # Now that we have the matching documents we can calculate each
        # document's total score for our query and then sort them from
        # highest to lowest.
        calculator = Calculator(tokens)
        results = sorted(
            [(calculator(doc), doc) for doc in docs],
            key=itemgetter(0),
            reverse=True
        )

        # Now segment our results into the section we will use and then
        # retrieve their document trees for the summarization process. We
        # only want to summarize the range of documents for the specific
        # page we are rendering.
        results = results[(page-1)*results_per_page:page*results_per_page]
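        # For example, page=2 with results_per_page=10 keeps results[10:20],
        # i.e. the second page of ten hits.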
        trees = TreeModel.get([doc['tree_id'] for score, doc in results])
        for score, doc in results:
            doc['tree'] = trees[doc['tree_id']]

        summarize = Summarizer(tokens)
        start_time = time.time()
        results = [
            (score, summarize(document['tree']), document)
            for score, document in results
        ]
        logging.debug('%.4f seconds to summarize and return results' % (
            time.time() - start_time
        ))
        
        return len(docs), results
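
A minimal usage sketch, assuming the class defining search() is
instantiated as an index object (the Index name here is hypothetical):

    index = Index()
    num_results, hits = index.search('binary tree', page=1,
                                     results_per_page=10)
    print('%d documents matched' % num_results)
    for score, summary, document in hits:
        print('%.3f  %s' % (score, document['meta']['name']))
        print('       %s' % summary)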