Example #1
class WhooshTrecNewsRedis(Engine):
    """
    A revised Whoosh ifind engine.
    Implemented by dmax. Uses a new way of poking the postings file by @leifos, and also some tasty Redis caching.
    """
    def __init__(self,
                 whoosh_index_dir='',
                 use_cache=True,
                 cache_host='localhost',
                 cache_port=6379,
                 **kwargs):
        """
        Constructor for the engine.
        """
        Engine.__init__(self, **kwargs)

        self.whoosh_index_dir = whoosh_index_dir
        if not self.whoosh_index_dir:
            raise EngineConnectionException(
                self.name,
                "'whoosh_index_dir=' keyword argument not specified")

        #  Only PL2 is wired in for now (to support other models, add a model parameter to the constructor).
        self.scoring_model_identifier = 1
        self.scoring_model = scoring.PL2(c=10.0)

        try:
            self.doc_index = open_dir(self.whoosh_index_dir)
            self.reader = self.doc_index.reader()
            self.parser = QueryParser(
                'content',
                self.doc_index.schema)  # By default, we use AND grouping.
            # Use the grouping parameter and specify whoosh.qparser.OrGroup, etc...

            #  Objects required for document snippet generation
            self.analyzer = self.doc_index.schema[
                self.parser.fieldname].analyzer
            self.fragmenter = ContextFragmenter(maxchars=200, surround=40)
            self.formatter = HtmlFormatter()
        except EmptyIndexError:
            message = "Could not open Whoosh index at '{0}'".format(
                self.whoosh_index_dir)
            raise EngineConnectionException(self.name, message)
        except OSError:
            message = "Could not open Whoosh index at '{0}' - directory does not exist".format(
                self.whoosh_index_dir)
            raise EngineConnectionException(self.name, message)

        self.use_cache = use_cache
        if self.use_cache:
            self.cache = RedisConn(host=cache_host, port=cache_port)
            self.cache.connect()

            self.page_cache_forward_look = 40  # How many additional pages to cache when required.
            self.page_cache_when = 4  # When the user is x pages away from the end of the page cache, cache more pages.

            self.page_cache_controller = PageCacheController(
                cache_host=self.cache.host,
                cache_port=self.cache.port,
                whoosh_index=self.doc_index,
                scoring_model_identifier=self.scoring_model_identifier,
                parser=self.parser,
                analyzer=self.analyzer,
                fragmenter=self.fragmenter,
                formatter=self.formatter,
                cache_forward_look=self.page_cache_forward_look)

    def _search(self, query):
        """
        The concrete method of the Engine's search interface method, search().
        """
        self.__parse_query_terms(query)

        page_cache_key = get_cache_key(
            model_identifier=self.scoring_model_identifier,
            fieldname=self.parser.fieldname,
            term=query.terms,
            key_type=2,
            page=query.skip)

        if self.use_cache and self.cache.exists(
                page_cache_key
        ):  # If true, we have a page cached - so use this!
            page_results = self.cache.get(page_cache_key)
            highest_cached_page = self.__get_highest_cached_page(query)

            if highest_cached_page - query.skip < self.page_cache_when:  # Do we need to cache some more pages?
                self.__add_to_page_cacher((highest_cached_page + 1), query,
                                          page_results)

            return parse_response(reader=self.reader,
                                  fieldname=self.parser.fieldname,
                                  analyzer=self.analyzer,
                                  fragmenter=self.fragmenter,
                                  formatter=self.formatter,
                                  query=query,
                                  results=page_results,
                                  results_are_page=True)
        else:  # No page is cached, so we get the results for that page - and no doubt cache some more pages.
            return self._request(query)

    def _request(self, query):
        """
        Services a request that does not have a page cached.
        Returns an ifind Response object using information from either the Redis cache or Whoosh.
        """
        query_cache_key = get_cache_key(
            model_identifier=self.scoring_model_identifier,
            fieldname=self.parser.fieldname,
            term=query.terms,
            key_type=1)

        if self.use_cache and self.cache.exists(query_cache_key):
            sorted_results = self.cache.get(query_cache_key)
        else:
            with self.doc_index.searcher(
                    weighting=self.scoring_model) as searcher:
                doc_scores = {}

                if isinstance(query.parsed_terms, unicode):
                    doc_term_scores = self.__get_doc_term_scores(
                        searcher, query.parsed_terms)
                    self.__update_scores(doc_scores, doc_term_scores)
                else:
                    try:
                        for term in query.parsed_terms:
                            doc_term_scores = self.__get_doc_term_scores(
                                searcher, term.text)
                            self.__update_scores(doc_scores, doc_term_scores)
                    except NotImplementedError:
                        pass

            sorted_results = sorted(doc_scores.iteritems(),
                                    key=itemgetter(1),
                                    reverse=True)

        # This block of code checks if additional page caching is required.
        # This will arise when no pages for the given query are cached, or the user is close to reaching the end of
        # the cached page collection for the given query.
        if self.use_cache:
            self.cache.store(query_cache_key, sorted_results)
            highest_cached_page = self.__get_highest_cached_page(query)

            if highest_cached_page == -1:  # Cache pages from page 1.
                self.__add_to_page_cacher(1, query, sorted_results)
            elif highest_cached_page - query.skip < self.page_cache_when:  # Start caching from page x
                self.__add_to_page_cacher((highest_cached_page + 1), query,
                                          sorted_results)

        return parse_response(reader=self.reader,
                              fieldname=self.parser.fieldname,
                              analyzer=self.analyzer,
                              fragmenter=self.fragmenter,
                              formatter=self.formatter,
                              query=query,
                              results=sorted_results)

    def __parse_query_terms(self, query):
        """
        Parses the query terms provided.
        Creates a Whoosh compound query type in query.parsed_terms if more than one term is specified.
        If only a single term is specified, a unicode string instance is used instead.
        """
        def tidy_terms(query):
            """
            Nested function to remove unwanted query terms (e.g. AND, OR, NOT) from the query.
            Also tidies the query by removing redundant whitespace and newline characters.
            """
            ignore = [
                'and', 'or', 'not', 'in', 'the', 'a', 'to'
            ]  # Terms to be ignored. These are not included in the tidied querystring.
            # Ensure the terms in the list are all lowercase!

            terms = query.terms
            terms = terms.lower()
            terms = terms.strip()
            terms = terms.split()

            query.terms = ""

            for term in terms:
                if term not in ignore:
                    query.terms = "{0} {1}".format(query.terms, term)

            query.terms = query.terms.strip()
            query.terms = unicode(query.terms)

        if not query.top:
            raise QueryParamException(
                self.name,
                "Total number of results (query.top) not specified.")

        if query.top < 1:
            raise QueryParamException(
                self.name, "Page length (query.top) must be at least 1.")

        tidy_terms(query)

        if len(query.terms.split()) == 1:
            query.parsed_terms = unicode(query.terms)
        else:
            query.parsed_terms = self.parser.parse(query.terms)

        query.terms = query.terms.strip()

    def __get_doc_term_scores(self, searcher, term):
        """
        Returns a dictionary object comprised of Whoosh document IDs for keys, and scores as values.
        The scores correspond to how relevant the given document is to the given term.
        Parameter term should be a unicode string; the Whoosh searcher instance should be provided as parameter searcher.
        """
        doc_term_scores = {}
        term_cache_key = get_cache_key(
            model_identifier=self.scoring_model_identifier,
            fieldname=self.parser.fieldname,
            term=term,
            key_type=0)

        if self.use_cache and self.cache.exists(term_cache_key):
            return self.cache.get(term_cache_key)  # That was simple!
        else:
            try:
                postings = searcher.postings(self.parser.fieldname, term)

                for i in postings.all_ids():  # Note: iterating over every posting can be slow for common terms.
                    doc_term_scores[i] = postings.score()

            except TermNotFound:  # If the term is not found in the inverted index, do nada.
                pass

        if self.use_cache:  # If caching is enabled, cache the results. If we are here, we need to cache them!
            self.cache.store(term_cache_key, doc_term_scores)

        return doc_term_scores

    def __update_scores(self, doc_scores, doc_term_scores):
        """
        Updates the doc_scores dictionary with the rankings from doc_term_scores.
        The end result is doc_scores will have a cumulative total for each document for each term.
        """
        for i in doc_term_scores:
            if i in doc_scores:
                doc_scores[i] = doc_scores[i] + doc_term_scores[i]
            else:
                doc_scores[i] = doc_term_scores[i]

    def __get_highest_cached_page(self, query):
        """
        For a given query, returns the highest cached page number.
        For example, if pages 1-10 for a given query are cached, 10 would be returned.

        If no pages are cached for the given query, -1 is returned.
        This method assumes that pages are cached in a linear fashion - there are no gaps where pages are not cached.

        If caching is not enabled, -1 is always returned.
        """
        if not self.use_cache:
            return -1

        wildcard_key = '{0}:page:*:{1}:{2}'.format(
            self.scoring_model_identifier, self.parser.fieldname, query.terms)
        matching_keys = self.cache.keys(wildcard_key)
        highest_page = -1

        if len(matching_keys) == 0:
            return -1
        else:
            for key in matching_keys:
                key = key.split(':')
                page = int(key[2])

                if page > highest_page:
                    highest_page = page

        return highest_page

    def __add_to_page_cacher(self, start_page, query, results):
        """
        Adds a page to the queue in the caching thread.
        If the thread is not running (e.g. it terminated after being idle), a new thread is started.
        """
        if not self.cache:
            return

        if not self.page_cache_controller.is_alive():
            try:
                self.page_cache_controller.start()
            except RuntimeError:
                self.page_cache_controller = PageCacheController(
                    cache_host=self.cache.host,
                    cache_port=self.cache.port,
                    whoosh_index=self.doc_index,
                    scoring_model_identifier=self.scoring_model_identifier,
                    parser=self.parser,
                    analyzer=self.analyzer,
                    fragmenter=self.fragmenter,
                    formatter=self.formatter,
                    cache_forward_look=self.page_cache_forward_look)
                self.page_cache_controller.start()

        # The page caching thread is guaranteed to be alive at this point - so we can now add to its queue.
        self.page_cache_controller.add(start_page, query, results)
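A minimal usage sketch for the engine above. Query construction and the Response layout are assumptions here - the code only shows that the engine reads query.terms, query.top and query.skip - while search() itself is confirmed by the _search() docstring:

engine = WhooshTrecNewsRedis(whoosh_index_dir='/path/to/index',  # hypothetical path
                             use_cache=True,
                             cache_host='localhost',
                             cache_port=6379)

query = Query(u'financial crisis')  # assumed ifind Query constructor
query.top = 10   # results per page - required by __parse_query_terms()
query.skip = 1   # page number - used to build the page cache key

response = engine.search(query)  # Engine.search() dispatches to _search()
for result in response.results:  # Response attributes are assumed, not shown above
    print result.title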
Example #2
class WhooshTrecNewsRedis(Engine):
    """
    A revised Whoosh ifind engine.
    Implemented by dmax. Uses a new way of poking the postings file by @leifos, and also some tasty Redis caching.
    """

    def __init__(self, whoosh_index_dir="", use_cache=True, cache_host="localhost", cache_port=6379, **kwargs):
        """
        Constructor for the engine.
        """
        Engine.__init__(self, **kwargs)

        self.whoosh_index_dir = whoosh_index_dir
        if not self.whoosh_index_dir:
            raise EngineConnectionException(self.name, "'whoosh_index_dir=' keyword argument not specified")

        #  Only PL2 is wired in for now (to support other models, add a model parameter to the constructor).
        self.scoring_model_identifier = 1
        self.scoring_model = scoring.PL2(c=10.0)

        try:
            self.doc_index = open_dir(self.whoosh_index_dir)
            self.reader = self.doc_index.reader()

            self.parser = QueryParser("content", self.doc_index.schema)  # By default, we use AND grouping.
            # Use the grouping parameter and specify whoosh.qparser.OrGroup, etc...

            #  Objects required for document snippet generation
            self.analyzer = self.doc_index.schema[self.parser.fieldname].analyzer
            self.fragmenter = ContextFragmenter(maxchars=200, surround=40)
            self.formatter = HtmlFormatter()
        except EmptyIndexError:
            message = "Could not open Whoosh index at '{0}'".format(self.whoosh_index_dir)
            raise EngineConnectionException(self.name, message)
        except OSError:
            message = "Could not open Whoosh index at '{0}' - directory does not exist".format(self.whoosh_index_dir)
            raise EngineConnectionException(self.name, message)

        self.use_cache = use_cache
        if self.use_cache:
            self.cache = RedisConn(host=cache_host, port=cache_port)
            self.cache.connect()

            self.page_cache_forward_look = 40  # How many additional pages to cache when required.
            self.page_cache_when = 4  # When the user is x pages away from the end of the page cache, cache more pages.

            self.page_cache_controller = PageCacheController(
                cache_host=self.cache.host,
                cache_port=self.cache.port,
                whoosh_index=self.doc_index,
                scoring_model_identifier=self.scoring_model_identifier,
                parser=self.parser,
                analyzer=self.analyzer,
                fragmenter=self.fragmenter,
                formatter=self.formatter,
                cache_forward_look=self.page_cache_forward_look,
            )

    def _search(self, query):
        """
        The concrete method of the Engine's search interface method, search().
        """
        self.__parse_query_terms(query)

        page_cache_key = get_cache_key(
            model_identifier=self.scoring_model_identifier,
            fieldname=self.parser.fieldname,
            term=query.terms,
            key_type=2,
            page=query.skip,
        )

        if self.use_cache and self.cache.exists(page_cache_key):  # If true, we have a page cached - so use this!
            page_results = self.cache.get(page_cache_key)
            highest_cached_page = self.__get_highest_cached_page(query)

            if highest_cached_page - query.skip < self.page_cache_when:  # Do we need to cache some more pages?
                self.__add_to_page_cacher((highest_cached_page + 1), query, page_results)

            return parse_response(
                reader=self.reader,
                fieldname=self.parser.fieldname,
                analyzer=self.analyzer,
                fragmenter=self.fragmenter,
                formatter=self.formatter,
                query=query,
                results=page_results,
                results_are_page=True,
            )
        else:  # No page is cached, so we get the results for that page - and no doubt cache some more pages.
            return self._request(query)

    def _request(self, query):
        """
        Services a request that does not have a page cached.
        Returns an ifind Response object using information from either the Redis cache or Whoosh.
        """
        query_cache_key = get_cache_key(
            model_identifier=self.scoring_model_identifier,
            fieldname=self.parser.fieldname,
            term=query.terms,
            key_type=1,
        )

        if self.use_cache and self.cache.exists(query_cache_key):
            sorted_results = self.cache.get(query_cache_key)
        else:
            with self.doc_index.searcher(weighting=self.scoring_model) as searcher:
                doc_scores = {}

                if isinstance(query.parsed_terms, unicode):
                    doc_term_scores = self.__get_doc_term_scores(searcher, query.parsed_terms)
                    self.__update_scores(doc_scores, doc_term_scores)
                else:
                    try:
                        for term in query.parsed_terms:
                            doc_term_scores = self.__get_doc_term_scores(searcher, term.text)
                            self.__update_scores(doc_scores, doc_term_scores)
                    except NotImplementedError:
                        pass

            sorted_results = sorted(doc_scores.iteritems(), key=itemgetter(1), reverse=True)

        # This block of code checks if additional page caching is required.
        # This will arise when no pages for the given query are cached, or the user is close to reaching the end of
        # the cached page collection for the given query.
        if self.use_cache:
            self.cache.store(query_cache_key, sorted_results)
            highest_cached_page = self.__get_highest_cached_page(query)

            if highest_cached_page == -1:  # Cache pages from page 1.
                self.__add_to_page_cacher(1, query, sorted_results)
            elif highest_cached_page - query.skip < self.page_cache_when:  # Start caching from page x
                self.__add_to_page_cacher((highest_cached_page + 1), query, sorted_results)

        return parse_response(
            reader=self.reader,
            fieldname=self.parser.fieldname,
            analyzer=self.analyzer,
            fragmenter=self.fragmenter,
            formatter=self.formatter,
            query=query,
            results=sorted_results,
        )

    def __parse_query_terms(self, query):
        """
        Parses the query terms provided.
        Creates a Whoosh compound query type in query.parsed_terms if more than one term is specified.
        If only a single term is specified, a unicode string instance is used instead.
        """

        def tidy_terms(query):
            """
            Nested function to remove unwanted query terms (e.g. AND, OR, NOT) from the query.
            Also tidies the query by removing redundant whitespace and newline characters.
            """
            ignore = [
                "and",
                "or",
                "not",
                "in",
                "the",
                "a",
                "to",
            ]  # Terms to be ignored. These are not included in the tidied querystring.
            # Ensure the terms in the list are all lowercase!

            terms = query.terms
            terms = terms.lower()
            terms = terms.strip()
            terms = terms.split()

            query.terms = ""

            for term in terms:
                if term not in ignore:
                    query.terms = "{0} {1}".format(query.terms, term)

            query.terms = query.terms.strip()
            query.terms = unicode(query.terms)

        if not query.top:
            raise QueryParamException(self.name, "Total number of results (query.top) not specified.")

        if query.top < 1:
            raise QueryParamException(self.name, "Page length (query.top) must be at least 1.")

        tidy_terms(query)

        if len(query.terms.split()) == 1:
            query.parsed_terms = unicode(query.terms)
        else:
            query.parsed_terms = self.parser.parse(query.terms)

        query.terms = query.terms.strip()

    def __get_doc_term_scores(self, searcher, term):
        """
        Returns a dictionary object comprised of Whoosh document IDs for keys, and scores as values.
        The scores correspond to how relevant the given document is to the given term.
        Parameter term should be a unicode string; the Whoosh searcher instance should be provided as parameter searcher.
        """
        doc_term_scores = {}
        term_cache_key = get_cache_key(
            model_identifier=self.scoring_model_identifier, fieldname=self.parser.fieldname, term=term, key_type=0
        )

        if self.use_cache and self.cache.exists(term_cache_key):
            return self.cache.get(term_cache_key)  # That was simple!
        else:
            try:
                postings = searcher.postings(self.parser.fieldname, term)

                for i in postings.all_ids():
                    doc_term_scores[i] = postings.score()

            except TermNotFound:  # If the term is not found in the inverted index, do nada.
                pass

        if self.use_cache:  # If caching is enabled, cache the results. If we are here, we need to cache them!
            self.cache.store(term_cache_key, doc_term_scores)

        return doc_term_scores

    def __update_scores(self, doc_scores, doc_term_scores):
        """
        Updates the doc_scores dictionary with the rankings from doc_term_scores.
        The end result is doc_scores will have a cumulative total for each document for each term.
        """
        for i in doc_term_scores:
            if i in doc_scores:
                doc_scores[i] = doc_scores[i] + doc_term_scores[i]
            else:
                doc_scores[i] = doc_term_scores[i]

    def __get_highest_cached_page(self, query):
        """
        For a given query, returns the highest cached page number.
        For example, if pages 1-10 for a given query are cached, 10 would be returned.

        If no pages are cached for the given query, -1 is returned.
        This method assumes that pages are cached in a linear fashion - there are no gaps where pages are not cached.

        If caching is not enabled, -1 is always returned.
        """
        if not self.use_cache:
            return -1

        wildcard_key = "{0}:page:*:{1}:{2}".format(self.scoring_model_identifier, self.parser.fieldname, query.terms)
        matching_keys = self.cache.keys(wildcard_key)
        highest_page = -1

        if len(matching_keys) == 0:
            return -1
        else:
            for key in matching_keys:
                key = key.split(":")
                page = int(key[2])

                if page > highest_page:
                    highest_page = page

        return highest_page

    def __add_to_page_cacher(self, start_page, query, results):
        """
        Adds a page to the queue in the caching thread.
        If the thread is not running (e.g. it terminated after being idle), a new thread is started.
        """
        if not self.cache:
            return

        if not self.page_cache_controller.is_alive():
            try:
                self.page_cache_controller.start()
            except RuntimeError:
                self.page_cache_controller = PageCacheController(
                    cache_host=self.cache.host,
                    cache_port=self.cache.port,
                    whoosh_index=self.doc_index,
                    scoring_model_identifier=self.scoring_model_identifier,
                    parser=self.parser,
                    analyzer=self.analyzer,
                    fragmenter=self.fragmenter,
                    formatter=self.formatter,
                    cache_forward_look=self.page_cache_forward_look,
                )
                self.page_cache_controller.start()

        # The page caching thread is guaranteed to be alive at this point - so we can now add to its queue.
        self.page_cache_controller.add(start_page, query, results)
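Example #2 is the same engine as Example #1, reformatted. One detail both versions rely on but never show is the layout of the keys produced by get_cache_key(). Only the page-key layout (key_type=2) can be recovered from the code, via the wildcard '{0}:page:*:{1}:{2}' and the int(key.split(':')[2]) lookup in __get_highest_cached_page(); the term and query layouts in the sketch below are guesses:

def get_cache_key_sketch(model_identifier, fieldname, term, key_type, page=None):
    """Hypothetical stand-in for ifind's get_cache_key() - not the real implementation."""
    if key_type == 2:
        # Must match '{0}:page:*:{1}:{2}' so that key.split(':')[2] yields the page number.
        return '{0}:page:{1}:{2}:{3}'.format(model_identifier, page, fieldname, term)
    elif key_type == 1:
        return '{0}:query:{1}:{2}'.format(model_identifier, fieldname, term)  # guessed layout
    return '{0}:term:{1}:{2}'.format(model_identifier, fieldname, term)  # guessed layout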
Example #3
        #  To avoid this, check whether the parsed query is unicode - if it is, there is only one term; otherwise there is more than one.
        if isinstance(whoosh_query, unicode):
            doc_term_scores = {}
            try:
                postings = searcher.postings(field_name, whoosh_query)
                for i in postings.all_ids():
                    doc_term_scores[i] = postings.score()
            except TermNotFound:  # Term is not in the inverted index - leave the scores empty.
                pass
            update_scores(doc_scores, doc_term_scores)
        else:
            for term in whoosh_query:
                doc_term_scores = {}
                key = make_key(term.fieldname, term.text)

                if rc.exists(key):
                    doc_term_scores = get_postings_from_redis(rc, term.fieldname, term.text)
                else:
                    try:
                        postings = searcher.postings(term.fieldname, term.text)
                        for i in postings.all_ids():
                            doc_term_scores[i] = postings.score()
                    except TermNotFound:  # Term is not in the inverted index - cache an empty result.
                        pass

                    set_postings_to_redis(rc, term.fieldname, term.text, doc_term_scores)

                update_scores(doc_scores, doc_term_scores)

        results = []
        n = len(doc_scores)
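The fragment above calls three Redis helpers that are not shown. A sketch of what they might look like, assuming rc is a redis-py client and per-term postings dictionaries are small enough to pickle whole (the names match the fragment; the bodies are guesses):

import pickle

def make_key(fieldname, text):
    # Hypothetical key layout - the real one is not shown in the fragment.
    return 'postings:{0}:{1}'.format(fieldname, text)

def get_postings_from_redis(rc, fieldname, text):
    raw = rc.get(make_key(fieldname, text))
    return pickle.loads(raw) if raw is not None else {}

def set_postings_to_redis(rc, fieldname, text, doc_term_scores):
    rc.set(make_key(fieldname, text), pickle.dumps(doc_term_scores))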
Example #4
class WhooshTrecNewsRedis(Engine):
    """
    A revised Whoosh ifind engine.
    Implemented by dmax. Uses a new way of poking the postings file by @leifos, and also some tasty Redis caching.
    """
    def __init__(self, whoosh_index_dir='', stopwords_file='', cache_host='localhost', cache_port=6379, **kwargs):
        Engine.__init__(self, **kwargs)

        self.whoosh_index_dir = whoosh_index_dir
        if not self.whoosh_index_dir:
            raise EngineConnectionException(self.name, "'whoosh_index_dir=' keyword argument not specified")

        self.stopwords_file = stopwords_file
        if self.stopwords_file:
            self.stopwords = ListReader(self.stopwords_file)  # Open the stopwords file, read into a ListReader
        else:
            raise EngineConnectionException(self.name, "'stopwords_file=' keyword argument not specified")

        self.scoring_model_identifier = 1
        self.scoring_model = scoring.PL2(c=10.0)
        
        self.__verbose = False

        try:
            self.doc_index = open_dir(self.whoosh_index_dir)
            self.reader = self.doc_index.reader()
            self.parser = QueryParser('content', self.doc_index.schema)  # By default, we use AND grouping.
                                                                         # Use the grouping parameter and specify whoosh.qparser.OrGroup, etc...

            #  Objects required for document snippet generation
            self.analyzer = self.doc_index.schema[self.parser.fieldname].analyzer
            self.fragmenter = ContextFragmenter(maxchars=200, surround=40)
            self.formatter = HtmlFormatter()
        except EmptyIndexError:
            message = "Could not open Whoosh index at '{0}'".format(self.whoosh_index_dir)
            raise EngineConnectionException(self.name, message)
        except OSError:
            message = "Could not open Whoosh index at '{0}' - directory does not exist".format(self.whoosh_index_dir)
            raise EngineConnectionException(self.name, message)

        # Attempt to connect to the specified Redis cache.
        self.cache = RedisConn(host=cache_host, port=cache_port)
        self.cache.connect()

    def _search(self, query):
        """
        The concrete search method.
        """
        self.__parse_query_terms(query)

        with self.doc_index.searcher(weighting=self.scoring_model) as searcher:
            doc_scores = {}

            if isinstance(query.parsed_terms, unicode):
                t = time.time()
                doc_term_scores = self.__get_doc_term_scores(searcher, query.parsed_terms)
                t = time.time() - t
                
                if self.__verbose:
                    print "  > Retrieve results for '{0}': {1}".format(query.parsed_terms, t)

                t = time.time()
                self.__update_scores(doc_scores, doc_term_scores)
                t = time.time() - t
                
                if self.__verbose:
                    print "  >> Time to update scores: {0}".format(t)
            else:
                try:
                    for term in query.parsed_terms:
                        t = time.time()
                        doc_term_scores = self.__get_doc_term_scores(searcher, term.text)
                        t = time.time() - t
                        
                        if self.__verbose:
                            print "  > Retrieve results for '{0}': {1}".format(term, t)

                        t = time.time()
                        self.__update_scores(doc_scores, doc_term_scores)
                        t = time.time() - t
                        
                        if self.__verbose:
                            print "  >> Time to update scores: {0}".format(t)
                except NotImplementedError:
                    pass

        t = time.time()
        sorted_results = sorted(doc_scores.iteritems(), key=itemgetter(1), reverse=True)
        t = time.time() - t
        
        if self.__verbose:
            print "  > Time to sort results: {0}".format(t)

        return parse_response(reader=self.reader,
                              fieldname=self.parser.fieldname,
                              analyzer=self.analyzer,
                              fragmenter=self.fragmenter,
                              formatter=self.formatter,
                              query=query,
                              results=sorted_results)

    def __get_doc_term_scores(self, searcher, term):
        """
        Returns a dictionary object comprised of Whoosh document IDs for keys, and scores as values.
        The scores correspond to how relevant the given document is to the given term.
        Parameter term should be a unicode string; the Whoosh searcher instance should be provided as parameter searcher.
        """
        doc_term_scores = {}
        term_cache_key = get_cache_key(model_identifier=self.scoring_model_identifier,
                                       fieldname=self.parser.fieldname,
                                       term=term)

        if self.cache.exists(term_cache_key):
            if self.__verbose:
                print "  >> Results are cached"
            return self.cache.get(term_cache_key)  # Easy peasy, return the object from the cache.
        else:
            if self.__verbose:
                print "  >> Results not cached"
            try:
                postings = searcher.postings(self.parser.fieldname, term)

                for i in postings.all_ids():  # Slow - @leifos has an approach to speed up this postings traversal.
                    doc_term_scores[i] = postings.score()
            except TermNotFound:
                pass

            self.cache.store(term_cache_key, doc_term_scores)

        return doc_term_scores

    def __parse_query_terms(self, query):
        """
        Using the stopwords list provided, parses the query object and prepares it for being sent to the engine.
        """

        if not query.top:
            raise QueryParamException(self.name, "Total number of results (query.top) not specified.")

        if query.top < 1:
            raise QueryParamException(self.name, "Page length (query.top) must be at least 1.")

        # Tidy up the querystring. Split it into individual terms so we can process them.
        terms = query.terms
        terms = terms.lower()
        terms = terms.strip()
        terms = terms.split()  # Chop!

        query.terms = ""  # Reset the query's terms string to a blank string - we will rebuild it.

        for term in terms:
            if term not in self.stopwords:
                query.terms = "{0} {1}".format(query.terms, term)

        query.terms = query.terms.strip()
        query.terms = unicode(query.terms)

        if len(query.terms.split()) == 1:
            query.parsed_terms = unicode(query.terms)
        else:
            query.parsed_terms = self.parser.parse(query.terms)

    def __update_scores(self, doc_scores, doc_term_scores):
        """
        Updates the doc_scores dictionary with the rankings from doc_term_scores.
        It is cumulative - doc_scores ends up with a running total for each document across all terms.
        """
        for i in doc_term_scores:
            if i in doc_scores:
                doc_scores[i] = doc_scores[i] + doc_term_scores[i]
            else:
                doc_scores[i] = doc_term_scores[i]
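All four examples share the same term-at-a-time core: build a {doc_id: score} dictionary per term, merge the dictionaries cumulatively, then sort by total score. A self-contained sketch of that step, with made-up scores standing in for postings.score():

from operator import itemgetter

def update_scores(doc_scores, doc_term_scores):
    # Same cumulative merge as __update_scores() above.
    for doc_id, score in doc_term_scores.items():
        doc_scores[doc_id] = doc_scores.get(doc_id, 0.0) + score

doc_scores = {}
update_scores(doc_scores, {1: 2.5, 2: 1.0})  # term A postings scores (made up)
update_scores(doc_scores, {2: 3.0, 3: 0.5})  # term B postings scores (made up)

ranked = sorted(doc_scores.items(), key=itemgetter(1), reverse=True)
print ranked  # prints [(2, 4.0), (1, 2.5), (3, 0.5)]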