Example #1
    def __init__(
        self,
        cache_host,
        cache_port,
        whoosh_index,
        scoring_model_identifier,
        parser,
        analyzer,
        fragmenter,
        formatter,
        cache_forward_look,
    ):
        """
        Constructor for an instance of the PageCacheController.
        """
        super(PageCacheController, self).__init__()
        self.__queue = Queue.Queue()
        self.__ticks_before_death = 60  # How many ticks the loop should do before dying off.

        #  Whoosh setup
        self.__reader = whoosh_index.reader()
        self.__analyzer = analyzer
        self.__fragmenter = fragmenter
        self.__formatter = formatter

        #  Cache setup
        self.__cache = RedisConn(host=cache_host, port=cache_port)
        self.__cache.connect()

        # Misc.
        self.__scoring_model_identifier = scoring_model_identifier
        self.__parser = parser
        self.__cache_forward_look = cache_forward_look
Example #2
    def __init__(self, whoosh_index_dir="", use_cache=True, cache_host="localhost", cache_port=6379, **kwargs):
        """
        Constructor for the engine.
        """
        Engine.__init__(self, **kwargs)

        self.whoosh_index_dir = whoosh_index_dir
        if not self.whoosh_index_dir:
            raise EngineConnectionException(self.name, "'whoosh_index_dir=' keyword argument not specified")

        #  Only put PL2 in for now (for more, add the model parameter to the constructor to specify!)
        self.scoring_model_identifier = 1
        self.scoring_model = scoring.PL2(c=10.0)

        try:
            self.doc_index = open_dir(self.whoosh_index_dir)
            self.reader = self.doc_index.reader()

            self.parser = QueryParser("content", self.doc_index.schema)  # By default, we use AND grouping.
            # Use the grouping parameter and specify whoosh.qparser.OrGroup, etc...

            #  Objects required for document snippet generation
            self.analyzer = self.doc_index.schema[self.parser.fieldname].analyzer
            self.fragmenter = ContextFragmenter(maxchars=200, surround=40)
            self.formatter = HtmlFormatter()
        except EmptyIndexError:
            message = "Could not open Whoosh index at '{0}'".format(self.whoosh_index_dir)
            raise EngineConnectionException(self.name, message)
        except OSError:
            message = "Could not open Whoosh index at '{0}' - directory does not exist".format(self.whoosh_index_dir)
            raise EngineConnectionException(self.name, message)

        self.use_cache = use_cache
        if self.use_cache:
            self.cache = RedisConn(host=cache_host, port=cache_port)
            self.cache.connect()

            self.page_cache_forward_look = 40  # How many additional pages to cache when required.
            self.page_cache_when = 4  # When the user is x pages away from the end of the page cache, cache more pages.

            self.page_cache_controller = PageCacheController(
                cache_host=self.cache.host,
                cache_port=self.cache.port,
                whoosh_index=self.doc_index,
                scoring_model_identifier=self.scoring_model_identifier,
                parser=self.parser,
                analyzer=self.analyzer,
                fragmenter=self.fragmenter,
                formatter=self.formatter,
                cache_forward_look=self.page_cache_forward_look,
            )
Example #3
    def __init__(self, whoosh_index_dir='', stopwords_file='', cache_host='localhost', cache_port=6379, **kwargs):
        Engine.__init__(self, **kwargs)

        self.whoosh_index_dir = whoosh_index_dir
        if not self.whoosh_index_dir:
            raise EngineConnectionException(self.name, "'whoosh_index_dir=' keyword argument not specified")

        self.stopwords_file = stopwords_file
        if self.stopwords_file:
            self.stopwords = ListReader(self.stopwords_file)  # Open the stopwords file, read into a ListReader
        else:
            raise EngineConnectionException(self.name, "'stopwords_file=' keyword argument not specified")

        self.scoring_model_identifier = 1
        self.scoring_model = scoring.PL2(c=10.0)
        
        self.__verbose = False

        try:
            self.doc_index = open_dir(self.whoosh_index_dir)
            self.reader = self.doc_index.reader()
            self.parser = QueryParser('content', self.doc_index.schema)  # By default, we use AND grouping.
                                                                         # Use the grouping parameter and specify whoosh.qparser.OrGroup, etc...

            #  Objects required for document snippet generation
            self.analyzer = self.doc_index.schema[self.parser.fieldname].analyzer
            self.fragmenter = ContextFragmenter(maxchars=200, surround=40)
            self.formatter = HtmlFormatter()
        except EmptyIndexError:
            message = "Could not open Whoosh index at '{0}'".format(self.whoosh_index_dir)
            raise EngineConnectionException(self.name, message)
        except OSError:
            message = "Could not open Whoosh index at '{0}' - directory does not exist".format(self.whoosh_index_dir)
            raise EngineConnectionException(self.name, message)

        # Attempt to connect to the specified Redis cache.
        self.cache = RedisConn(host=cache_host, port=cache_port)
        self.cache.connect()
Example #4
    def __init__(self, cache_host, cache_port, whoosh_index,
                 scoring_model_identifier, parser, analyzer, fragmenter,
                 formatter, cache_forward_look):
        """
        Constructor for an instance of the PageCacheController.
        """
        super(PageCacheController, self).__init__()
        self.__queue = Queue.Queue()
        self.__ticks_before_death = 60  # How many ticks the loop should do before dying off.

        #  Whoosh setup
        self.__reader = whoosh_index.reader()
        self.__analyzer = analyzer
        self.__fragmenter = fragmenter
        self.__formatter = formatter

        #  Cache setup
        self.__cache = RedisConn(host=cache_host, port=cache_port)
        self.__cache.connect()

        # Misc.
        self.__scoring_model_identifier = scoring_model_identifier
        self.__parser = parser
        self.__cache_forward_look = cache_forward_look
Example #5
class PageCacheController(Thread):
    """
    A class implemented by dmax that acts as a controller for caching individual pages.
    Launched as a separate thread, it contains a thread-safe Queue which is populated with page caching jobs to run.
    """
    def __init__(self, cache_host, cache_port, whoosh_index,
                 scoring_model_identifier, parser, analyzer, fragmenter,
                 formatter, cache_forward_look):
        """
        Constructor for an instance of the PageCacheController.
        """
        super(PageCacheController, self).__init__()
        self.__queue = Queue.Queue()
        self.__ticks_before_death = 60  # How many ticks the loop should do before dying off.

        #  Whoosh setup
        self.__reader = whoosh_index.reader()
        self.__analyzer = analyzer
        self.__fragmenter = fragmenter
        self.__formatter = formatter

        #  Cache setup
        self.__cache = RedisConn(host=cache_host, port=cache_port)
        self.__cache.connect()

        # Misc.
        self.__scoring_model_identifier = scoring_model_identifier
        self.__parser = parser
        self.__cache_forward_look = cache_forward_look

    def run(self):
        """
        The main body of the thread's execution; called when the thread is started.
        """
        ticks = 0  # After a certain number of ticks of inactivity, the thread will commit suicide.

        while True:
            if ticks == self.__ticks_before_death:
                print "Page cacher has timed out; dying. Bleugh!"
                break

            try:
                item = self.__queue.get(timeout=1)
                ticks = 0

                start_page = item[0]
                query = item[1]
                results = item[2]

                for curr_page in range(start_page, start_page + self.__cache_forward_look):
                    query.skip = curr_page
                    # Obtain results from the queue, not the cache. Even though the
                    # caching starts before this, there is no guarantee that the cache
                    # will be ready to service it!
                    page_results = get_page(query, results)

                    # If this is not true, the page looked at is greater than the
                    # highest page of results, so we do not cache.
                    if curr_page < page_results[1]:
                        page_cache_key = get_cache_key(
                            model_identifier=self.__scoring_model_identifier,
                            fieldname=self.__parser.fieldname,
                            term=query.terms,
                            key_type=2,
                            page=curr_page)

                        self.__cache.store(page_cache_key, page_results)  # Store the page.
                    else:
                        break

                print "  >>> PAGE_CACHE: Pages {0} to {1} cached for '{2}'".format(
                    start_page, curr_page, query.terms)
            except Queue.Empty:  # This is reached when we try to look for an item in the queue and find nothing.
                # So we're one tick closer to death...
                ticks = ticks + 1
                continue

    def add(self, start_page, query, results):
        """
        Adds an item to the queue for processing.
        """
        self.__queue.put((start_page, query, results))
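
For context, a minimal usage sketch of the controller, in the spirit of the engine examples below. The objects passed in (doc_index, parser, analyzer, fragmenter, formatter, query, results) are placeholders for the objects the engine already holds; this is an illustration, not part of the original source.

# A minimal, hypothetical usage sketch - the objects passed in below are
# assumed to be set up exactly as in the engine examples that follow.
controller = PageCacheController(cache_host='localhost',
                                 cache_port=6379,
                                 whoosh_index=doc_index,
                                 scoring_model_identifier=1,
                                 parser=parser,
                                 analyzer=analyzer,
                                 fragmenter=fragmenter,
                                 formatter=formatter,
                                 cache_forward_look=40)
controller.start()  # Launch the caching thread; it dies after 60 idle ticks.
controller.add(1, query, results)  # Queue a job: cache pages 1 onwards for this query.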
Example #6
    def __init__(self,
                 whoosh_index_dir='',
                 use_cache=True,
                 cache_host='localhost',
                 cache_port=6379,
                 **kwargs):
        """
        Constructor for the engine.
        """
        Engine.__init__(self, **kwargs)

        self.whoosh_index_dir = whoosh_index_dir
        if not self.whoosh_index_dir:
            raise EngineConnectionException(
                self.name,
                "'whoosh_index_dir=' keyword argument not specified")

        #  Only put PL2 in for now (for more, add the model parameter to the constructor to specify!)
        self.scoring_model_identifier = 1
        self.scoring_model = scoring.PL2(c=10.0)

        try:
            self.doc_index = open_dir(self.whoosh_index_dir)
            self.reader = self.doc_index.reader()
            self.parser = QueryParser(
                'content',
                self.doc_index.schema)  # By default, we use AND grouping.
            # Use the grouping parameter and specify whoosh.qparser.OrGroup, etc...

            #  Objects required for document snippet generation
            self.analyzer = self.doc_index.schema[
                self.parser.fieldname].analyzer
            self.fragmenter = ContextFragmenter(maxchars=200, surround=40)
            self.formatter = HtmlFormatter()
        except EmptyIndexError:
            message = "Could not open Whoosh index at '{0}'".format(
                self.whoosh_index_dir)
            raise EngineConnectionException(self.name, message)
        except OSError:
            message = "Could not open Whoosh index at '{0}' - directory does not exist".format(
                self.whoosh_index_dir)
            raise EngineConnectionException(self.name, message)

        self.use_cache = use_cache
        if self.use_cache:
            self.cache = RedisConn(host=cache_host, port=cache_port)
            self.cache.connect()

            self.page_cache_forward_look = 40  # How many additional pages to cache when required.
            self.page_cache_when = 4  # When the user is x pages away from the end of the page cache, cache more pages.

            self.page_cache_controller = PageCacheController(
                cache_host=self.cache.host,
                cache_port=self.cache.port,
                whoosh_index=self.doc_index,
                scoring_model_identifier=self.scoring_model_identifier,
                parser=self.parser,
                analyzer=self.analyzer,
                fragmenter=self.fragmenter,
                formatter=self.formatter,
                cache_forward_look=self.page_cache_forward_look)
Example #7
class WhooshTrecNewsRedis(Engine):
    """
    A revised Whoosh ifind engine.
    Implemented by dmax. Uses a new way of poking the postings file by @leifos, and also some tasty Redis caching.
    """
    def __init__(self,
                 whoosh_index_dir='',
                 use_cache=True,
                 cache_host='localhost',
                 cache_port=6379,
                 **kwargs):
        """
        Constructor for the engine.
        """
        Engine.__init__(self, **kwargs)

        self.whoosh_index_dir = whoosh_index_dir
        if not self.whoosh_index_dir:
            raise EngineConnectionException(
                self.name,
                "'whoosh_index_dir=' keyword argument not specified")

        #  Only put PL2 in for now (for more, add the model parameter to the constructor to specify!)
        self.scoring_model_identifier = 1
        self.scoring_model = scoring.PL2(c=10.0)

        try:
            self.doc_index = open_dir(self.whoosh_index_dir)
            self.reader = self.doc_index.reader()
            self.parser = QueryParser(
                'content',
                self.doc_index.schema)  # By default, we use AND grouping.
            # Use the grouping parameter and specify whoosh.qparser.OrGroup, etc...

            #  Objects required for document snippet generation
            self.analyzer = self.doc_index.schema[
                self.parser.fieldname].analyzer
            self.fragmenter = ContextFragmenter(maxchars=200, surround=40)
            self.formatter = HtmlFormatter()
        except EmptyIndexError:
            message = "Could not open Whoosh index at '{0}'".format(
                self.whoosh_index_dir)
            raise EngineConnectionException(self.name, message)
        except OSError:
            message = "Could not open Whoosh index at '{0}' - directory does not exist".format(
                self.whoosh_index_dir)
            raise EngineConnectionException(self.name, message)

        self.use_cache = use_cache
        if self.use_cache:
            self.cache = RedisConn(host=cache_host, port=cache_port)
            self.cache.connect()

            self.page_cache_forward_look = 40  # How many additional pages to cache when required.
            self.page_cache_when = 4  # When the user is x pages away from the end of the page cache, cache more pages.

            self.page_cache_controller = PageCacheController(
                cache_host=self.cache.host,
                cache_port=self.cache.port,
                whoosh_index=self.doc_index,
                scoring_model_identifier=self.scoring_model_identifier,
                parser=self.parser,
                analyzer=self.analyzer,
                fragmenter=self.fragmenter,
                formatter=self.formatter,
                cache_forward_look=self.page_cache_forward_look)

    def _search(self, query):
        """
        The concrete implementation of the Engine's search interface method, search().
        """
        self.__parse_query_terms(query)

        page_cache_key = get_cache_key(
            model_identifier=self.scoring_model_identifier,
            fieldname=self.parser.fieldname,
            term=query.terms,
            key_type=2,
            page=query.skip)

        if self.use_cache and self.cache.exists(
                page_cache_key
        ):  # If true, we have a page cached - so use this!
            page_results = self.cache.get(page_cache_key)
            highest_cached_page = self.__get_highest_cached_page(query)

            if highest_cached_page - query.skip < self.page_cache_when:  # Do we need to cache some more pages?
                self.__add_to_page_cacher((highest_cached_page + 1), query,
                                          page_results)

            return parse_response(reader=self.reader,
                                  fieldname=self.parser.fieldname,
                                  analyzer=self.analyzer,
                                  fragmenter=self.fragmenter,
                                  formatter=self.formatter,
                                  query=query,
                                  results=page_results,
                                  results_are_page=True)
        else:  # No page is cached, so we get the results for that page - and no doubt cache some more pages.
            return self._request(query)

    def _request(self, query):
        """
        Services a request that does not have a page cached.
        Returns an ifind Response object using information from either the Redis cache or Whoosh.
        """
        query_cache_key = get_cache_key(
            model_identifier=self.scoring_model_identifier,
            fieldname=self.parser.fieldname,
            term=query.terms,
            key_type=1)

        if self.use_cache and self.cache.exists(query_cache_key):
            sorted_results = self.cache.get(query_cache_key)
        else:
            with self.doc_index.searcher(
                    weighting=self.scoring_model) as searcher:
                doc_scores = {}

                if isinstance(query.parsed_terms, unicode):
                    doc_term_scores = self.__get_doc_term_scores(
                        searcher, query.parsed_terms)
                    self.__update_scores(doc_scores, doc_term_scores)
                else:
                    try:
                        for term in query.parsed_terms:
                            doc_term_scores = self.__get_doc_term_scores(
                                searcher, term.text)
                            self.__update_scores(doc_scores, doc_term_scores)
                    except NotImplementedError:
                        pass

            sorted_results = sorted(doc_scores.iteritems(),
                                    key=itemgetter(1),
                                    reverse=True)
            #sorted_results = sorted(set(doc_scores.iteritems()), key=itemgetter(1), reverse=True)

        # This block of code checks if additional page caching is required.
        # This will arise when no pages for the given query are cached, or the user is close to reaching the end of
        # the cached page collection for the given query.
        if self.use_cache:
            self.cache.store(query_cache_key, sorted_results)
            highest_cached_page = self.__get_highest_cached_page(query)

            if highest_cached_page == -1:  # Cache pages from page 1.
                self.__add_to_page_cacher(1, query, sorted_results)
            elif highest_cached_page - query.skip < self.page_cache_when:  # Start caching from page x
                self.__add_to_page_cacher((highest_cached_page + 1), query,
                                          sorted_results)

        return parse_response(reader=self.reader,
                              fieldname=self.parser.fieldname,
                              analyzer=self.analyzer,
                              fragmenter=self.fragmenter,
                              formatter=self.formatter,
                              query=query,
                              results=sorted_results)

    def __parse_query_terms(self, query):
        """
        Parses the query terms provided.
        Creates a Whoosh compound query type in query.parsed_terms if more than one term is specified.
        If only a single term is specified, a unicode string instance is used instead.
        """
        def tidy_terms(query):
            """
            Nested function to remove unwanted query terms (e.g. AND, OR, NOT) from the query.
            Also tidies the query by removing redundant whitespace and newline characters.
            """
            ignore = [
                'and', 'or', 'not', 'in', 'the', 'a', 'to'
            ]  # Terms to be ignored. These are not included in the tidied querystring.
            # Ensure the terms in the list are all lowercase!

            terms = query.terms
            terms = terms.lower()
            terms = terms.strip()
            terms = terms.split()

            query.terms = ""

            for term in terms:
                if term not in ignore:
                    query.terms = "{0} {1}".format(query.terms, term)

            query.terms = query.terms.strip()
            query.terms = unicode(query.terms)

        if not query.top:
            raise QueryParamException(
                self.name,
                "Total number of results (query.top) not specified.")

        if query.top < 1:
            raise QueryParamException(
                self.name, "Page length (query.top) must be at least 1.")

        tidy_terms(query)

        if len(query.terms.split()) == 1:
            query.parsed_terms = unicode(query.terms)
        else:
            query.parsed_terms = self.parser.parse(query.terms)

        query.terms = query.terms.strip()

    def __get_doc_term_scores(self, searcher, term):
        """
        Returns a dictionary object comprised of Whoosh document IDs for keys, and scores as values.
        The scores correspond to how relevant the given document is to the given term.
        Parameter term should be a unicode string. The Whoosh searcher instance should be provided as parameter searcher.
        """
        doc_term_scores = {}
        term_cache_key = get_cache_key(
            model_identifier=self.scoring_model_identifier,
            fieldname=self.parser.fieldname,
            term=term,
            key_type=0)

        if self.use_cache and self.cache.exists(term_cache_key):
            return self.cache.get(term_cache_key)  # That was simple!
        else:
            try:
                postings = searcher.postings(self.parser.fieldname, term)

                for i in postings.all_ids():  # NOTE: iterating over every posting here takes too much time.
                    doc_term_scores[i] = postings.score()

            except TermNotFound:  # If the term is not found in the inverted index, do nada.
                pass

        if self.use_cache:  # If caching is enabled, cache the results. If we are here, we need to cache them!
            self.cache.store(term_cache_key, doc_term_scores)

        return doc_term_scores

    def __update_scores(self, doc_scores, doc_term_scores):
        """
        Updates the doc_scores dictionary with the rankings from doc_term_scores.
        The end result is doc_scores will have a cumulative total for each document for each term.
        """
        for i in doc_term_scores:
            if i in doc_scores:
                doc_scores[i] = doc_scores[i] + doc_term_scores[i]
            else:
                doc_scores[i] = doc_term_scores[i]

    def __get_highest_cached_page(self, query):
        """
        For a given query, returns the highest cached page number.
        For example, if pages 1-10 for a given query are cached, 10 would be returned.

        If no pages are cached for the given query, -1 is returned.
        This method assumes that pages are cached in a linear fashion - there are no gaps where pages are not cached.

        If caching is not enabled, -1 is always returned.
        """
        if not self.use_cache:
            return -1

        wildcard_key = '{0}:page:*:{1}:{2}'.format(
            self.scoring_model_identifier, self.parser.fieldname, query.terms)
        matching_keys = self.cache.keys(wildcard_key)
        highest_page = -1

        if len(matching_keys) == 0:
            return -1
        else:
            for key in matching_keys:
                key = key.split(':')
                page = int(key[2])

                if page > highest_page:
                    highest_page = page

        return highest_page

    def __add_to_page_cacher(self, start_page, query, results):
        """
        Adds a page to the queue in the caching thread.
        If the thread is not started (i.e. it died because it got old), a new thread is started.
        """
        if not self.cache:
            return

        if not self.page_cache_controller.is_alive():
            try:
                self.page_cache_controller.start()
            except RuntimeError:
                self.page_cache_controller = PageCacheController(
                    cache_host=self.cache.host,
                    cache_port=self.cache.port,
                    whoosh_index=self.doc_index,
                    scoring_model_identifier=self.scoring_model_identifier,
                    parser=self.parser,
                    analyzer=self.analyzer,
                    fragmenter=self.fragmenter,
                    formatter=self.formatter,
                    cache_forward_look=self.page_cache_forward_look)
                self.page_cache_controller.start()

        # By this point we can be certain that the page caching thread is alive - so we can now add to its queue.
        self.page_cache_controller.add(start_page, query, results)
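
The helper get_cache_key is not shown in any of these examples. Only the page-key layout can be inferred, from the wildcard key built in __get_highest_cached_page ('{0}:page:*:{1}:{2}', with the page number in the third slot); the layouts sketched below for key_type 0 (term) and key_type 1 (query) are assumptions for illustration.

def get_cache_key(model_identifier, fieldname, term, key_type=0, page=None):
    # Hedged reconstruction. Only the key_type=2 (page) layout is implied by
    # the wildcard in __get_highest_cached_page(); the term and query layouts
    # below are assumptions.
    if key_type == 2:
        return '{0}:page:{1}:{2}:{3}'.format(model_identifier, page, fieldname, term)
    elif key_type == 1:
        return '{0}:query:{1}:{2}'.format(model_identifier, fieldname, term)

    return '{0}:term:{1}:{2}'.format(model_identifier, fieldname, term)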
Example #8
def get_redis_connection():
    rc = RedisConn(host=REDIS_STR, port=REDIS_PORT, password=REDIS_PW)
    rc.connect()
    return rc
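
A brief usage sketch, assuming REDIS_STR, REDIS_PORT and REDIS_PW are module-level settings; exists(), store() and get() are the RedisConn methods used throughout the other examples, and the key and value here are made up.

rc = get_redis_connection()

if not rc.exists('example:key'):        # Hypothetical key.
    rc.store('example:key', {10: 1.5})  # Cache a made-up result object.

cached = rc.get('example:key')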
Example #9
result_file = os.path.join(work_dir, 'res.redis_pl2_or.435')

#  Open the index and necessary Whoosh ancillaries
ix = open_dir(whoosh_index_dir)
reader = ix.reader()
query_parser = QueryParser(field_name, schema=ix.schema, group=grouping)
print ix.schema

#  Open the input and output files for reading and writing
input_file = open(query_file, 'r')
output_file = open(result_file, 'w')


# Create a Redis connection

rc = RedisConn()
rc.connect()


for line in input_file:
	line = line.strip()  # Remove the endline character
	line = line.partition(' ')
	
	query_num = line[0]
	query_string = line[2].strip()
	
	whoosh_query = get_query(query_parser, query_string)
	
	with ix.searcher(weighting=scoring_function) as searcher:
		doc_scores = {}
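
The example is truncated at this point. Below is a hedged sketch of how the loop body might continue, following the per-term postings pattern from the engine classes above (it ignores the whoosh_query built earlier). It assumes the script's elided imports include TermNotFound (whoosh.reading) and itemgetter (operator); the 'docid' stored field and the TREC-style run line are guesses inferred from the result filename.

	with ix.searcher(weighting=scoring_function) as searcher:
		doc_scores = {}

		# Accumulate a score per document for each query term via the postings.
		for term in query_string.split():
			try:
				postings = searcher.postings(field_name, unicode(term))
				for i in postings.all_ids():
					doc_scores[i] = doc_scores.get(i, 0) + postings.score()
			except TermNotFound:
				pass

		# Rank by descending score and write TREC-style run lines (format assumed).
		ranked = sorted(doc_scores.iteritems(), key=itemgetter(1), reverse=True)
		for rank, (docnum, score) in enumerate(ranked):
			docid = searcher.stored_fields(docnum).get('docid', docnum)
			output_file.write('{0} Q0 {1} {2} {3} redis_pl2_or\n'.format(query_num, docid, rank + 1, score))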
		
Example #10
class PageCacheController(Thread):
    """
    A class implemented by dmax that acts as a controller for caching individual pages.
    Launched as a separate thread, it contains a thread-safe Queue which is populated with page caching jobs to run.
    """

    def __init__(
        self,
        cache_host,
        cache_port,
        whoosh_index,
        scoring_model_identifier,
        parser,
        analyzer,
        fragmenter,
        formatter,
        cache_forward_look,
    ):
        """
        Constructor for an instance of the PageCacheController.
        """
        super(PageCacheController, self).__init__()
        self.__queue = Queue.Queue()
        self.__ticks_before_death = 60  # How many ticks the loop should do before dying off.

        #  Whoosh setup
        self.__reader = whoosh_index.reader()
        self.__analyzer = analyzer
        self.__fragmenter = fragmenter
        self.__formatter = formatter

        #  Cache setup
        self.__cache = RedisConn(host=cache_host, port=cache_port)
        self.__cache.connect()

        # Misc.
        self.__scoring_model_identifier = scoring_model_identifier
        self.__parser = parser
        self.__cache_forward_look = cache_forward_look

    def run(self):
        """
        The main body of the thread's execution; called when the thread is started.
        """
        ticks = 0  # After a certain number of ticks of inactivity, the thread will commit suicide.

        while True:
            if ticks == self.__ticks_before_death:
                print "Page cacher has timed out; dying. Bleugh!"
                break

            try:
                item = self.__queue.get(timeout=1)
                ticks = 0

                start_page = item[0]
                query = item[1]
                results = item[2]

                for curr_page in range(start_page, (start_page + self.__cache_forward_look)):
                    query.skip = curr_page
                    page_results = get_page(query, results)  # Obtain results from the queue, not the cache.
                    # Even though the caching starts before this, there is
                    # no guarantee that the cache will be ready to service it!

                    if curr_page < page_results[1]:  # If this is not true, the page looked at is greater than
                        # the highest page of results; so we do not cache.
                        page_cache_key = get_cache_key(
                            model_identifier=self.__scoring_model_identifier,
                            fieldname=self.__parser.fieldname,
                            term=query.terms,
                            key_type=2,
                            page=curr_page,
                        )

                        self.__cache.store(page_cache_key, page_results)  # Store the page.
                    else:
                        break

                print "  >>> PAGE_CACHE: Pages {0} to {1} cached for '{2}'".format(start_page, curr_page, query.terms)
            except Queue.Empty:  # This is reached when we try to look for an item in the queue and find nothing.
                # So we're one tick closer to death...
                ticks = ticks + 1
                continue

    def add(self, start_page, query, results):
        """
        Adds an item to the queue for processing.
        """
        self.__queue.put((start_page, query, results))
Example #11
class WhooshTrecNewsRedis(Engine):
    """
    A revised Whoosh ifind engine.
    Implemented by dmax. Uses a new way of poking the postings file by @leifos, and also some tasty Redis caching.
    """

    def __init__(self, whoosh_index_dir="", use_cache=True, cache_host="localhost", cache_port=6379, **kwargs):
        """
        Constructor for the engine.
        """
        Engine.__init__(self, **kwargs)

        self.whoosh_index_dir = whoosh_index_dir
        if not self.whoosh_index_dir:
            raise EngineConnectionException(self.name, "'whoosh_index_dir=' keyword argument not specified")

        #  Only put PL2 in for now (for more, add the model parameter to the constructor to specify!)
        self.scoring_model_identifier = 1
        self.scoring_model = scoring.PL2(c=10.0)

        try:
            self.doc_index = open_dir(self.whoosh_index_dir)
            self.reader = self.doc_index.reader()

            self.parser = QueryParser("content", self.doc_index.schema)  # By default, we use AND grouping.
            # Use the grouping parameter and specify whoosh.qparser.OrGroup, etc...

            #  Objects required for document snippet generation
            self.analyzer = self.doc_index.schema[self.parser.fieldname].analyzer
            self.fragmenter = ContextFragmenter(maxchars=200, surround=40)
            self.formatter = HtmlFormatter()
        except EmptyIndexError:
            message = "Could not open Whoosh index at '{0}'".format(self.whoosh_index_dir)
            raise EngineConnectionException(self.name, message)
        except OSError:
            message = "Could not open Whoosh index at '{0}' - directory does not exist".format(self.whoosh_index_dir)
            raise EngineConnectionException(self.name, message)

        self.use_cache = use_cache
        if self.use_cache:
            self.cache = RedisConn(host=cache_host, port=cache_port)
            self.cache.connect()

            self.page_cache_forward_look = 40  # How many additional pages to cache when required.
            self.page_cache_when = 4  # When the user is x pages away from the end of the page cache, cache more pages.

            self.page_cache_controller = PageCacheController(
                cache_host=self.cache.host,
                cache_port=self.cache.port,
                whoosh_index=self.doc_index,
                scoring_model_identifier=self.scoring_model_identifier,
                parser=self.parser,
                analyzer=self.analyzer,
                fragmenter=self.fragmenter,
                formatter=self.formatter,
                cache_forward_look=self.page_cache_forward_look,
            )

    def _search(self, query):
        """
        The concrete implementation of the Engine's search interface method, search().
        """
        self.__parse_query_terms(query)

        page_cache_key = get_cache_key(
            model_identifier=self.scoring_model_identifier,
            fieldname=self.parser.fieldname,
            term=query.terms,
            key_type=2,
            page=query.skip,
        )

        if self.use_cache and self.cache.exists(page_cache_key):  # If true, we have a page cached - so use this!
            page_results = self.cache.get(page_cache_key)
            highest_cached_page = self.__get_highest_cached_page(query)

            if highest_cached_page - query.skip < self.page_cache_when:  # Do we need to cache some more pages?
                self.__add_to_page_cacher((highest_cached_page + 1), query, page_results)

            return parse_response(
                reader=self.reader,
                fieldname=self.parser.fieldname,
                analyzer=self.analyzer,
                fragmenter=self.fragmenter,
                formatter=self.formatter,
                query=query,
                results=page_results,
                results_are_page=True,
            )
        else:  # No page is cached, so we get the results for that page - and no doubt cache some more pages.
            return self._request(query)

    def _request(self, query):
        """
        Services a request that does not have a page cached.
        Returns an ifind Response object using information from either the Redis cache or Whoosh.
        """
        query_cache_key = get_cache_key(
            model_identifier=self.scoring_model_identifier,
            fieldname=self.parser.fieldname,
            term=query.terms,
            key_type=1,
        )

        if self.use_cache and self.cache.exists(query_cache_key):
            sorted_results = self.cache.get(query_cache_key)
        else:
            with self.doc_index.searcher(weighting=self.scoring_model) as searcher:
                doc_scores = {}

                if isinstance(query.parsed_terms, unicode):
                    doc_term_scores = self.__get_doc_term_scores(searcher, query.parsed_terms)
                    self.__update_scores(doc_scores, doc_term_scores)
                else:
                    try:
                        for term in query.parsed_terms:
                            doc_term_scores = self.__get_doc_term_scores(searcher, term.text)
                            self.__update_scores(doc_scores, doc_term_scores)
                    except NotImplementedError:
                        pass

            sorted_results = sorted(doc_scores.iteritems(), key=itemgetter(1), reverse=True)

        # This block of code checks if additional page caching is required.
        # This will arise when no pages for the given query are cached, or the user is close to reaching the end of
        # the cached page collection for the given query.
        if self.use_cache:
            self.cache.store(query_cache_key, sorted_results)
            highest_cached_page = self.__get_highest_cached_page(query)

            if highest_cached_page == -1:  # Cache pages from page 1.
                self.__add_to_page_cacher(1, query, sorted_results)
            elif highest_cached_page - query.skip < self.page_cache_when:  # Start caching from page x
                self.__add_to_page_cacher((highest_cached_page + 1), query, sorted_results)

        return parse_response(
            reader=self.reader,
            fieldname=self.parser.fieldname,
            analyzer=self.analyzer,
            fragmenter=self.fragmenter,
            formatter=self.formatter,
            query=query,
            results=sorted_results,
        )

    def __parse_query_terms(self, query):
        """
        Parses the query terms provided.
        Creates a Whoosh compound query type in query.parsed_terms if more than one term is specified.
        If only a single term is specified, a unicode string instance is used instead.
        """

        def tidy_terms(query):
            """
            Nested function to remove unwanted query terms (e.g. AND, OR, NOT) from the query.
            Also tidies the query by removing redundant whitespace and newline characters.
            """
            ignore = [
                "and",
                "or",
                "not",
                "in",
                "the",
                "a",
                "to",
            ]  # Terms to be ignored. These are not included in the tidied querystring.
            # Ensure the terms in the list are all lowercase!

            terms = query.terms
            terms = terms.lower()
            terms = terms.strip()
            terms = terms.split()

            query.terms = ""

            for term in terms:
                if term not in ignore:
                    query.terms = "{0} {1}".format(query.terms, term)

            query.terms = query.terms.strip()
            query.terms = unicode(query.terms)

        if not query.top:
            raise QueryParamException(self.name, "Total number of results (query.top) not specified.")

        if query.top < 1:
            raise QueryParamException(self.name, "Page length (query.top) must be at least 1.")

        tidy_terms(query)

        if len(query.terms.split()) == 1:
            query.parsed_terms = unicode(query.terms)
        else:
            query.parsed_terms = self.parser.parse(query.terms)

        query.terms = query.terms.strip()

    def __get_doc_term_scores(self, searcher, term):
        """
        Returns a dictionary object comprised of Whoosh document IDs for keys, and scores as values.
        The scores correspond to how relevant the given document is to the given term, provided as parameter query.
        Parameter term should be a unicode string. The Whoosh searcher instance should be provided as parameter searcher.
        """
        doc_term_scores = {}
        term_cache_key = get_cache_key(
            model_identifier=self.scoring_model_identifier, fieldname=self.parser.fieldname, term=term, key_type=0
        )

        if self.use_cache and self.cache.exists(term_cache_key):
            return self.cache.get(term_cache_key)  # That was simple!
        else:
            try:
                postings = searcher.postings(self.parser.fieldname, term)

                for i in postings.all_ids():
                    doc_term_scores[i] = postings.score()

            except TermNotFound:  # If the term is not found in the inverted index, do nada.
                pass

        if self.use_cache:  # If caching is enabled, cache the results. If we are here, we need to cache them!
            self.cache.store(term_cache_key, doc_term_scores)

        return doc_term_scores

    def __update_scores(self, doc_scores, doc_term_scores):
        """
        Updates the doc_scores dictionary with the rankings from doc_term_scores.
        The end result is doc_scores will have a cumulative total for each document for each term.
        """
        for i in doc_term_scores:
            if i in doc_scores:
                doc_scores[i] = doc_scores[i] + doc_term_scores[i]
            else:
                doc_scores[i] = doc_term_scores[i]

    def __get_highest_cached_page(self, query):
        """
        For a given query, returns the highest cached page number.
        For example, if pages 1-10 for a given query are cached, 10 would be returned.

        If no pages are cached for the given query, -1 is returned.
        This method assumes that pages are cached in a linear fashion - there are no gaps where pages are not cached.

        If caching is not enabled, -1 is always returned.
        """
        if not self.use_cache:
            return -1

        wildcard_key = "{0}:page:*:{1}:{2}".format(self.scoring_model_identifier, self.parser.fieldname, query.terms)
        matching_keys = self.cache.keys(wildcard_key)
        highest_page = -1

        if len(matching_keys) == 0:
            return -1
        else:
            for key in matching_keys:
                key = key.split(":")
                page = int(key[2])

                if page > highest_page:
                    highest_page = page

        return highest_page

    def __add_to_page_cacher(self, start_page, query, results):
        """
        Adds a page to the queue in the caching thread.
        If the thread is not started (i.e. it died because it got old), a new thread is started.
        """
        if not self.cache:
            return

        if not self.page_cache_controller.is_alive():
            try:
                self.page_cache_controller.start()
            except RuntimeError:
                self.page_cache_controller = PageCacheController(
                    cache_host=self.cache.host,
                    cache_port=self.cache.port,
                    whoosh_index=self.doc_index,
                    scoring_model_identifier=self.scoring_model_identifier,
                    parser=self.parser,
                    analyzer=self.analyzer,
                    fragmenter=self.fragmenter,
                    formatter=self.formatter,
                    cache_forward_look=self.page_cache_forward_look,
                )
                self.page_cache_controller.start()

        # By this point we can be certain that the page caching thread is alive - so we can now add to its queue.
        self.page_cache_controller.add(start_page, query, results)
Example #12
class WhooshTrecNewsRedis(Engine):
    """
    A revised Whoosh ifind engine.
    Implemented by dmax. Uses a new way of poking the postings file by @leifos, and also some tasty Redis caching.
    """
    def __init__(self, whoosh_index_dir='', stopwords_file='', cache_host='localhost', cache_port=6379, **kwargs):
        Engine.__init__(self, **kwargs)

        self.whoosh_index_dir = whoosh_index_dir
        if not self.whoosh_index_dir:
            raise EngineConnectionException(self.name, "'whoosh_index_dir=' keyword argument not specified")

        self.stopwords_file = stopwords_file
        if self.stopwords_file:
            self.stopwords = ListReader(self.stopwords_file)  # Open the stopwords file, read into a ListReader
        else:
            raise EngineConnectionException(self.name, "'stopwords_file=' keyword argument not specified")

        self.scoring_model_identifier = 1
        self.scoring_model = scoring.PL2(c=10.0)
        
        self.__verbose = False

        try:
            self.doc_index = open_dir(self.whoosh_index_dir)
            self.reader = self.doc_index.reader()
            self.parser = QueryParser('content', self.doc_index.schema)  # By default, we use AND grouping.
                                                                         # Use the grouping parameter and specify whoosh.qparser.OrGroup, etc...

            #  Objects required for document snippet generation
            self.analyzer = self.doc_index.schema[self.parser.fieldname].analyzer
            self.fragmenter = ContextFragmenter(maxchars=200, surround=40)
            self.formatter = HtmlFormatter()
        except EmptyIndexError:
            message = "Could not open Whoosh index at '{0}'".format(self.whoosh_index_dir)
            raise EngineConnectionException(self.name, message)
        except OSError:
            message = "Could not open Whoosh index at '{0}' - directory does not exist".format(self.whoosh_index_dir)
            raise EngineConnectionException(self.name, message)

        # Attempt to connect to the specified Redis cache.
        self.cache = RedisConn(host=cache_host, port=cache_port)
        self.cache.connect()

    def _search(self, query):
        """
        The concrete search method.
        """
        self.__parse_query_terms(query)

        with self.doc_index.searcher(weighting=self.scoring_model) as searcher:
            doc_scores = {}

            if isinstance(query.parsed_terms, unicode):
                t = time.time()
                doc_term_scores = self.__get_doc_term_scores(searcher, query.parsed_terms)
                t = time.time() - t
                
                if self.__verbose:
                    print "  > Retrieve results for '{0}': {1}".format(query.parsed_terms, t)

                t = time.time()
                self.__update_scores(doc_scores, doc_term_scores)
                t = time.time() - t
                
                if self.__verbose:
                    print "  >> Time to update scores: {0}".format(t)
            else:
                try:
                    for term in query.parsed_terms:
                        t = time.time()
                        doc_term_scores = self.__get_doc_term_scores(searcher, term.text)
                        t = time.time() - t
                        
                        if self.__verbose:
                            print "  > Retrieve results for '{0}': {1}".format(term, t)

                        t = time.time()
                        self.__update_scores(doc_scores, doc_term_scores)
                        t = time.time() - t
                        
                        if self.__verbose:
                            print "  >> Time to update scores: {0}".format(t)
                except NotImplementedError:
                    pass

        t = time.time()
        sorted_results = sorted(doc_scores.iteritems(), key=itemgetter(1), reverse=True)
        t = time.time() - t
        
        if self.__verbose:
            print "  > Time to sort results: {0}".format(t)

        return parse_response(reader=self.reader,
                              fieldname=self.parser.fieldname,
                              analyzer=self.analyzer,
                              fragmenter=self.fragmenter,
                              formatter=self.formatter,
                              query=query,
                              results=sorted_results)

    def __get_doc_term_scores(self, searcher, term):
        """
        Returns a dictionary object comprised of Whoosh document IDs for keys, and scores as values.
        The scores correspond to how relevant the given document is to the given term.
        Parameter term should be a unicode string. The Whoosh searcher instance should be provided as parameter searcher.
        """
        doc_term_scores = {}
        term_cache_key = get_cache_key(model_identifier=self.scoring_model_identifier,
                                       fieldname=self.parser.fieldname,
                                       term=term)

        if self.cache.exists(term_cache_key):
            if self.__verbose:
                print "  >> Results are cached"
            return self.cache.get(term_cache_key)  # Easy peasy, return the object from the cache.
        else:
            if self.__verbose:
                print "  >> Results not cached"
            try:
                postings = searcher.postings(self.parser.fieldname, term)

                for i in postings.all_ids():  # THIS IS SLOW, LEIF HAS AN ALGORITHM TO SPEED THIS UP?
                    doc_term_scores[i] = postings.score()
            except TermNotFound:
                pass

            self.cache.store(term_cache_key, doc_term_scores)

        return doc_term_scores

    def __parse_query_terms(self, query):
        """
        Using the stopwords list provided, parses the query object and prepares it for being sent to the engine.
        """

        if not query.top:
            raise QueryParamException(self.name, "Total number of results (query.top) not specified.")

        if query.top < 1:
            raise QueryParamException(self.name, "Page length (query.top) must be at least 1.")

        # Tidy up the querystring. Split it into individual terms so we can process them.
        terms = query.terms
        terms = terms.lower()
        terms = terms.strip()
        terms = terms.split()  # Chop!

        query.terms = ""  # Reset the query's terms string to a blank string - we will rebuild it.

        for term in terms:
            if term not in self.stopwords:
                query.terms = "{0} {1}".format(query.terms, term)

        query.terms = query.terms.strip()
        query.terms = unicode(query.terms)

        if len(query.terms.split()) == 1:
            query.parsed_terms = unicode(query.terms)
        else:
            query.parsed_terms = self.parser.parse(query.terms)

    def __update_scores(self, doc_scores, doc_term_scores):
        """
        Updates the doc_scores dictionary with the rankings from doc_term_scores.
        It's a cumulative function - meaning that doc_scores will have a cumulative total for each document for each term.
        """
        for i in doc_term_scores:
            if i in doc_scores:
                doc_scores[i] = doc_scores[i] + doc_term_scores[i]
            else:
                doc_scores[i] = doc_term_scores[i]
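
To make the cumulative behaviour concrete, here is a standalone sketch of the same accumulation with made-up scores:

def update_scores(doc_scores, doc_term_scores):
    # Standalone copy of __update_scores, for illustration only.
    for i in doc_term_scores:
        if i in doc_scores:
            doc_scores[i] = doc_scores[i] + doc_term_scores[i]
        else:
            doc_scores[i] = doc_term_scores[i]

doc_scores = {}
update_scores(doc_scores, {10: 1.5, 11: 0.7})  # Postings scores for the first term.
update_scores(doc_scores, {10: 0.5})           # The second term only matches doc 10.
print doc_scores  # {10: 2.0, 11: 0.7}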