Exemplo n.º 1
0
    def __init__(self, whoosh_index_dir='', stopwords_file='', cache_host='localhost', cache_port=6379, **kwargs):
        """
        Validate the keyword arguments, load the stopwords, open the Whoosh
        index and connect to the Redis cache.

        Args:
            whoosh_index_dir: directory of the Whoosh index (must be non-empty).
            stopwords_file: stopwords file handed to ListReader (must be non-empty).
            cache_host: host given to RedisConn.
            cache_port: port given to RedisConn.
            **kwargs: forwarded unchanged to Engine.__init__.

        Raises:
            EngineConnectionException: when a required keyword argument is
                missing, or when opening the index raises EmptyIndexError
                or OSError.
        """
        Engine.__init__(self, **kwargs)

        # Both keyword arguments below are mandatory: bail out early if absent.
        self.whoosh_index_dir = whoosh_index_dir
        if not self.whoosh_index_dir:
            reason = "'whoosh_index_dir=' keyword argument not specified"
            raise EngineConnectionException(self.name, reason)

        self.stopwords_file = stopwords_file
        if not self.stopwords_file:
            reason = "'stopwords_file=' keyword argument not specified"
            raise EngineConnectionException(self.name, reason)
        # Open the stopwords file, read into a ListReader
        self.stopwords = ListReader(self.stopwords_file)

        # Fixed scoring configuration: PL2 with c=10.0, registered under identifier 1.
        self.scoring_model_identifier = 1
        self.scoring_model = scoring.PL2(c=10.0)

        self.__verbose = False

        try:
            index = open_dir(self.whoosh_index_dir)
            self.doc_index = index
            self.reader = index.reader()

            # By default, we use AND grouping. Use the grouping parameter and
            # specify whoosh.qparser.OrGroup, etc. to change that.
            self.parser = QueryParser('content', index.schema)

            # Objects required for document snippet generation
            parsed_field = index.schema[self.parser.fieldname]
            self.analyzer = parsed_field.analyzer
            self.fragmenter = ContextFragmenter(maxchars=200, surround=40)
            self.formatter = HtmlFormatter()
        except EmptyIndexError:
            reason = "Could not open Whoosh index at '{0}'".format(self.whoosh_index_dir)
            raise EngineConnectionException(self.name, reason)
        except OSError:
            reason = "Could not open Whoosh index at '{0}' - directory does not exist".format(self.whoosh_index_dir)
            raise EngineConnectionException(self.name, reason)

        # Attempt to connect to the specified Redis cache.
        redis_cache = RedisConn(host=cache_host, port=cache_port)
        self.cache = redis_cache
        redis_cache.connect()
Exemplo n.º 2
0
    def __init__(self, cache_host, cache_port, whoosh_index,
                 scoring_model_identifier, parser, analyzer, fragmenter,
                 formatter, cache_forward_look):
        """
        Create a PageCacheController.

        Args:
            cache_host: host given to RedisConn.
            cache_port: port given to RedisConn.
            whoosh_index: index object; its reader() is obtained here.
            scoring_model_identifier: stored as-is for later use.
            parser: stored as-is for later use.
            analyzer: stored as-is for later use.
            fragmenter: stored as-is for later use.
            formatter: stored as-is for later use.
            cache_forward_look: stored as-is for later use.
        """
        super(PageCacheController, self).__init__()

        self.__queue = Queue.Queue()
        # How many ticks the loop should do before dying off.
        self.__ticks_before_death = 60

        # Whoosh objects: a reader taken from the index, plus the helpers supplied by the caller.
        index_reader = whoosh_index.reader()
        self.__reader = index_reader
        self.__analyzer = analyzer
        self.__fragmenter = fragmenter
        self.__formatter = formatter

        # This controller opens its own Redis connection from the given host and port.
        redis_cache = RedisConn(host=cache_host, port=cache_port)
        self.__cache = redis_cache
        redis_cache.connect()

        # Remaining settings are kept verbatim.
        self.__scoring_model_identifier = scoring_model_identifier
        self.__parser = parser
        self.__cache_forward_look = cache_forward_look
Exemplo n.º 3
0
    def __init__(self,
                 whoosh_index_dir='',
                 use_cache=True,
                 cache_host='localhost',
                 cache_port=6379,
                 **kwargs):
        """
        Open the Whoosh index and, when caching is enabled, connect to Redis
        and create the page cache controller.

        Args:
            whoosh_index_dir: directory of the Whoosh index (must be non-empty).
            use_cache: when true, the Redis cache and the page cache
                controller are set up; otherwise that step is skipped.
            cache_host: host given to RedisConn.
            cache_port: port given to RedisConn.
            **kwargs: forwarded unchanged to Engine.__init__.

        Raises:
            EngineConnectionException: when whoosh_index_dir is missing, or
                when opening the index raises EmptyIndexError or OSError.
        """
        Engine.__init__(self, **kwargs)

        self.whoosh_index_dir = whoosh_index_dir
        if not self.whoosh_index_dir:
            reason = "'whoosh_index_dir=' keyword argument not specified"
            raise EngineConnectionException(self.name, reason)

        # Only put PL2 in for now (for more, add the model parameter to the constructor to specify!)
        self.scoring_model_identifier = 1
        self.scoring_model = scoring.PL2(c=10.0)

        try:
            index = open_dir(self.whoosh_index_dir)
            self.doc_index = index
            self.reader = index.reader()

            # By default, we use AND grouping. Use the grouping parameter and
            # specify whoosh.qparser.OrGroup, etc. to change that.
            self.parser = QueryParser('content', index.schema)

            # Objects required for document snippet generation
            parsed_field = index.schema[self.parser.fieldname]
            self.analyzer = parsed_field.analyzer
            self.fragmenter = ContextFragmenter(maxchars=200, surround=40)
            self.formatter = HtmlFormatter()
        except EmptyIndexError:
            reason = "Could not open Whoosh index at '{0}'".format(self.whoosh_index_dir)
            raise EngineConnectionException(self.name, reason)
        except OSError:
            reason = "Could not open Whoosh index at '{0}' - directory does not exist".format(self.whoosh_index_dir)
            raise EngineConnectionException(self.name, reason)

        self.use_cache = use_cache
        if not self.use_cache:
            # Caching disabled: none of the cache attributes below are created.
            return

        self.cache = RedisConn(host=cache_host, port=cache_port)
        self.cache.connect()

        # How many additional pages to cache when required.
        self.page_cache_forward_look = 40
        # When the user is x pages away from the end of the page cache, cache more pages.
        self.page_cache_when = 4

        # The controller receives the connection details and the Whoosh helpers built above.
        self.page_cache_controller = PageCacheController(
            cache_host=self.cache.host,
            cache_port=self.cache.port,
            whoosh_index=self.doc_index,
            scoring_model_identifier=self.scoring_model_identifier,
            parser=self.parser,
            analyzer=self.analyzer,
            fragmenter=self.fragmenter,
            formatter=self.formatter,
            cache_forward_look=self.page_cache_forward_look)
Exemplo n.º 4
0
def get_redis_connection():
    """
    Build a RedisConn from the module-level REDIS_STR, REDIS_PORT and
    REDIS_PW settings, call connect() on it and return it.
    """
    connection = RedisConn(
        host=REDIS_STR,
        port=REDIS_PORT,
        password=REDIS_PW)
    connection.connect()
    return connection
Exemplo n.º 5
0
# Results are written to a fixed file name inside work_dir.
result_file = os.path.join(work_dir, 'res.redis_pl2_or.435')

#  Open the Whoosh index, then build a reader and a query parser from it.
#  whoosh_index_dir, field_name and grouping are defined outside this part of the script.
ix = open_dir(whoosh_index_dir)
reader = ix.reader()
query_parser = QueryParser(field_name, schema=ix.schema, group=grouping)
print ix.schema  # Show the index schema on stdout (Python 2 print statement)

#  Open the query file for reading and the result file for writing
#  (mode 'w' truncates an existing result file).
#  NOTE(review): neither handle is closed in the visible part of the script - confirm they are closed later.
input_file = open(query_file, 'r')
output_file = open(result_file, 'w')


# Create a Redis connection with RedisConn's default arguments, then connect.

rc = RedisConn()
rc.connect()


for line in input_file:
	line = line.strip()  # Remove the endline character
	line = line.partition(' ')
	
	query_num = line[0]
	query_string = line[2].strip()
	
	whoosh_query = get_query(query_parser, query_string)
	
	with ix.searcher(weighting=scoring_function) as searcher:
		doc_scores = {}