def _apply_cache(self, indexpath, cachepath, cache_id):
    """Apply the cached items of one cache to the index at ``indexpath``.

    Opens an ``IndexerConnection`` on ``indexpath``, attaches an
    ``XapianCacheManager`` built from ``cachepath`` / ``cache_id`` and calls
    ``apply_cached_items()`` on the connection.

    Both the connection and the cache manager created here are closed
    before returning, including when applying the cache raises.
    """
    idx = IndexerConnection(indexpath)
    cm = None
    try:
        cm = XapianCacheManager(cachepath, id=cache_id)
        idx.set_cache_manager(cm)
        idx.apply_cached_items()
    finally:
        # Close the connection first and the cache manager afterwards; the
        # manager is local to this helper, so nobody else can close it.
        try:
            idx.close()
        finally:
            if cm is not None:
                cm.close()
def _create_index(self, indexpath):
    """Build the five-document fixture index at ``indexpath``.

    Registers the free-text field ``"field"`` (English) and stores the
    documents "1".."5".  Each document carries two pairs, ``term_a`` with
    the values 1..5 ascending and ``term_b`` with the values 5..1
    descending; the pairs are handed to ``self._create_processed_doc``
    unchanged.

    The connection is flushed after all documents were written and is
    always closed, even if one of the indexing steps raises.
    """
    documents = [
        ("1", [("term_a", 1), ("term_b", 5)]),
        ("2", [("term_a", 2), ("term_b", 4)]),
        ("3", [("term_a", 3), ("term_b", 3)]),
        ("4", [("term_a", 4), ("term_b", 2)]),
        ("5", [("term_a", 5), ("term_b", 1)]),
    ]

    iconn = IndexerConnection(indexpath)
    try:
        iconn.add_field_action("field", FieldActions.INDEX_FREETEXT,
                               language="en")
        for docid, terms in documents:
            pdoc = self._create_processed_doc(iconn, docid, terms)
            # NOTE(review): the id is passed as a string here, while
            # test_multiple_cache passes xapid=int(docid) -- confirm that
            # the string form is intended.
            iconn.replace(pdoc, xapid=docid)
        iconn.flush()
    finally:
        # Do not leave the index open when a step above fails.
        iconn.close()
def __init__(self, dirname):
    """Open the index stored in ``dirname`` and register its field actions.

    ``dirname`` is made absolute and kept in ``self.dbPath``; the
    ``IndexerConnection`` opened on it is kept in ``self.dbconn`` and a new
    ``threading.Lock`` in ``self.lock``.
    """
    index_dir = os.path.abspath(dirname)
    self.dbPath = index_dir

    connection = IndexerConnection(index_dir)
    self.dbconn = connection

    # Free-text fields: 'title' gets weight 5, 'text' additionally enables
    # spelling data and uses the STOPWORDS list.
    connection.add_field_action(
        'title', FieldActions.INDEX_FREETEXT, weight=5, language='en')
    connection.add_field_action(
        'text', FieldActions.INDEX_FREETEXT,
        language='en', spell=True, stop=STOPWORDS)

    # The 'citecnt' actions below are disabled:
    #self.dbconn.add_field_action('citecnt', FieldActions.FACET, type='float')
    #self.dbconn.add_field_action('citecnt', FieldActions.WEIGHT)

    self.lock = threading.Lock()

    # Every field named in FIELD_NUM has its content stored.
    for field_name in FIELD_NUM.keys():
        connection.add_field_action(field_name, FieldActions.STORE_CONTENT)
def get_connection(path, indexer=False, callback=None):
    """Get a connection to the database.

    This function reuses already existing connections.

    It is a generator that yields a single connection and closes that
    connection when the generator finishes.
    NOTE(review): presumably wrapped with ``contextlib.contextmanager``
    where it is defined -- confirm.

    path: handed to ``IndexerConnection`` / ``SearchConnection``.
    indexer: when true the module-wide indexer connection is used,
        otherwise the search connection registered for the current thread.
    callback: optional callable, invoked with the connection before it is
        yielded.

    Opening is attempted up to four times, sleeping 0.5 seconds after each
    ``xapian.DatabaseOpeningError`` / ``xapian.DatabaseLockError``.  When
    every attempt fails, ``None`` is passed to ``callback`` and yielded.
    """
    global _index_connection, _search_connections

    connection = None
    thread = None
    try:
        is_new = False
        attempts = 0
        while attempts <= 3:
            try:
                if indexer:
                    if _index_connection is None:
                        is_new = True
                        _index_connection = IndexerConnection(path)
                    connection = _index_connection
                else:
                    thread = get_current_thread()
                    if thread not in _search_connections:
                        is_new = True
                        connection = SearchConnection(path)
                        _search_connections[thread] = connection
                    else:
                        connection = _search_connections[thread]
            except (xapian.DatabaseOpeningError, xapian.DatabaseLockError):
                # The database is busy or not ready yet: wait and retry.
                time.sleep(0.5)
                attempts += 1
            else:
                break

        if callback:
            callback(connection)

        # A reused connection may be stale, a freshly opened one is not.
        if not is_new:
            connection.reopen()

        yield connection
    finally:
        if connection is not None:
            try:
                connection.close()
            finally:
                # NOTE(review): the indexer handle is reset even when a
                # search connection was closed (kept from the original
                # behaviour) -- confirm that this is intended.
                _index_connection = None
                if not indexer:
                    # The closed search connection must not stay cached,
                    # otherwise the next call from this thread would be
                    # handed a closed connection and reopen() it.
                    _search_connections.pop(thread, None)
def test_multiple_cache(self):
    """Check two independent caches over one index.

    Two caches ("1" and "cache2") are created and applied to the same
    index.  The test then replaces document 4 with only the second cache
    manager attached, and finally deletes document 4 after removing it
    from each cache explicitly, checking the rankings returned through
    both caches after every step.
    """
    with tempdir() as basepath:
        # create an index
        indexpath = os.path.join(basepath, "test_index")
        self._create_index(indexpath)

        base_cachepath = os.path.join(basepath, "cache")
        os.makedirs(base_cachepath)

        # create and apply cache 1
        cachepath1 = os.path.join(base_cachepath, "1")
        self._create_and_apply_cache(indexpath, cachepath1, "1")

        # create and apply cache 2
        cachepath2 = os.path.join(base_cachepath, "2")
        self._create_and_apply_cache(indexpath, cachepath2, "cache2")

        # test cache 1
        self._check_cache_results(
            indexpath, cachepath1, "1",
            [["5", "4", "3", "2", "1"], ["4", "2", "5", "3", "1"]])

        # test cache 2
        self._check_cache_results(
            indexpath, cachepath2, "cache2",
            [["5", "4", "3", "2", "1"], ["3", "4", "1", "5", "2"]])

        # The document whose docid is 4 is in both caches; it is replaced
        # here with only one cache manager set, and the results of both
        # caches are checked afterwards.
        # NOTE(review): the original comment said "It must change", yet
        # the rankings asserted below are identical to the ones asserted
        # before the replacement -- confirm the intended wording.

        # replace document
        iconn = IndexerConnection(indexpath)
        cm = XapianCacheManager(cachepath2, id="cache2")
        iconn.set_cache_manager(cm)
        docid, terms = ("4", [("term_a", 4), ("term_b", 2)])
        pdoc = self._create_processed_doc(iconn, docid, terms)
        iconn.replace(pdoc, xapid=int(docid))
        iconn.flush()
        iconn.close()
        cm.close()

        # check if the results in both caches are ok
        self._check_cache_results(
            indexpath, cachepath1, "1",
            [["5", "4", "3", "2", "1"], ["4", "2", "5", "3", "1"]])
        self._check_cache_results(
            indexpath, cachepath2, "cache2",
            [["5", "4", "3", "2", "1"], ["3", "4", "1", "5", "2"]])

        # There are 2 code paths when we deal with caches:
        #  1. the cache has not enough results
        #  2. the cache has enough results
        # In the first case, the result will come from a mixed query
        # against the index.  In the second, the results will come from
        # the cache_manager.  So, the cache managers must be updated.
        # When using multiple cache managers, the deletion must be
        # explicitly done in each cache, and then we must ask for the
        # delete method to ignore cache (not try to update it).  A better
        # approach for this will be developed.

        # remove document
        iconn = IndexerConnection(indexpath)
        for cachepath, cache_id in ((cachepath1, "1"), (cachepath2, "cache2")):
            cm = XapianCacheManager(cachepath, id=cache_id)
            iconn.set_cache_manager(cm)
            iconn._remove_cached_items(xapid=4)
            # Close every manager once its cache was updated; previously
            # the manager of cache "1" was rebound without being closed.
            cm.close()
        iconn.delete(xapid=4, ignore_cache=True)
        iconn.flush()
        iconn.close()

        # cache has not enough results
        self._check_cache_results(
            indexpath, cachepath1, "1",
            [["5", "3", "2", "1"], ["2", "5", "3", "1"]])
        self._check_cache_results(
            indexpath, cachepath2, "cache2",
            [["5", "3", "2", "1"], ["3", "1", "5", "2"]])

        # cache has enough results
        self._check_cache_results(
            indexpath, cachepath1, "1", [["5"], ["2"]], num_results=1)
        self._check_cache_results(
            indexpath, cachepath2, "cache2",
            [["5", "3"], ["3", "1"]], num_results=2)
class XapianIndexer(object):
    """Wrapper around an ``IndexerConnection`` stored in one directory.

    Attributes:
        dbPath: absolute path of the index directory.
        dbconn: the open ``IndexerConnection``.
        lock: ``threading.Lock`` that serialises the ``add_doc`` calls.
    """

    def __init__(self, dirname):
        """Open the index in ``dirname`` and register the field actions."""
        self.dbPath = os.path.abspath(dirname)
        self.dbconn = IndexerConnection(self.dbPath)

        self.dbconn.add_field_action('title', FieldActions.INDEX_FREETEXT,
                                     weight=5, language='en')
        self.dbconn.add_field_action('text', FieldActions.INDEX_FREETEXT,
                                     language='en', spell=True, stop=STOPWORDS)

        # The 'citecnt' actions below are disabled:
        #self.dbconn.add_field_action('citecnt', FieldActions.FACET, type='float')
        #self.dbconn.add_field_action('citecnt', FieldActions.WEIGHT)

        self.lock = threading.Lock()

        # Every field named in FIELD_NUM has its content stored.
        for k in FIELD_NUM:
            self.dbconn.add_field_action(k, FieldActions.STORE_CONTENT)

    def add_doc(self, doc):
        """Add one document to the index.

        doc: a dict.  ``doc['text']`` becomes the 'text' field and
            ``str(doc['id'])`` the document id.  Every other key is added
            as a field of the same name after ``ensure_unicode``; a list
            value yields one field per item.

        Raises ``KeyError`` when 'text' or 'id' is missing.  An
        ``errors.IndexerError`` raised while adding the document is
        printed and not propagated.
        """
        content = doc['text']
        document = UnprocessedDocument()
        document.fields.append(Field('text', content))

        for k, v in doc.items():
            if k in ('text', 'id'):
                continue
            items = v if isinstance(v, list) else [v]
            for item in items:
                document.fields.append(Field(k, ensure_unicode(item)))

        document.id = str(doc['id'])

        # Acquire the lock outside of any try/finally, so that it is only
        # ever released after it was really acquired.
        with self.lock:
            try:
                self.dbconn.add(document)
            except errors.IndexerError as e:
                # Best effort: report the failure and carry on.
                print(str(e))

    def flush(self):
        """Flush the underlying connection."""
        self.dbconn.flush()

    def close(self):
        """Close the underlying connection."""
        self.dbconn.close()

    def clear(self):
        """Close the index, delete its directory and open it again.

        Re-running ``__init__`` also replaces ``self.lock`` with a new lock.
        """
        self.close()
        shutil.rmtree(self.dbPath)
        self.__init__(self.dbPath)
def _apply_cache(self, indexpath, cm):
    """Apply the cached items of ``cm`` to the index at ``indexpath``.

    Opens an ``IndexerConnection`` on ``indexpath``, attaches the cache
    manager ``cm`` and calls ``apply_cached_items()``.  The connection is
    always closed, even when applying the cache raises; ``cm`` belongs to
    the caller and is left open.
    """
    idx = IndexerConnection(indexpath)
    try:
        idx.set_cache_manager(cm)
        idx.apply_cached_items()
    finally:
        idx.close()
def test_multiple_cache(self):
    """Check two caches handled by one ``XapianMultipleCachesManager``.

    Caches "1" and "cache2" are created and applied to the same index.
    Document 4 is then replaced and later deleted with a manager that has
    both caches registered, and the rankings returned through each cache
    are checked after every step.
    """
    with tempdir() as basepath:
        # create an index
        indexpath = os.path.join(basepath, "test_index")
        self._create_index(indexpath)

        base_cachepath = os.path.join(basepath, "cache")
        os.makedirs(base_cachepath)

        def verify(cache_id, expected, **options):
            # All checks share the index and the cache directory.
            self._check_cache_results(
                indexpath, base_cachepath, cache_id, expected, **options)

        # create and apply cache 1, then cache 2
        manager = XapianMultipleCachesManager(base_cachepath)
        for cache_id in ("1", "cache2"):
            manager.add_cache(cache_id)
            manager.select_cache(cache_id)
            self._create_and_apply_cache(indexpath, manager)

        # test cache 1
        verify("1", [["5", "4", "3", "2", "1"], ["4", "2", "5", "3", "1"]])

        # test cache 2
        verify("cache2",
               [["5", "4", "3", "2", "1"], ["3", "4", "1", "5", "2"]])

        # the document whose docid is 4 is in both caches, we're
        # testing here if replacing it with one cache manager set
        # will change the result in the other cache. It must change.

        # replace document
        writer = IndexerConnection(indexpath)
        manager = XapianMultipleCachesManager(base_cachepath)
        manager.add_cache("1")
        manager.add_cache("cache2")
        writer.set_cache_manager(manager)
        replaced_id = "4"
        replaced_terms = [("term_a", 4), ("term_b", 2)]
        pdoc = self._create_processed_doc(writer, replaced_id, replaced_terms)
        writer.replace(pdoc, xapid=int(replaced_id))
        writer.flush()
        writer.close()
        manager.close()

        # check if the results in both caches are ok
        verify("1", [["5", "4", "3", "2", "1"], ["4", "2", "5", "3", "1"]])
        verify("cache2",
               [["5", "4", "3", "2", "1"], ["3", "4", "1", "5", "2"]])

        # there are 2 code paths when we deal with caches:
        #  1. the cache has not enough results
        #  2. the cache has enough results
        # in the first case, the result will come from a mixed query
        # against the index. In the second, the results will come from
        # the cache_manager. So, all the cache managers must be updated.

        # remove document
        manager = XapianMultipleCachesManager(base_cachepath)
        manager.add_cache("1")
        manager.add_cache("cache2")
        writer = IndexerConnection(indexpath)
        writer.set_cache_manager(manager)
        writer.delete(xapid=4)
        manager.close()
        writer.flush()
        writer.close()

        # cache has not enough results
        verify("1", [["5", "3", "2", "1"], ["2", "5", "3", "1"]])
        verify("cache2", [["5", "3", "2", "1"], ["3", "1", "5", "2"]])

        # cache has enough results
        verify("1", [["5"], ["2"]], num_results=1)
        verify("cache2", [["5", "3"], ["3", "1"]], num_results=2)