def reindex(self, batch=1000, skip=0):
    """ find all contentish objects (meaning all objects derived from one
        of the catalog mixin classes) and (re)index them """
    manager = queryUtility(ISolrConnectionManager)
    proc = SolrIndexProcessor(manager)
    conn = manager.getConnection()
    zodb_conn = self.context._p_jar
    log = self.mklog()
    log('reindexing solr catalog...\n')
    if skip:
        log('skipping indexing of %d object(s)...\n' % skip)
    real = timer()          # real time
    lap = timer()           # real lap time (for intermediate commits)
    cpu = timer(clock)      # cpu time
    processed = 0
    schema = manager.getSchema()
    key = schema.uniqueKey
    updates = {}            # mapping to hold data to be updated
    flush = lambda: conn.flush()
    flush = notimeout(flush)

    def checkPoint():
        for boost_values, data in updates.values():
            conn.add(boost_values=boost_values, **data)
        updates.clear()
        msg = 'intermediate commit (%d items processed, ' \
              'last batch in %s)...\n' % (processed, lap.next())
        log(msg)
        logger.info(msg)
        flush()
        zodb_conn.cacheGC()

    cpi = checkpointIterator(checkPoint, batch)
    count = 0
    for path, obj in findObjects(self.context):
        if indexable(obj):
            if getOwnIndexMethod(obj, 'indexObject') is not None:
                log('skipping indexing of %r via private method.\n' % obj)
                continue
            count += 1
            if count <= skip:
                continue
            data, missing = proc.getData(obj)
            prepareData(data)
            if not missing:
                value = data.get(key, None)
                if value is not None:
                    updates[value] = (boost_values(obj, data), data)
                    processed += 1
                    cpi.next()
            else:
                log('missing data, skipping indexing of %r.\n' % obj)
    checkPoint()
    conn.commit()
    log('solr index rebuilt.\n')
    msg = 'processed %d items in %s (%s cpu time).'
    msg = msg % (processed, real.next(), cpu.next())
    log(msg)
    logger.info(msg)
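# NOTE: `reindex` above relies on helpers defined elsewhere in collective.solr
# (`timer`, `notimeout`, `findObjects`, `checkpointIterator`).  As an
# illustration only, here is a minimal sketch of `checkpointIterator` that
# matches the way it is consumed above -- invoke `function` on every
# `interval`-th call to next() -- the real helper may differ in detail:
def checkpointIterator(function, interval=100):
    counter = 0
    while True:
        counter += 1
        if counter % interval == 0:
            function()      # e.g. the intermediate-commit `checkPoint`
        yield None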
def setUp(self):
    provideUtility(SolrConnectionConfig(), ISolrConnectionConfig)
    self.mngr = SolrConnectionManager()
    self.mngr.setHost(active=True)
    conn = self.mngr.getConnection()
    fakehttp(conn, getData('schema.xml'))   # fake schema response
    self.mngr.getSchema()                   # read and cache the schema
    self.proc = SolrIndexProcessor(self.mngr)
def setUp(self):
    self.mngr = SolrConnectionManager()
    self.mngr.setHost(active=True)
    conn = self.mngr.getConnection()
    fakehttp(conn, getData('schema.xml'))   # fake schema response
    self.mngr.getSchema()                   # read and cache the schema
    self.proc = SolrIndexProcessor(self.mngr)
    config = getConfig()
    config.atomic_updates = True
def testTwoRequests(self):
    mngr = SolrConnectionManager(active=True)
    proc = SolrIndexProcessor(mngr)
    output = fakehttp(mngr.getConnection(), getData('schema.xml'),
                      getData('add_response.txt'))
    proc.index(self.foo)
    mngr.closeConnection()
    self.assertEqual(len(output), 2)
    self.failUnless(output.get().startswith(self.schema_request))
    self.assertEqual(sortFields(output.get()), getData('add_request.txt'))
def setUp(self):
    provideUtility(SolrConnectionConfig(), ISolrConnectionConfig)
    self.mngr = SolrConnectionManager()
    self.mngr.setHost(active=True)
    self.conn = self.mngr.getConnection()
    self.proc = SolrIndexProcessor(self.mngr)
    self.log = []   # catch log messages...

    def logger(*args):
        self.log.extend(args)
    logger_indexer.warning = logger
def reindex(self, batch=1000, skip=0, limit=0, ignore_portal_types=None,
            only_portal_types=None, idxs=[], ignore_exceptions=False):
    """ find all contentish objects (meaning all objects derived from one
        of the catalog mixin classes) and (re)index them """
    if ignore_portal_types and only_portal_types:
        raise ValueError("It is not possible to combine "
                         "ignore_portal_types with only_portal_types")
    atomic = idxs != []
    manager = queryUtility(ISolrConnectionManager)
    proc = SolrIndexProcessor(manager)
    conn = manager.getConnection()
    zodb_conn = self.context._p_jar
    log = self.mklog()
    log('reindexing solr catalog...\n')
    if skip:
        log('skipping indexing of %d object(s)...\n' % skip)
    if limit:
        log('limiting indexing to %d object(s)...\n' % limit)
    real = timer()          # real time
    lap = timer()           # real lap time (for intermediate commits)
    cpu = timer(clock)      # cpu time
    processed = 0
    schema = manager.getSchema()
    key = schema.uniqueKey
    updates = {}            # mapping to hold data to be updated
    flush = lambda: conn.commit(soft=True)
    flush = notimeout(flush)

    def checkPoint():
        for my_boost_values, data in updates.values():
            adder = data.pop('_solr_adder')
            try:
                adder(conn, boost_values=my_boost_values, **data)
            except Exception, e:
                logger.warn('Error %s @ %s', e, data['path_string'])
                if not ignore_exceptions:
                    raise
        updates.clear()
        msg = 'intermediate commit (%d items processed, ' \
              'last batch in %s)...\n' % (processed, lap.next())
        log(msg)
        logger.info(msg)
        flush()
        zodb_conn.cacheGC()
def setUp(self):
    self.mngr = SolrConnectionManager()
    self.mngr.setHost(active=True)
    self.conn = self.mngr.getConnection()
    self.proc = SolrIndexProcessor(self.mngr)
    self.log = []   # catch log messages...

    def logger(*args):
        self.log.extend(args)
    logger_indexer.warning = logger
    config = getConfig()
    config.atomic_updates = True
def testLocalConnections(self):
    config = getConfig()
    config.atomic_updates = True
    mngr = SolrConnectionManager(active=True)
    proc = SolrIndexProcessor(mngr)
    mngr.setHost(active=True)
    schema = getData("schema.xml")
    log = []

    def runner():
        # fake schema response on solr connection - caches the schema
        fakehttp(mngr.getConnection(), getData("schema.xml"))
        mngr.getConnection().get_schema()
        fakehttp(mngr.getConnection(), schema)  # fake schema response
        # read and cache the schema
        mngr.getSchema()
        response = getData("add_response.txt")  # fake add response
        output = fakehttp(mngr.getConnection(), response)  # indexing sends data
        proc.index(Foo(id="500", name="python test doc"))
        mngr.closeConnection()
        log.append(str(output))
        log.append(proc)
        log.append(mngr.getConnection())

    # after the runner was set up, another thread can be created and
    # started; its output should contain the proper indexing request,
    # whereas the main thread's connection remains idle; the latter
    # cannot be checked directly, but the connection object would raise
    # an exception if it was used to send a request without setting up
    # a fake response beforehand...
    thread = Thread(target=runner)
    thread.start()
    thread.join()
    conn = mngr.getConnection()     # get this thread's connection
    fakehttp(conn, schema)          # fake schema response
    mngr.getSchema()                # read and cache the schema
    mngr.closeConnection()
    mngr.setHost(active=False)
    self.assertEqual(len(log), 3)
    self.assertEqual(
        sortFields(log[0].encode("utf-8")), getData("add_request.txt").rstrip(b"\n")
    )
    self.failUnless(isinstance(log[1], SolrIndexProcessor))
    self.failUnless(isinstance(log[2], SolrConnection))
    self.failUnless(isinstance(proc, SolrIndexProcessor))
    self.failUnless(isinstance(conn, SolrConnection))
    self.assertEqual(log[1], proc)      # processors should be the same...
    self.assertNotEqual(log[2], conn)   # but not the connections
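# The per-thread connection behaviour asserted above (same processor, but a
# different connection in each thread) is the kind of thing typically built
# on thread-local storage.  A minimal stand-alone sketch of that pattern --
# an assumption for illustration, not the actual SolrConnectionManager code:
import threading


class PerThreadConnections(object):
    """Hands each thread its own lazily created connection object."""

    def __init__(self, factory):
        self._local = threading.local()
        self._factory = factory

    def get(self):
        # create the connection on first use within the calling thread,
        # then keep reusing it for that thread only
        if not hasattr(self._local, 'conn'):
            self._local.conn = self._factory()
        return self._local.conn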
def testExtraRequest(self):
    # basically the same as `testThreeRequests`, except it
    # tests adding fake responses consecutively
    mngr = SolrConnectionManager(active=True)
    proc = SolrIndexProcessor(mngr)
    conn = mngr.getConnection()
    output = fakehttp(conn, getData('schema.xml'))
    fakemore(conn, getData('add_response.txt'))
    proc.index(self.foo)
    fakemore(conn, getData('delete_response.txt'))
    proc.unindex(self.foo)
    mngr.closeConnection()
    self.assertEqual(len(output), 3)
    self.failUnless(output.get().startswith(self.schema_request))
    self.assertEqual(sortFields(output.get()), getData('add_request.txt'))
    self.assertEqual(output.get(), getData('delete_request.txt'))
def testThreeRequests(self):
    mngr = SolrConnectionManager(active=True)
    proc = SolrIndexProcessor(mngr)
    output = fakehttp(
        mngr.getConnection(),
        getData("schema.xml"),
        getData("add_response.txt"),
        getData("delete_response.txt"),
    )
    proc.index(self.foo)
    proc.unindex(self.foo)
    mngr.closeConnection()
    self.assertEqual(len(output), 3)
    self.failUnless(output.get().decode("utf-8").startswith(self.schema_request))
    self.assertEqual(
        sortFields(output.get()), getData("add_request.txt").rstrip(b"\n")
    )
    self.assertEqual(output.get(), getData("delete_request.txt").rstrip(b"\n"))
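# The `fakehttp`/`fakemore` helpers used in the tests above come from
# collective.solr's test utilities: they queue canned responses on the Solr
# connection and record the raw requests that were sent, so assertions can
# inspect them in order.  A toy, self-contained illustration of that idea
# (hypothetical `FakeConnection`, not the real helper) could look like this:
class FakeConnection(object):
    """Replays canned response payloads and records outgoing requests."""

    def __init__(self, *responses):
        self.responses = list(responses)    # served in the order given
        self.requests = []                  # raw request bodies, send order

    def send(self, body):
        self.requests.append(body)
        return self.responses.pop(0)        # next canned response


conn = FakeConnection('<schema/>', '<add-ok/>')
conn.send('GET /schema')                    # -> '<schema/>'
conn.send('<add><doc/></add>')              # -> '<add-ok/>'
assert conn.requests[1].startswith('<add>')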
def sync(self, batch=1000):
    """Sync the Solr index with the portal catalog. Records contained
    in the catalog but not in Solr will be indexed and records not
    contained in the catalog will be removed.
    """
    manager = queryUtility(ISolrConnectionManager)
    proc = SolrIndexProcessor(manager)
    conn = manager.getConnection()
    key = queryUtility(ISolrConnectionManager).getSchema().uniqueKey
    zodb_conn = self.context._p_jar
    catalog = getToolByName(self.context, 'portal_catalog')
    getIndex = catalog._catalog.getIndex
    modified_index = getIndex('modified')
    uid_index = getIndex(key)
    log = self.mklog()
    real = timer()          # real time
    lap = timer()           # real lap time (for intermediate commits)
    cpu = timer(clock)      # cpu time
    # get Solr status
    query = '+%s:[* TO *]' % key
    response = conn.search(q=query, rows=MAX_ROWS, fl='%s modified' % key)
    # avoid creating DateTime instances
    simple_unmarshallers = unmarshallers.copy()
    simple_unmarshallers['date'] = parse_date_as_datetime
    flares = SolrResponse(response, simple_unmarshallers)
    response.close()
    solr_results = {}
    solr_uids = set()

    def _utc_convert(value):
        t_tup = value.utctimetuple()
        return ((((t_tup[0] * 12 + t_tup[1]) * 31 + t_tup[2]) * 24 +
                 t_tup[3]) * 60 + t_tup[4])

    for flare in flares:
        uid = flare[key]
        solr_uids.add(uid)
        solr_results[uid] = _utc_convert(flare['modified'])
    # get catalog status
    cat_results = {}
    cat_uids = set()
    for uid, rid in uid_index._index.items():
        cat_uids.add(uid)
        cat_results[uid] = rid
    # differences
    index = cat_uids.difference(solr_uids)
    solr_uids.difference_update(cat_uids)
    unindex = solr_uids
    processed = 0
    flush = notimeout(lambda: conn.flush())

    def checkPoint():
        msg = 'intermediate commit (%d items processed, ' \
              'last batch in %s)...\n' % (processed, lap.next())
        log(msg)
        logger.info(msg)
        flush()
        zodb_conn.cacheGC()

    cpi = checkpointIterator(checkPoint, batch)
    # Look up objects
    uid_rid_get = cat_results.get
    rid_path_get = catalog._catalog.paths.get
    catalog_traverse = catalog.unrestrictedTraverse

    def lookup(uid, rid=None, uid_rid_get=uid_rid_get,
               rid_path_get=rid_path_get, catalog_traverse=catalog_traverse):
        if rid is None:
            rid = uid_rid_get(uid)
        if not rid:
            return None
        if not isinstance(rid, int):
            rid = tuple(rid)[0]
        path = rid_path_get(rid)
        if not path:
            return None
        try:
            obj = catalog_traverse(path)
        except AttributeError:
            return None
        return obj

    log('processing %d "unindex" operations next...\n' % len(unindex))
    op = notimeout(lambda uid: conn.delete(id=uid))
    for uid in unindex:
        obj = lookup(uid)
        if obj is None:
            op(uid)
            processed += 1
            cpi.next()
        else:
            log('not unindexing existing object %r.\n' % uid)
    log('processing %d "index" operations next...\n' % len(index))
    op = notimeout(lambda obj: proc.index(obj))
    for uid in index:
        obj = lookup(uid)
        if indexable(obj):
            op(obj)
            processed += 1
            cpi.next()
        else:
            log('not indexing unindexable object %r.\n' % uid)
        if obj is not None:
            obj._p_deactivate()
    log('processing "reindex" operations next...\n')
    op = notimeout(lambda obj: proc.reindex(obj))
    cat_mod_get = modified_index._unindex.get
    solr_mod_get = solr_results.get
    done = unindex.union(index)
    for uid, rid in cat_results.items():
        if uid in done:
            continue
        if isinstance(rid, IITreeSet):
            rid = rid.keys()[0]
        if cat_mod_get(rid) != solr_mod_get(uid):
            obj = lookup(uid, rid=rid)
            if indexable(obj):
                op(obj)
                processed += 1
                cpi.next()
            else:
                log('not reindexing unindexable object %r.\n' % uid)
            if obj is not None:
                obj._p_deactivate()
    conn.commit()
    log('solr index synced.\n')
    msg = 'processed %d object(s) in %s (%s cpu time).'
    msg = msg % (processed, real.next(), cpu.next())
    log(msg)
    logger.info(msg)
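# The `_utc_convert` helper inside `sync` packs a UTC timetuple into a single
# integer with minute resolution, so the catalog's `modified` values and the
# dates stored in Solr can be compared cheaply without creating DateTime
# objects.  A small stand-alone worked example of that packing:
from datetime import datetime, timedelta


def _utc_convert(value):
    t_tup = value.utctimetuple()
    return ((((t_tup[0] * 12 + t_tup[1]) * 31 + t_tup[2]) * 24 +
             t_tup[3]) * 60 + t_tup[4])


ts = datetime(2024, 5, 1, 12, 30, 15)
assert _utc_convert(ts) == _utc_convert(ts + timedelta(seconds=40))  # same minute
assert _utc_convert(ts) != _utc_convert(ts + timedelta(minutes=1))   # next minute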
def cleanup(self, batch=1000):
    """remove entries from solr that don't have a corresponding Zope
    object or have a different UID than the real object"""
    manager = queryUtility(ISolrConnectionManager)
    proc = SolrIndexProcessor(manager)
    conn = manager.getConnection()
    log = self.mklog(use_std_log=True)
    log("cleaning up solr index...\n")
    key = manager.getSchema().uniqueKey
    start = 0
    resp = SolrResponse(conn.search(q="*:*", rows=batch, start=start))
    res = resp.results()
    log("%s items in solr catalog\n" % resp.response.numFound)
    deleted = 0
    reindexed = 0
    while len(res) > 0:
        for flare in res:
            try:
                ob = PloneFlare(flare).getObject()
            except Exception as err:
                log("Error getting object, removing: %s (%s)\n" %
                    (flare["path_string"], err))
                conn.delete(flare[key])
                deleted += 1
                continue
            if ob is None:
                log("Object not found, removing: %s\n" %
                    (flare["path_string"]))
                conn.delete(flare[key])
                deleted += 1
                continue
            if not IUUIDAware.providedBy(ob):
                no_skipping_msg = ("Object %s of type %s does not " +
                                   "support uuids, skipping.\n")
                log(no_skipping_msg %
                    ("/".join(ob.getPhysicalPath()), ob.meta_type))
                continue
            uuid = IUUID(ob)
            if uuid != flare[key]:
                log("indexed under wrong UID, removing: %s\n" %
                    flare["path_string"])
                conn.delete(flare[key])
                deleted += 1
                realob_res = SolrResponse(
                    conn.search(q="%s:%s" % (key, uuid))).results()
                if len(realob_res) == 0:
                    log("no sane entry for last object, reindexing\n")
                    data, missing = proc.getData(ob)
                    prepareData(data)
                    if not missing:
                        boost = boost_values(ob, data)
                        conn.add(boost_values=boost, **data)
                        reindexed += 1
                    else:
                        log(" missing data, cannot index.\n")
        log("handled batch of %d items, committing\n" % len(res))
        conn.commit()
        start += batch
        resp = SolrResponse(conn.search(q="*:*", rows=batch, start=start))
        res = resp.results()
    finished_msg = ("solr cleanup finished, %s item(s) removed, " +
                    "%s item(s) reindexed\n")
    msg = finished_msg % (deleted, reindexed)
    log(msg)
    logger.info(msg)
def reindex(
    self,
    batch=1000,
    skip=0,
    limit=0,
    ignore_portal_types=None,
    only_portal_types=None,
    idxs=[],
    ignore_exceptions=False,
):
    """find all contentish objects (meaning all objects derived from one
    of the catalog mixin classes) and (re)index them"""
    if ignore_portal_types and only_portal_types:
        raise ValueError("It is not possible to combine "
                         "ignore_portal_types with only_portal_types")
    atomic = idxs != []
    manager = queryUtility(ISolrConnectionManager)
    proc = SolrIndexProcessor(manager)
    conn = manager.getConnection()
    zodb_conn = self.context._p_jar
    log = self.mklog()
    log("reindexing solr catalog...\n")
    if skip:
        log("skipping indexing of %d object(s)...\n" % skip)
    if limit:
        log("limiting indexing to %d object(s)...\n" % limit)
    real = timer()              # real time
    lap = timer()               # real lap time (for intermediate commits)
    cpu = timer(process_time)   # cpu time
    processed = 0
    schema = manager.getSchema()
    key = schema.uniqueKey
    updates = {}                # mapping to hold data to be updated

    def flush():
        return conn.commit(soft=True)

    flush = notimeout(flush)

    def checkPoint():
        for my_boost_values, data in updates.values():
            adder = data.pop("_solr_adder")
            try:
                adder(conn, boost_values=my_boost_values, **data)
            except Exception as e:
                logger.warning("Error %s @ %s", e, data["path_string"])
                if not ignore_exceptions:
                    raise
        updates.clear()
        msg = ("intermediate commit (%d items processed, "
               "last batch in %s)...\n" % (processed, next(lap)))
        log(msg)
        logger.info(msg)
        flush()
        zodb_conn.cacheGC()

    cpi = checkpointIterator(checkPoint, batch)
    count = 0
    if atomic:
        log("indexing only {0} \n".format(idxs))
    for path, obj in findObjects(self.context):
        if ICheckIndexable(obj)():
            if getOwnIndexMethod:
                if getOwnIndexMethod(obj, "indexObject") is not None:
                    log("skipping indexing of %r via private method.\n" % obj)
                    continue
            count += 1
            if count <= skip:
                continue
            if ignore_portal_types:
                if obj.portal_type in ignore_portal_types:
                    continue
            if only_portal_types:
                if obj.portal_type not in only_portal_types:
                    continue
            attributes = None
            if atomic:
                attributes = idxs
            # For atomic updates to work the uniqueKey must be present
            # in *every* update operation.
            if attributes and key not in attributes:
                attributes.append(key)
            data, missing = proc.getData(obj, attributes=attributes)
            prepareData(data)
            if not missing or atomic:
                value = data.get(key, None)
                if value is not None:
                    log("indexing %r\n" % obj)
                    pt = data.get("portal_type", "default")
                    adder = queryAdapter(obj, ISolrAddHandler, name=pt)
                    if adder is None:
                        adder = DefaultAdder(obj)
                    data["_solr_adder"] = adder
                    updates[value] = (boost_values(obj, data), data)
                    processed += 1
                    next(cpi)
            else:
                log("missing data, skipping indexing of %r.\n" % obj)
            if limit and count >= (skip + limit):
                break
    checkPoint()
    conn.commit()
    log("solr index rebuilt.\n")
    msg = "processed %d items in %s (%s cpu time)."
    msg = msg % (processed, next(real), next(cpu))
    log(msg)
    logger.info(msg)
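# The maintenance methods shown here (`reindex`, `sync`, `cleanup`) are
# exposed through collective.solr's maintenance browser view.  A hedged
# example of triggering a full reindex from outside the Zope instance --
# the site id, port, credentials and exact view path below are placeholders
# and should be checked against your deployment and collective.solr version:
import requests

requests.get(
    "http://localhost:8080/Plone/@@solr-maintenance/reindex",  # assumed URL layout
    auth=("admin", "secret"),   # placeholder credentials
    timeout=None,               # a full reindex can run for a long time
)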
def reindex(self, batch=1000, skip=0, limit=0, ignore_portal_types=None,
            only_portal_types=None):
    """ find all contentish objects (meaning all objects derived from one
        of the catalog mixin classes) and (re)index them """
    if ignore_portal_types and only_portal_types:
        raise ValueError("It is not possible to combine "
                         "ignore_portal_types with only_portal_types")
    manager = queryUtility(ISolrConnectionManager)
    proc = SolrIndexProcessor(manager)
    conn = manager.getConnection()
    zodb_conn = self.context._p_jar
    log = self.mklog()
    log('reindexing solr catalog...\n')
    if skip:
        log('skipping indexing of %d object(s)...\n' % skip)
    if limit:
        log('limiting indexing to %d object(s)...\n' % limit)
    real = timer()          # real time
    lap = timer()           # real lap time (for intermediate commits)
    cpu = timer(clock)      # cpu time
    processed = 0
    schema = manager.getSchema()
    key = schema.uniqueKey
    updates = {}            # mapping to hold data to be updated
    flush = lambda: conn.flush()
    flush = notimeout(flush)

    def checkPoint():
        for my_boost_values, data in updates.values():
            adder = data.pop('_solr_adder')
            adder(conn, boost_values=my_boost_values, **data)
        updates.clear()
        msg = 'intermediate commit (%d items processed, ' \
              'last batch in %s)...\n' % (processed, lap.next())
        log(msg)
        logger.info(msg)
        flush()
        zodb_conn.cacheGC()

    cpi = checkpointIterator(checkPoint, batch)
    count = 0
    for path, obj in findObjects(self.context):
        if ICheckIndexable(obj)():
            count += 1
            if count <= skip:
                continue
            if ignore_portal_types:
                if obj.portal_type in ignore_portal_types:
                    continue
            if only_portal_types:
                if obj.portal_type not in only_portal_types:
                    continue
            data, missing = proc.getData(obj)
            prepareData(data)
            if not missing:
                value = data.get(key, None)
                if value is not None:
                    log('indexing %r\n' % obj)
                    pt = data.get('portal_type', 'default')
                    adder = queryAdapter(obj, ISolrAddHandler, name=pt)
                    if adder is None:
                        adder = DefaultAdder(obj)
                    data['_solr_adder'] = adder
                    updates[value] = (boost_values(obj, data), data)
                    processed += 1
                    cpi.next()
            else:
                log('missing data, skipping indexing of %r.\n' % obj)
            if limit and count >= (skip + limit):
                break
    checkPoint()
    conn.commit()
    log('solr index rebuilt.\n')
    msg = 'processed %d items in %s (%s cpu time).'
    msg = msg % (processed, real.next(), cpu.next())
    log(msg)
    logger.info(msg)