# NOTE: imports reconstructed from the names used in the methods below; the
# module paths follow collective.solr's layout but may differ between
# releases. The variants below mix Python 2 era code (time.clock,
# generator.next()) with a later Python 3 version (time.process_time,
# next(generator)).
from logging import getLogger
from time import clock, process_time  # cpu-time sources (Py2 / Py3 variants)
from zope.component import queryAdapter, queryUtility
from plone.uuid.interfaces import IUUID, IUUIDAware
from collective.solr.flare import PloneFlare
from collective.solr.indexer import (
    DefaultAdder,
    SolrIndexProcessor,
    boost_values,
    indexable,
)
from collective.solr.interfaces import (
    ICheckIndexable,
    ISolrAddHandler,
    ISolrConnectionManager,
)
from collective.solr.parser import SolrResponse
from collective.solr.utils import findObjects, getOwnIndexMethod, prepareData

logger = getLogger('collective.solr.maintenance')
# timer(), checkpointIterator(), notimeout() and mklog() are defined in this
# module; minimal sketches of them follow the first method below.


def reindex(self, batch=1000, skip=0):
    """ find all contentish objects (meaning all objects derived from one
        of the catalog mixin classes) and (re)index them """
    manager = queryUtility(ISolrConnectionManager)
    proc = SolrIndexProcessor(manager)
    conn = manager.getConnection()
    zodb_conn = self.context._p_jar
    log = self.mklog()
    log('reindexing solr catalog...\n')
    if skip:
        log('skipping indexing of %d object(s)...\n' % skip)
    real = timer()  # real time
    lap = timer()  # real lap time (for intermediate commits)
    cpu = timer(clock)  # cpu time
    processed = 0
    schema = manager.getSchema()
    key = schema.uniqueKey
    updates = {}  # dict to hold data to be updated
    flush = lambda: conn.flush()
    flush = notimeout(flush)

    def checkPoint():
        for boost_values, data in updates.values():
            conn.add(boost_values=boost_values, **data)
        updates.clear()
        msg = 'intermediate commit (%d items processed, ' \
              'last batch in %s)...\n' % (processed, lap.next())
        log(msg)
        logger.info(msg)
        flush()
        zodb_conn.cacheGC()

    cpi = checkpointIterator(checkPoint, batch)
    count = 0
    for path, obj in findObjects(self.context):
        if indexable(obj):
            if getOwnIndexMethod(obj, 'indexObject') is not None:
                log('skipping indexing of %r via private method.\n' % obj)
                continue
            count += 1
            if count <= skip:
                continue
            data, missing = proc.getData(obj)
            prepareData(data)
            if not missing:
                value = data.get(key, None)
                if value is not None:
                    updates[value] = (boost_values(obj, data), data)
                    processed += 1
                    cpi.next()
            else:
                log('missing data, skipping indexing of %r.\n' % obj)
    checkPoint()
    conn.commit()
    log('solr index rebuilt.\n')
    msg = 'processed %d items in %s (%s cpu time).'
    msg = msg % (processed, real.next(), cpu.next())
    log(msg)
    logger.info(msg)
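# The reindex() above relies on three helpers defined alongside it in this
# module: timer(), checkpointIterator() and notimeout(). Their exact
# implementations vary between releases; the sketch below is a minimal
# reconstruction, assuming only the behaviour the calls above depend on.
# notimeout() (not sketched here) wraps a callable so the solr connection
# timeout is lifted for the duration of the call.
from time import time


def timer(func=time):
    """generator yielding the (formatted) time elapsed since its last call"""
    def gen(last=func()):
        while True:
            elapsed = func() - last
            last = func()
            yield '%.3fs' % elapsed
    return gen()


def checkpointIterator(function, interval=100):
    """generator calling the given function on every `interval`th iteration"""
    counter = 0
    while True:
        counter += 1
        if counter % interval == 0:
            function()
        yield None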
def cleanup(self, batch=1000):
    """ remove entries from solr that don't have a corresponding Zope
        object or have a different UID than the real object """
    manager = queryUtility(ISolrConnectionManager)
    proc = SolrIndexProcessor(manager)
    conn = manager.getConnection()
    log = self.mklog(use_std_log=True)
    log('cleaning up solr index...\n')
    key = manager.getSchema().uniqueKey
    start = 0
    resp = SolrResponse(conn.search(q='*:*', rows=batch, start=start))
    res = resp.results()
    log('%s items in solr catalog\n' % resp.response.numFound)
    deleted = 0
    reindexed = 0
    while len(res) > 0:
        for flare in res:
            try:
                ob = PloneFlare(flare).getObject()
            except Exception as err:
                log('Error getting object, removing: %s (%s)\n' % (
                    flare['path_string'], err))
                conn.delete(flare[key])
                deleted += 1
                continue
            if not IUUIDAware.providedBy(ob):
                log('Object %s of type %s does not support uuids, skipping.\n'
                    % ('/'.join(ob.getPhysicalPath()), ob.meta_type))
                continue
            uuid = IUUID(ob)
            if uuid != flare[key]:
                log('indexed under wrong UID, removing: %s\n' %
                    flare['path_string'])
                conn.delete(flare[key])
                deleted += 1
                realob_res = SolrResponse(
                    conn.search(q='%s:%s' % (key, uuid))).results()
                if len(realob_res) == 0:
                    log('no sane entry for last object, reindexing\n')
                    data, missing = proc.getData(ob)
                    prepareData(data)
                    if not missing:
                        boost = boost_values(ob, data)
                        conn.add(boost_values=boost, **data)
                        reindexed += 1
                    else:
                        log(' missing data, cannot index.\n')
        log('handled batch of %d items, committing\n' % len(res))
        conn.commit()
        start += batch
        resp = SolrResponse(conn.search(q='*:*', rows=batch, start=start))
        res = resp.results()
    msg = ('solr cleanup finished, %s item(s) removed, '
           '%s item(s) reindexed\n' % (deleted, reindexed))
    log(msg)
    logger.info(msg)
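# mklog() is a helper method on the same maintenance view. A minimal sketch,
# assuming it returns a closure that streams progress messages straight to
# the HTTP response and mirrors them to the module logger when use_std_log
# is set; the timestamp prefix is an assumption, not visible in the code above.
from time import strftime


def mklog(self, use_std_log=False):
    """return a log function writing (timestamped) messages to the response"""
    write = self.request.RESPONSE.write

    def log(msg, timestamp=True):
        if timestamp:
            msg = strftime('%Y/%m/%d-%H:%M:%S ') + msg
        write(msg)
        if use_std_log:
            logger.info(msg)
    return log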
def reindex(self, batch=1000, skip=0, limit=0):
    """ find all contentish objects (meaning all objects derived from one
        of the catalog mixin classes) and (re)index them """
    manager = queryUtility(ISolrConnectionManager)
    proc = SolrIndexProcessor(manager)
    conn = manager.getConnection()
    zodb_conn = self.context._p_jar
    log = self.mklog()
    log('reindexing solr catalog...\n')
    if skip:
        log('skipping indexing of %d object(s)...\n' % skip)
    if limit:
        log('limiting indexing to %d object(s)...\n' % limit)
    real = timer()  # real time
    lap = timer()  # real lap time (for intermediate commits)
    cpu = timer(clock)  # cpu time
    processed = 0
    schema = manager.getSchema()
    key = schema.uniqueKey
    updates = {}  # dict to hold data to be updated
    flush = lambda: conn.flush()
    flush = notimeout(flush)

    def checkPoint():
        for boost_values, data in updates.values():
            adder = data.pop('_solr_adder')
            adder(conn, boost_values=boost_values, **data)
        updates.clear()
        msg = 'intermediate commit (%d items processed, ' \
              'last batch in %s)...\n' % (processed, lap.next())
        log(msg)
        logger.info(msg)
        flush()
        zodb_conn.cacheGC()

    cpi = checkpointIterator(checkPoint, batch)
    count = 0
    for path, obj in findObjects(self.context):
        if ICheckIndexable(obj)():
            count += 1
            if count <= skip:
                continue
            data, missing = proc.getData(obj)
            prepareData(data)
            if not missing:
                value = data.get(key, None)
                if value is not None:
                    log('indexing %r\n' % obj)
                    pt = data.get('portal_type', 'default')
                    adder = queryAdapter(obj, ISolrAddHandler, name=pt)
                    if adder is None:
                        adder = DefaultAdder(obj)
                    data['_solr_adder'] = adder
                    updates[value] = (boost_values(obj, data), data)
                    processed += 1
                    cpi.next()
            else:
                log('missing data, skipping indexing of %r.\n' % obj)
            if limit and count >= (skip + limit):
                break
    checkPoint()
    conn.commit()
    log('solr index rebuilt.\n')
    msg = 'processed %d items in %s (%s cpu time).'
    msg = msg % (processed, real.next(), cpu.next())
    log(msg)
    logger.info(msg)
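# The adder lookup above, queryAdapter(obj, ISolrAddHandler, name=pt), makes
# the actual "add" operation pluggable per portal_type. A hypothetical custom
# handler is sketched below, assuming DefaultAdder's __call__(conn, **data)
# simply forwards to conn.add(**data); the class name and the extra field are
# invented for illustration, and the adapter would be registered under the
# relevant portal_type name via ZCML.
class NewsItemAdder(DefaultAdder):
    """hypothetical add handler registered under the name 'News Item'"""

    def __call__(self, conn, **data):
        # illustrative tweak: tag news items before handing the data to solr
        data.setdefault('subject', []).append('news')
        super(NewsItemAdder, self).__call__(conn, **data)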
def cleanup(self, batch=1000):
    """remove entries from solr that don't have a corresponding Zope
    object or have a different UID than the real object"""
    manager = queryUtility(ISolrConnectionManager)
    proc = SolrIndexProcessor(manager)
    conn = manager.getConnection()
    log = self.mklog(use_std_log=True)
    log("cleaning up solr index...\n")
    key = manager.getSchema().uniqueKey
    start = 0
    resp = SolrResponse(conn.search(q="*:*", rows=batch, start=start))
    res = resp.results()
    log("%s items in solr catalog\n" % resp.response.numFound)
    deleted = 0
    reindexed = 0
    while len(res) > 0:
        for flare in res:
            try:
                ob = PloneFlare(flare).getObject()
            except Exception as err:
                log("Error getting object, removing: %s (%s)\n"
                    % (flare["path_string"], err))
                conn.delete(flare[key])
                deleted += 1
                continue
            if ob is None:
                log("Object not found, removing: %s\n" % (flare["path_string"]))
                conn.delete(flare[key])
                deleted += 1
                continue
            if not IUUIDAware.providedBy(ob):
                no_skipping_msg = ("Object %s of type %s does not "
                                   "support uuids, skipping.\n")
                log(no_skipping_msg
                    % ("/".join(ob.getPhysicalPath()), ob.meta_type))
                continue
            uuid = IUUID(ob)
            if uuid != flare[key]:
                log("indexed under wrong UID, removing: %s\n"
                    % flare["path_string"])
                conn.delete(flare[key])
                deleted += 1
                realob_res = SolrResponse(
                    conn.search(q="%s:%s" % (key, uuid))).results()
                if len(realob_res) == 0:
                    log("no sane entry for last object, reindexing\n")
                    data, missing = proc.getData(ob)
                    prepareData(data)
                    if not missing:
                        boost = boost_values(ob, data)
                        conn.add(boost_values=boost, **data)
                        reindexed += 1
                    else:
                        log(" missing data, cannot index.\n")
        log("handled batch of %d items, committing\n" % len(res))
        conn.commit()
        start += batch
        resp = SolrResponse(conn.search(q="*:*", rows=batch, start=start))
        res = resp.results()
    finished_msg = ("solr cleanup finished, %s item(s) removed, "
                    "%s item(s) reindexed\n")
    msg = finished_msg % (deleted, reindexed)
    log(msg)
    logger.info(msg)
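# PloneFlare (from collective.solr.flare) wraps a raw solr result dict (a
# "flare") so it can be used like a catalog brain. A rough sketch of the one
# behaviour cleanup() relies on, assuming getObject() traverses the indexed
# path_string from the site root and returns None when it no longer resolves;
# the real class offers far more, so this stand-in is illustrative only.
from zope.component.hooks import getSite


class PloneFlareSketch(dict):
    """hypothetical, simplified stand-in for collective.solr's PloneFlare"""

    def __init__(self, flare):
        self.update(flare)

    def getObject(self):
        site = getSite()
        # None signals that the indexed path is stale
        return site.unrestrictedTraverse(self['path_string'].lstrip('/'), None)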
def reindex(
    self,
    batch=1000,
    skip=0,
    limit=0,
    ignore_portal_types=None,
    only_portal_types=None,
    idxs=[],
    ignore_exceptions=False,
):
    """find all contentish objects (meaning all objects derived from one
    of the catalog mixin classes) and (re)index them"""
    if ignore_portal_types and only_portal_types:
        raise ValueError("It is not possible to combine "
                         "ignore_portal_types with only_portal_types")
    atomic = idxs != []
    manager = queryUtility(ISolrConnectionManager)
    proc = SolrIndexProcessor(manager)
    conn = manager.getConnection()
    zodb_conn = self.context._p_jar
    log = self.mklog()
    log("reindexing solr catalog...\n")
    if skip:
        log("skipping indexing of %d object(s)...\n" % skip)
    if limit:
        log("limiting indexing to %d object(s)...\n" % limit)
    real = timer()  # real time
    lap = timer()  # real lap time (for intermediate commits)
    cpu = timer(process_time)  # cpu time
    processed = 0
    schema = manager.getSchema()
    key = schema.uniqueKey
    updates = {}  # dict to hold data to be updated

    def flush():
        return conn.commit(soft=True)

    flush = notimeout(flush)

    def checkPoint():
        for my_boost_values, data in updates.values():
            adder = data.pop("_solr_adder")
            try:
                adder(conn, boost_values=my_boost_values, **data)
            except Exception as e:
                logger.warning("Error %s @ %s", e, data["path_string"])
                if not ignore_exceptions:
                    raise
        updates.clear()
        msg = ("intermediate commit (%d items processed, "
               "last batch in %s)...\n" % (processed, next(lap)))
        log(msg)
        logger.info(msg)
        flush()
        zodb_conn.cacheGC()

    cpi = checkpointIterator(checkPoint, batch)
    count = 0
    if atomic:
        log("indexing only {0} \n".format(idxs))
    for path, obj in findObjects(self.context):
        if ICheckIndexable(obj)():
            # getOwnIndexMethod may be unavailable (None); only perform the
            # private-method check when it is importable
            if getOwnIndexMethod:
                if getOwnIndexMethod(obj, "indexObject") is not None:
                    log("skipping indexing of %r via private method.\n" % obj)
                    continue
            count += 1
            if count <= skip:
                continue
            if ignore_portal_types:
                if obj.portal_type in ignore_portal_types:
                    continue
            if only_portal_types:
                if obj.portal_type not in only_portal_types:
                    continue
            attributes = None
            if atomic:
                # copy, so appending the unique key below cannot mutate the
                # shared default argument list
                attributes = list(idxs)
            # For atomic updates to work the uniqueKey must be present
            # in *every* update operation.
            if attributes and key not in attributes:
                attributes.append(key)
            data, missing = proc.getData(obj, attributes=attributes)
            prepareData(data)
            if not missing or atomic:
                value = data.get(key, None)
                if value is not None:
                    log("indexing %r\n" % obj)
                    pt = data.get("portal_type", "default")
                    adder = queryAdapter(obj, ISolrAddHandler, name=pt)
                    if adder is None:
                        adder = DefaultAdder(obj)
                    data["_solr_adder"] = adder
                    updates[value] = (boost_values(obj, data), data)
                    processed += 1
                    next(cpi)
            else:
                log("missing data, skipping indexing of %r.\n" % obj)
            if limit and count >= (skip + limit):
                break
    checkPoint()
    conn.commit()
    log("solr index rebuilt.\n")
    msg = "processed %d items in %s (%s cpu time)."
    msg = msg % (processed, next(real), next(cpu))
    log(msg)
    logger.info(msg)
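# In collective.solr these methods are exposed on a browser view (registered
# as @@solr-maintenance), so a rebuild can be kicked off over HTTP. The host,
# port and site id below are assumptions about the deployment:
#
#   curl 'http://localhost:8080/plone/@@solr-maintenance/reindex?batch=500'
#
# Zope's form marshalling turns repeated ":list" parameters into the idxs
# list, enabling an atomic update of selected indexes only:
#
#   curl 'http://localhost:8080/plone/@@solr-maintenance/reindex?idxs:list=Title&idxs:list=SearchableText'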
def reindex(self, batch=1000, skip=0, limit=0, ignore_portal_types=None,
            only_portal_types=None):
    """ find all contentish objects (meaning all objects derived from one
        of the catalog mixin classes) and (re)index them """
    if ignore_portal_types and only_portal_types:
        raise ValueError("It is not possible to combine "
                         "ignore_portal_types with only_portal_types")
    manager = queryUtility(ISolrConnectionManager)
    proc = SolrIndexProcessor(manager)
    conn = manager.getConnection()
    zodb_conn = self.context._p_jar
    log = self.mklog()
    log('reindexing solr catalog...\n')
    if skip:
        log('skipping indexing of %d object(s)...\n' % skip)
    if limit:
        log('limiting indexing to %d object(s)...\n' % limit)
    real = timer()  # real time
    lap = timer()  # real lap time (for intermediate commits)
    cpu = timer(clock)  # cpu time
    processed = 0
    schema = manager.getSchema()
    key = schema.uniqueKey
    updates = {}  # dict to hold data to be updated
    flush = lambda: conn.flush()
    flush = notimeout(flush)

    def checkPoint():
        for my_boost_values, data in updates.values():
            adder = data.pop('_solr_adder')
            adder(conn, boost_values=my_boost_values, **data)
        updates.clear()
        msg = 'intermediate commit (%d items processed, ' \
              'last batch in %s)...\n' % (processed, lap.next())
        log(msg)
        logger.info(msg)
        flush()
        zodb_conn.cacheGC()

    cpi = checkpointIterator(checkPoint, batch)
    count = 0
    for path, obj in findObjects(self.context):
        if ICheckIndexable(obj)():
            count += 1
            if count <= skip:
                continue
            if ignore_portal_types:
                if obj.portal_type in ignore_portal_types:
                    continue
            if only_portal_types:
                if obj.portal_type not in only_portal_types:
                    continue
            data, missing = proc.getData(obj)
            prepareData(data)
            if not missing:
                value = data.get(key, None)
                if value is not None:
                    log('indexing %r\n' % obj)
                    pt = data.get('portal_type', 'default')
                    adder = queryAdapter(obj, ISolrAddHandler, name=pt)
                    if adder is None:
                        adder = DefaultAdder(obj)
                    data['_solr_adder'] = adder
                    updates[value] = (boost_values(obj, data), data)
                    processed += 1
                    cpi.next()
            else:
                log('missing data, skipping indexing of %r.\n' % obj)
            if limit and count >= (skip + limit):
                break
    checkPoint()
    conn.commit()
    log('solr index rebuilt.\n')
    msg = 'processed %d items in %s (%s cpu time).'
    msg = msg % (processed, real.next(), cpu.next())
    log(msg)
    logger.info(msg)
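# Called programmatically (e.g. from a debug prompt or an upgrade step) the
# portal-type filters read naturally; `portal` and the view name below are
# assumptions about the surrounding deployment:
#
#   maintenance = portal.unrestrictedTraverse('@@solr-maintenance')
#   maintenance.reindex(batch=500, only_portal_types=['Document', 'News Item'])
#
# and cleanup() can follow to drop entries whose objects no longer exist:
#
#   maintenance.cleanup(batch=1000)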