Exemplo n.º 1
0
    def index(self, obj, attributes=None):
        """Index the specified attributes for obj using atomic updates, or all
        of them if `attributes` is `None`.
        Also make sure the `uniqueKey` is part of attributes, and passing the
        attributes to the self.getData() call to avoid causing Plone to index
        all fields instead of just the necessary ones.
        """
        conn = self.getConnection()
        if conn is not None and ICheckIndexable(obj)():
            schema = self.manager.getSchema()
            if schema is None:
                msg = 'unable to fetch schema, skipping indexing of %r'
                logger.warning(msg, obj)
                return
            uniqueKey = schema.get('uniqueKey', None)
            if uniqueKey is None:
                msg = 'schema is missing unique key, skipping indexing of %r'
                logger.warning(msg, obj)
                return

            if attributes is not None:

                if 'path' in attributes:
                    attributes = list(attributes)
                    attributes.extend(
                        ['path_string', 'path_parents', 'path_depth'])

                attributes = set(schema.keys()).intersection(attributes)
                if not attributes:
                    return

                if uniqueKey not in attributes:
                    # The uniqueKey is required in order to identify the
                    # document when doing atomic updates.
                    attributes.add(uniqueKey)

            data, missing = self.getData(obj, attributes=attributes)
            if not data:
                return  # don't index with no data...
            prepareData(data)
            if data.get(uniqueKey, None) is not None and not missing:
                registry = getUtility(IRegistry)
                config_commit_within = registry[
                    'collective.solr.commit_within']  # noqa
                if config_commit_within:
                    data['commitWithin'] = config_commit_within
                try:
                    logger.debug('indexing %r (%r)', obj, data)
                    pt = data.get('portal_type', 'default')
                    logger.debug('indexing %r with %r adder (%r)', obj, pt,
                                 data)

                    adder = queryAdapter(obj, ISolrAddHandler, name=pt)

                    if adder is None:
                        adder = DefaultAdder(obj)
                    adder(conn, boost_values=boost_values(obj, data), **data)
                except (SolrConnectionException, error):
                    logger.exception('exception during indexing %r', obj)
Exemplo n.º 2
0
    def index(self, obj, attributes=None):
        conn = self.getConnection()
        if conn is not None and ICheckIndexable(obj)():
            # unfortunately with current versions of solr we need to provide
            # data for _all_ fields during an <add> -- partial updates aren't
            # supported (see https://issues.apache.org/jira/browse/SOLR-139)
            # however, the reindexing can be skipped if none of the given
            # attributes match existing solr indexes...
            schema = self.manager.getSchema()
            if schema is None:
                msg = 'unable to fetch schema, skipping indexing of %r'
                logger.warning(msg, obj)
                return
            uniqueKey = schema.get('uniqueKey', None)
            if uniqueKey is None:
                msg = 'schema is missing unique key, skipping indexing of %r'
                logger.warning(msg, obj)
                return
            if attributes is not None:
                attributes = set(schema.keys()).intersection(attributes)
                if not attributes:
                    return
            data, missing = self.getData(obj)
            if not data:
                return  # don't index with no data...
            prepareData(data)
            if data.get(uniqueKey, None) is not None and not missing:
                config = getUtility(ISolrConnectionConfig)
                if config.commit_within:
                    data['commitWithin'] = config.commit_within
                try:
                    logger.debug('indexing %r (%r)', obj, data)
                    pt = data.get('portal_type', 'default')
                    logger.debug('indexing %r with %r adder (%r)', obj, pt,
                                 data)

                    adder = queryAdapter(obj, ISolrAddHandler, name=pt)

                    if adder is None:
                        adder = DefaultAdder(obj)
                    adder(conn, boost_values=boost_values(obj, data), **data)
                except (SolrException, error):
                    logger.exception('exception during indexing %r', obj)
Exemplo n.º 3
0
    def sync(self, batch=1000, preImportDeleteQuery="*:*"):
        """Sync the Solr index with the portal catalog. Records contained
        in the catalog but not in Solr will be indexed and records not
        contained in the catalog will be removed.
        """
        manager = queryUtility(ISolrConnectionManager)
        proc = SolrIndexProcessor(manager)
        conn = manager.getConnection()
        key = queryUtility(ISolrConnectionManager).getSchema().uniqueKey
        zodb_conn = self.context._p_jar
        catalog = getToolByName(self.context, "portal_catalog")
        getIndex = catalog._catalog.getIndex
        modified_index = getIndex("modified")
        uid_index = getIndex(key)
        log = self.mklog()
        real = timer()  # real time
        lap = timer()  # real lap time (for intermediate commits)
        cpu = timer(process_time)  # cpu time
        # get Solr status
        response = conn.search(q=preImportDeleteQuery,
                               rows=MAX_ROWS,
                               fl="%s modified" % key)
        # avoid creating DateTime instances
        simple_unmarshallers = unmarshallers.copy()
        simple_unmarshallers["date"] = parse_date_as_datetime
        flares = SolrResponse(response, simple_unmarshallers)
        response.close()
        solr_results = {}
        solr_uids = set()

        def _utc_convert(value):
            t_tup = value.utctimetuple()
            return (((t_tup[0] * 12 + t_tup[1]) * 31 + t_tup[2]) * 24 +
                    t_tup[3]) * 60 + t_tup[4]

        for flare in flares:
            uid = flare[key]
            solr_uids.add(uid)
            solr_results[uid] = _utc_convert(flare["modified"])
        # get catalog status
        cat_results = {}
        cat_uids = set()
        for uid, rid in uid_index._index.items():
            cat_uids.add(uid)
            cat_results[uid] = rid
        # differences
        index = cat_uids.difference(solr_uids)
        solr_uids.difference_update(cat_uids)
        unindex = solr_uids
        processed = 0
        flush = notimeout(lambda: conn.flush())

        def checkPoint():
            msg = ("intermediate commit (%d items processed, "
                   "last batch in %s)...\n" % (processed, next(lap)))
            log(msg)
            logger.info(msg)
            flush()
            zodb_conn.cacheGC()

        cpi = checkpointIterator(checkPoint, batch)
        # Look up objects
        uid_rid_get = cat_results.get
        rid_path_get = catalog._catalog.paths.get
        catalog_traverse = catalog.unrestrictedTraverse

        def lookup(
            uid,
            rid=None,
            uid_rid_get=uid_rid_get,
            rid_path_get=rid_path_get,
            catalog_traverse=catalog_traverse,
        ):
            if rid is None:
                rid = uid_rid_get(uid)
            if not rid:
                return None
            if not isinstance(rid, int):
                rid = tuple(rid)[0]
            path = rid_path_get(rid)
            if not path:
                return None
            try:
                obj = catalog_traverse(path)
            except AttributeError:
                return None
            return obj

        log('processing %d "unindex" operations next...\n' % len(unindex))
        op = notimeout(lambda uid: conn.delete(id=uid))
        for uid in unindex:
            obj = lookup(uid)
            if obj is None:
                op(uid)
                processed += 1
                next(cpi)
            else:
                log("not unindexing existing object %r.\n" % uid)
        log('processing %d "index" operations next...\n' % len(index))
        op = notimeout(lambda obj: proc.index(obj))
        for uid in index:
            obj = lookup(uid)
            if ICheckIndexable(obj)():
                op(obj)
                processed += 1
                next(cpi)
            else:
                log("not indexing unindexable object %r.\n" % uid)
            if obj is not None:
                obj._p_deactivate()
        log('processing "reindex" operations next...\n')
        op = notimeout(lambda obj: proc.reindex(obj))
        cat_mod_get = modified_index._unindex.get
        solr_mod_get = solr_results.get
        done = unindex.union(index)
        for uid, rid in cat_results.items():
            if uid in done:
                continue
            if isinstance(rid, IITreeSet):
                rid = list(rid.keys())[0]
            if cat_mod_get(rid) != solr_mod_get(uid):
                obj = lookup(uid, rid=rid)
                if ICheckIndexable(obj)():
                    op(obj)
                    processed += 1
                    next(cpi)
                else:
                    log("not reindexing unindexable object %r.\n" % uid)
                if obj is not None:
                    obj._p_deactivate()
        conn.commit()
        log("solr index synced.\n")
        msg = "processed %d object(s) in %s (%s cpu time)."
        msg = msg % (processed, next(real), next(cpu))
        log(msg)
        logger.info(msg)
Exemplo n.º 4
0
    def reindex(
        self,
        batch=1000,
        skip=0,
        limit=0,
        ignore_portal_types=None,
        only_portal_types=None,
        idxs=[],
        ignore_exceptions=False,
    ):
        """find all contentish objects (meaning all objects derived from one
        of the catalog mixin classes) and (re)indexes them"""

        if ignore_portal_types and only_portal_types:
            raise ValueError("It is not possible to combine "
                             "ignore_portal_types with only_portal_types")

        atomic = idxs != []
        manager = queryUtility(ISolrConnectionManager)
        proc = SolrIndexProcessor(manager)
        conn = manager.getConnection()
        zodb_conn = self.context._p_jar
        log = self.mklog()
        log("reindexing solr catalog...\n")
        if skip:
            log("skipping indexing of %d object(s)...\n" % skip)
        if limit:
            log("limiting indexing to %d object(s)...\n" % limit)
        real = timer()  # real time
        lap = timer()  # real lap time (for intermediate commits)
        cpu = timer(process_time)  # cpu time
        processed = 0
        schema = manager.getSchema()
        key = schema.uniqueKey
        updates = {}  # list to hold data to be updated

        def flush():
            return conn.commit(soft=True)

        flush = notimeout(flush)

        def checkPoint():
            for my_boost_values, data in updates.values():
                adder = data.pop("_solr_adder")
                try:
                    adder(conn, boost_values=my_boost_values, **data)
                except Exception as e:
                    logger.warning("Error %s @ %s", e, data["path_string"])
                    if not ignore_exceptions:
                        raise
            updates.clear()
            msg = ("intermediate commit (%d items processed, "
                   "last batch in %s)...\n" % (processed, next(lap)))
            log(msg)
            logger.info(msg)
            flush()
            zodb_conn.cacheGC()

        cpi = checkpointIterator(checkPoint, batch)
        count = 0

        if atomic:
            log("indexing only {0} \n".format(idxs))

        for path, obj in findObjects(self.context):
            if ICheckIndexable(obj)():
                if getOwnIndexMethod:
                    if getOwnIndexMethod(obj, "indexObject") is not None:
                        log("skipping indexing of %r via private method.\n" %
                            obj)
                        continue

                count += 1
                if count <= skip:
                    continue

                if ignore_portal_types:
                    if obj.portal_type in ignore_portal_types:
                        continue

                if only_portal_types:
                    if obj.portal_type not in only_portal_types:
                        continue

                attributes = None
                if atomic:
                    attributes = idxs

                # For atomic updates to work the uniqueKey must be present
                # in *every* update operation.
                if attributes and key not in attributes:
                    attributes.append(key)
                data, missing = proc.getData(obj, attributes=attributes)
                prepareData(data)

                if not missing or atomic:
                    value = data.get(key, None)
                    if value is not None:
                        log("indexing %r\n" % obj)

                        pt = data.get("portal_type", "default")
                        adder = queryAdapter(obj, ISolrAddHandler, name=pt)
                        if adder is None:
                            adder = DefaultAdder(obj)
                        data["_solr_adder"] = adder
                        updates[value] = (boost_values(obj, data), data)
                        processed += 1
                        next(cpi)
                else:
                    log("missing data, skipping indexing of %r.\n" % obj)
                if limit and count >= (skip + limit):
                    break

        checkPoint()
        conn.commit()
        log("solr index rebuilt.\n")
        msg = "processed %d items in %s (%s cpu time)."
        msg = msg % (processed, next(real), next(cpu))
        log(msg)
        logger.info(msg)
Exemplo n.º 5
0
class SolrMaintenanceView(BrowserView):
    """ helper view for indexing all portal content in Solr """
    def mklog(self, use_std_log=False):
        """ helper to prepend a time stamp to the output """
        write = self.request.RESPONSE.write

        def log(msg, timestamp=True):
            if timestamp:
                msg = strftime('%Y/%m/%d-%H:%M:%S ') + msg
            write(msg)
            if use_std_log:
                logger.info(msg)

        return log

    def optimize(self):
        """ optimize solr indexes """
        manager = queryUtility(ISolrConnectionManager)
        conn = manager.getConnection()
        conn.setTimeout(None)
        conn.commit(optimize=True)
        return 'solr indexes optimized.'

    def clear(self):
        """ clear all data from solr, i.e. delete all indexed objects """
        manager = queryUtility(ISolrConnectionManager)
        uniqueKey = manager.getSchema().uniqueKey
        conn = manager.getConnection()
        conn.setTimeout(None)
        conn.deleteByQuery('%s:[* TO *]' % uniqueKey)
        conn.commit()
        return 'solr index cleared.'

    def reindex(self,
                batch=1000,
                skip=0,
                limit=0,
                ignore_portal_types=None,
                only_portal_types=None,
                idxs=[],
                ignore_exceptions=False):
        """ find all contentish objects (meaning all objects derived from one
            of the catalog mixin classes) and (re)indexes them """

        if ignore_portal_types and only_portal_types:
            raise ValueError("It is not possible to combine "
                             "ignore_portal_types with only_portal_types")

        atomic = idxs != []
        manager = queryUtility(ISolrConnectionManager)
        proc = SolrIndexProcessor(manager)
        conn = manager.getConnection()
        zodb_conn = self.context._p_jar
        log = self.mklog()
        log('reindexing solr catalog...\n')
        if skip:
            log('skipping indexing of %d object(s)...\n' % skip)
        if limit:
            log('limiting indexing to %d object(s)...\n' % limit)
        real = timer()  # real time
        lap = timer()  # real lap time (for intermediate commits)
        cpu = timer(clock)  # cpu time
        processed = 0
        schema = manager.getSchema()
        key = schema.uniqueKey
        updates = {}  # list to hold data to be updated
        flush = lambda: conn.commit(soft=True)
        flush = notimeout(flush)

        def checkPoint():
            for my_boost_values, data in updates.values():
                adder = data.pop('_solr_adder')
                try:
                    adder(conn, boost_values=my_boost_values, **data)
                except Exception, e:
                    logger.warn('Error %s @ %s', e, data['path_string'])
                    if not ignore_exceptions:
                        raise
            updates.clear()
            msg = 'intermediate commit (%d items processed, ' \
                  'last batch in %s)...\n' % (processed, lap.next())
            log(msg)
            logger.info(msg)
            flush()
            zodb_conn.cacheGC()

        cpi = checkpointIterator(checkPoint, batch)
        count = 0

        if atomic:
            log('indexing only {0} \n'.format(idxs))

        for path, obj in findObjects(self.context):
            if ICheckIndexable(obj)():
                count += 1
                if count <= skip:
                    continue

                if ignore_portal_types:
                    if obj.portal_type in ignore_portal_types:
                        continue

                if only_portal_types:
                    if obj.portal_type not in only_portal_types:
                        continue

                attributes = None
                if atomic:
                    attributes = idxs

                # For atomic updates to work the uniqueKey must be present
                # in *every* update operation.
                if attributes and key not in attributes:
                    attributes.append(key)
                data, missing = proc.getData(obj, attributes=attributes)
                prepareData(data)

                if not missing or atomic:
                    value = data.get(key, None)
                    if value is not None:
                        log('indexing %r\n' % obj)

                        pt = data.get('portal_type', 'default')
                        adder = queryAdapter(obj, ISolrAddHandler, name=pt)
                        if adder is None:
                            adder = DefaultAdder(obj)
                        data['_solr_adder'] = adder
                        updates[value] = (boost_values(obj, data), data)
                        processed += 1
                        cpi.next()
                else:
                    log('missing data, skipping indexing of %r.\n' % obj)
                if limit and count >= (skip + limit):
                    break

        checkPoint()
        conn.commit()
        log('solr index rebuilt.\n')
        msg = 'processed %d items in %s (%s cpu time).'
        msg = msg % (processed, real.next(), cpu.next())
        log(msg)
        logger.info(msg)
Exemplo n.º 6
0
    def reindex(self, batch=1000, skip=0, limit=0, ignore_portal_types=None,
                only_portal_types=None):
        """ find all contentish objects (meaning all objects derived from one
            of the catalog mixin classes) and (re)indexes them """

        if ignore_portal_types and only_portal_types:
            raise ValueError("It is not possible to combine "
                             "ignore_portal_types with only_portal_types")

        manager = queryUtility(ISolrConnectionManager)
        proc = SolrIndexProcessor(manager)
        conn = manager.getConnection()
        zodb_conn = self.context._p_jar
        log = self.mklog()
        log('reindexing solr catalog...\n')
        if skip:
            log('skipping indexing of %d object(s)...\n' % skip)
        if limit:
            log('limiting indexing to %d object(s)...\n' % limit)
        real = timer()          # real time
        lap = timer()           # real lap time (for intermediate commits)
        cpu = timer(clock)      # cpu time
        processed = 0
        schema = manager.getSchema()
        key = schema.uniqueKey
        updates = {}            # list to hold data to be updated
        flush = lambda: conn.flush()
        flush = notimeout(flush)

        def checkPoint():
            for my_boost_values, data in updates.values():
                adder = data.pop('_solr_adder')
                adder(conn, boost_values=my_boost_values, **data)
            updates.clear()
            msg = 'intermediate commit (%d items processed, ' \
                  'last batch in %s)...\n' % (processed, lap.next())
            log(msg)
            logger.info(msg)
            flush()
            zodb_conn.cacheGC()
        cpi = checkpointIterator(checkPoint, batch)
        count = 0
        for path, obj in findObjects(self.context):
            if ICheckIndexable(obj)():
                count += 1
                if count <= skip:
                    continue

                if ignore_portal_types:
                    if obj.portal_type in ignore_portal_types:
                        continue

                if only_portal_types:
                    if obj.portal_type not in only_portal_types:
                        continue

                data, missing = proc.getData(obj)
                prepareData(data)
                if not missing:
                    value = data.get(key, None)
                    if value is not None:
                        log('indexing %r\n' % obj)
                        pt = data.get('portal_type', 'default')
                        adder = queryAdapter(obj, ISolrAddHandler, name=pt)
                        if adder is None:
                            adder = DefaultAdder(obj)
                        data['_solr_adder'] = adder
                        updates[value] = (boost_values(obj, data), data)
                        processed += 1
                        cpi.next()
                else:
                    log('missing data, skipping indexing of %r.\n' % obj)
                if limit and count >= (skip + limit):
                    break

        checkPoint()
        conn.commit()
        log('solr index rebuilt.\n')
        msg = 'processed %d items in %s (%s cpu time).'
        msg = msg % (processed, real.next(), cpu.next())
        log(msg)
        logger.info(msg)