示例#1
0
    def index_all(cls, percent=100):
        """Reindex all the indexable objects for this model.

        Yields the number of documents done so far (0-based counter).

        Note: This can get run from the command line, so we log stuff
        to let the user know what's going on.

        :arg percent: The percentage of indexable objects to index.
            Defaults to 100--e.g. all of them.

        """
        es = es_utils.get_indexing_es()

        doc_type = cls._meta.db_table
        index = settings.ES_WRITE_INDEXES['default']

        start_time = time.time()

        indexable_qs = cls.get_indexable()

        log.info('reindex %s into %s index', doc_type, index)

        log.info('iterating through %s....', doc_type)
        total = indexable_qs.count()
        to_index = int(total * (percent / 100.0))
        log.info('total %s: %s (to be indexed: %s)', doc_type, total, to_index)
        if to_index == 0:
            # Nothing to do--bail before we divide by zero in the
            # per-1000 timing math below.
            log.info('done!')
            return

        total = to_index

        for t, obj_id in enumerate(indexable_qs):
            # t is 0-based, so stop as soon as we've indexed `total`
            # documents. (Using `>` here would index one document too
            # many whenever percent < 100.)
            if t >= total:
                break

            if t % 1000 == 0 and t > 0:
                time_to_go = (total - t) * ((time.time() - start_time) / t)
                per_1000 = (time.time() - start_time) / (t / 1000.0)
                log.info('%s/%s... (%s to go, %s per 1000 docs)', t, total,
                         es_utils.format_time(time_to_go),
                         es_utils.format_time(per_1000))

                # We call this every 1000 or so because we're
                # essentially loading the whole db and if DEBUG=True,
                # then Django saves every sql statement which causes
                # our memory to go up up up. So we reset it and that
                # makes things happier even in DEBUG environments.
                reset_queries()

            if t % settings.ES_FLUSH_BULK_EVERY == 0:
                # We built the ES with this setting, but it doesn't
                # actually do anything with it unless we call
                # flush_bulk which causes it to check its bulk_size
                # and flush it if it's too big.
                es.flush_bulk()

            try:
                cls.index(cls.extract_document(obj_id), bulk=True, es=es)
            except Exception:
                # Best-effort: log the failure and keep indexing the
                # rest rather than aborting the whole reindex.
                log.exception('Unable to extract/index document (id: %d)',
                              obj_id)

            yield t

        # Force out anything still sitting in the bulk buffer.
        es.flush_bulk(forced=True)
        delta_time = time.time() - start_time
        log.info('done! (%s, %s per 1000 docs)',
                 es_utils.format_time(delta_time),
                 es_utils.format_time(delta_time / (total / 1000.0)))
        es.refresh()
示例#2
0
    def index_all(cls, percent=100):
        """Reindex all the objects for this model.

        Yields the number of documents done so far (0-based counter).

        Note: This can get run from the command line, so we log stuff
        to let the user know what's going on.

        :arg percent: The percentage of objects to index. Defaults to
            100--e.g. all of them.

        """
        es = es_utils.get_es()

        doc_type = cls._meta.db_table
        index = cls.get_es_index()

        if index != settings.ES_INDEXES.get('default'):
            # If this doctype isn't using the default index, then this
            # doctype is responsible for deleting and re-creating the
            # index.
            es.delete_index_if_exists(index)
            es.create_index(index)

        start_time = time.time()

        log.info('reindex %s into %s index', doc_type, index)

        log.info('setting up mapping....')
        mapping = cls.get_mapping()
        es.put_mapping(doc_type, mapping, index)

        log.info('iterating through %s....', doc_type)
        total = cls.objects.count()
        to_index = int(total * (percent / 100.0))
        log.info('total %s: %s (to be indexed: %s)', doc_type, total, to_index)
        if to_index == 0:
            # Nothing to index--without this guard a small `percent`
            # that rounds down to 0 would still index one document.
            log.info('done!')
            return

        total = to_index

        # Some models have a gazillion instances. So we want to go
        # through them one at a time in a way that doesn't pull all
        # the data into memory all at once. So we iterate through ids
        # and pull the objects one at a time.
        qs = cls.objects.order_by('id').values_list('id', flat=True)

        for t, obj_id in enumerate(qs.iterator()):
            # t is 0-based, so stop as soon as we've indexed `total`
            # documents. (Using `>` here would index one document too
            # many whenever percent < 100.)
            if t >= total:
                break

            obj = cls.objects.get(pk=obj_id)

            if t % 1000 == 0 and t > 0:
                time_to_go = (total - t) * ((time.time() - start_time) / t)
                log.info('%s/%s... (%s to go)', t, total,
                         es_utils.format_time(time_to_go))

                # We call this every 1000 or so because we're
                # essentially loading the whole db and if DEBUG=True,
                # then Django saves every sql statement which causes
                # our memory to go up up up. So we reset it and that
                # makes things happier even in DEBUG environments.
                reset_queries()

            if t % settings.ES_FLUSH_BULK_EVERY == 0:
                # We built the ES with this setting, but it doesn't
                # actually do anything with it unless we call
                # flush_bulk which causes it to check its bulk_size
                # and flush it if it's too big.
                es.flush_bulk()

            try:
                cls.index(obj.extract_document(), bulk=True, es=es)
            except Exception:
                # Best-effort: log the failure and keep indexing the
                # rest rather than aborting the whole reindex.
                log.exception('Unable to extract/index document (id: %d)',
                              obj.id)

            yield t

        # Force out anything still sitting in the bulk buffer.
        es.flush_bulk(forced=True)
        end_time = time.time()
        log.info('done! (%s)', es_utils.format_time(end_time - start_time))
        es.refresh()