Example #1
    def handle(self, *args, **kwargs):
        doc_types = get_doc_types()

        limit = kwargs["limit"]
        if limit:
            doc_types = [dt for dt in doc_types if dt.__name__ in limit]

        progress_msg = "Indexed {progress} out of {count}"

        for dt in doc_types:
            self.stdout.write("Reindexing: {}".format(dt.__name__))

            model = dt.get_model()

            before = kwargs["updated_before"]
            after = kwargs["updated_after"]
            if before or after:
                try:
                    qs = model.objects_range(before=before, after=after)
                except NotImplementedError:
                    print(
                        f"{model} hasn't implemented an `updated_column_name` property. "
                        "No documents of this type will be indexed.")
                    continue
            else:
                qs = model._default_manager.all()

            total = qs.count()
            count = kwargs["count"]

            percentage = kwargs["percentage"]
            if count:
                print("Indexing {} documents out of {}".format(count, total))
            else:
                if percentage < 100:
                    count = int(total * percentage / 100)
                    qs = qs[:count]
                else:
                    count = total
                print("Indexing {}%, so {} documents out of {}".format(
                    percentage, count, total))

            id_list = list(qs.values_list("pk", flat=True))
            bulk_count = kwargs["bulk_count"]

            for x in range(ceil(count / bulk_count)):
                start = x * bulk_count
                end = start + bulk_count
                index_objects_bulk.delay(
                    dt.__name__,
                    id_list[start:end],
                    timeout=kwargs["timeout"],
                )
                if kwargs["print_sql_count"]:
                    print("{} SQL queries executed".format(
                        len(connection.queries)))
                    reset_queries()
                print(
                    progress_msg.format(progress=min(end, count), count=count))
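
The loop above walks id_list in windows of bulk_count ids, handing each window to a celery task. A minimal standalone sketch of the same slicing arithmetic (iter_chunks is a name invented for illustration):

from math import ceil

def iter_chunks(id_list, bulk_count):
    # Yield consecutive slices of at most `bulk_count` ids, mirroring the
    # start/end arithmetic of the loop above.
    for x in range(ceil(len(id_list) / bulk_count)):
        yield id_list[x * bulk_count:(x + 1) * bulk_count]

# e.g. list(iter_chunks([1, 2, 3, 4, 5], 2)) -> [[1, 2], [3, 4], [5]]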
Example #2
def handle_question_vote_delete(instance, **kwargs):
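    # Reindex the vote's question document, then queue a bulk reindex of every
    # answer document belonging to that question.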
    index_object.delay("QuestionDocument", instance.question_id)
    index_objects_bulk.delay(
        "AnswerDocument",
        list(instance.question.answers.values_list("pk", flat=True)))
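
Handlers like this are typically registered as Django signal receivers. A hedged sketch of the wiring, assuming a QuestionVote sender model (the sender name is not shown in the example and is an assumption):

from django.db.models.signals import post_delete

# Hypothetical registration; `QuestionVote` is an assumed sender model.
post_delete.connect(handle_question_vote_delete, sender=QuestionVote)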
Example #3
def handle_question_save(instance, **kwargs):
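    # Reindex the saved question's document and all of its answers' documents.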
    if not isinstance(instance, Question):
        return
    index_object.delay("QuestionDocument", instance.pk)
    index_objects_bulk.delay(
        "AnswerDocument", list(instance.answers.values_list("pk", flat=True)))
Example #4
    def handle(self, **options):
        # Set up logging so it doesn't send Ricky email.
        logging.basicConfig(level=logging.ERROR)

        # Get a list of ids of questions we're going to go change. We need
        # a list of ids so that we can feed it to the update, but then
        # also know what we need to update in the index.
        days_180 = datetime.now() - timedelta(days=180)
        q_ids = list(
            Question.objects.filter(is_archived=False).filter(
                created__lte=days_180).values_list("id", flat=True))

        if q_ids:
            log.info("Updating %d questions", len(q_ids))

            sql = """
                UPDATE questions_question
                SET is_archived = 1
                WHERE id IN (%s)
                """ % ",".join(map(str, q_ids))

            cursor = connection.cursor()
            cursor.execute(sql)
            if not transaction.get_connection().in_atomic_block:
                transaction.commit()

            if settings.ES_LIVE_INDEXING:
                # elastic v7 code:
                answer_ids = list(
                    Answer.objects.filter(question_id__in=q_ids).values_list(
                        "id", flat=True))
                index_objects_bulk.delay("QuestionDocument", q_ids)
                index_objects_bulk.delay("AnswerDocument", answer_ids)

                # elastic v2 code:
                try:
                    # So... the first time this runs, it'll handle 160K
                    # questions or so, which stresses everything. Thus we
                    # do it in chunks because otherwise this won't work.
                    #
                    # After we've done this for the first time, we can nix
                    # the chunking code.

                    from kitsune.search.utils import chunked

                    for chunk in chunked(q_ids, 100):

                        # Fetch all the documents we need to update.
                        es_docs = get_documents(QuestionMappingType, chunk)

                        log.info("Updating %d index documents", len(es_docs))

                        documents = []

                        # For each document, update the data and stick it
                        # back in the index.
                        for doc in es_docs:
                            doc["question_is_archived"] = True
                            doc["indexed_on"] = int(time.time())
                            documents.append(doc)

                        QuestionMappingType.bulk_index(documents)

                except ES_EXCEPTIONS:
                    # Something happened with ES, so let's push index
                    # updating into an index_task which retries when it
                    # fails because of ES issues.
                    index_task.delay(to_class_path(QuestionMappingType), q_ids)
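
chunked is imported from kitsune.search.utils but its body isn't shown here. A minimal sketch of what such a helper conventionally does, assuming it simply yields fixed-size slices:

def chunked(iterable, chunk_size):
    # Yield successive lists of at most `chunk_size` items, so the caller can
    # index ~160K questions in small batches instead of one giant request.
    items = list(iterable)
    for start in range(0, len(items), chunk_size):
        yield items[start:start + chunk_size]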
Example #5
def handle_forum_thread_save(instance, **kwargs):
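    # Reindex the ForumDocument of every post in the saved thread.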
    index_objects_bulk.delay(
        "ForumDocument", list(instance.post_set.values_list("pk", flat=True)))
Example #6
    def handle(self, *args, **kwargs):
        doc_types = get_doc_types()

        limit = kwargs["limit"]
        if limit:
            doc_types = [dt for dt in doc_types if dt.__name__ in limit]

        progress_msg = "Indexed {progress} out of {count}"

        for dt in doc_types:
            self.stdout.write("Reindexing: {}".format(dt.__name__))

            model = dt.get_model()

            before = kwargs["updated_before"]
            after = kwargs["updated_after"]
            if before or after:
                try:
                    qs = model.objects_range(before=before, after=after)
                except NotImplementedError:
                    print(
                        f"{model} hasn't implemented an `updated_column_name` property. "
                        "No documents of this type will be indexed.")
                    continue
            else:
                qs = model._default_manager.all()

            total = qs.count()
            count = kwargs["count"]

            percentage = kwargs["percentage"]
            if count:
                print("Indexing {} documents out of {}".format(count, total))
            else:
                if percentage < 100:
                    count = int(total * percentage / 100)
                    qs = qs[:count]
                else:
                    count = total
                print("Indexing {}%, so {} documents out of {}".format(
                    percentage, count, total))

            id_list = list(qs.values_list("pk", flat=True))
            sql_chunk_size = kwargs["sql_chunk_size"]

            # Slice the list of ids into chunks of `sql_chunk_size` and send a task to
            # celery to process each chunk. We do this to avoid OOMing celery workers
            # when processing tens of thousands of documents.
            for x in range(ceil(count / sql_chunk_size)):
                start = x * sql_chunk_size
                end = start + sql_chunk_size
                index_objects_bulk.delay(
                    dt.__name__,
                    id_list[start:end],
                    timeout=kwargs["timeout"],
                    # elastic_chunk_size determines how many documents get sent to
                    # elastic in each bulk request; the limiting factor here is the
                    # performance of our elastic cluster.
                    elastic_chunk_size=kwargs["elastic_chunk_size"],
                )
                if kwargs["print_sql_count"]:
                    print("{} SQL queries executed".format(
                        len(connection.queries)))
                    reset_queries()
                print(
                    progress_msg.format(progress=min(end, count), count=count))
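
The two chunk sizes operate at different levels: sql_chunk_size bounds how many ids each celery task receives, while elastic_chunk_size bounds how many documents go into each bulk request against the cluster. A hedged sketch of how the task side might honor elastic_chunk_size, using elasticsearch.helpers.streaming_bulk (the task body is an assumption; only the two-level chunking comes from the example):

from elasticsearch.helpers import streaming_bulk

def index_actions(client, actions, elastic_chunk_size):
    # streaming_bulk splits `actions` into bulk requests of at most
    # `chunk_size` documents each, so a single celery task can issue
    # several appropriately sized requests against the cluster.
    for ok, item in streaming_bulk(
            client, actions, chunk_size=elastic_chunk_size, raise_on_error=False):
        if not ok:
            print("failed to index:", item)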