Example #1
def index_items(documents, object_type, **kwargs):
    """
    Index items based on list of item ids

    Args:
        documents (iterable of dict): An iterable with ElasticSearch documents to index
        object_type (str): the ES object type
    """
    conn = get_conn()
    # bulk() can chunk an iterable itself, but we chunk here ourselves so that
    # the same (possibly one-shot) documents can be indexed into multiple aliases.
    for chunk in chunks(documents,
                        chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE):
        for alias in get_active_aliases(conn, [object_type]):
            _, errors = bulk(
                conn,
                chunk,
                index=alias,
                doc_type=GLOBAL_DOC_TYPE,
                chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
                **kwargs,
            )
            if len(errors) > 0:
                raise ReindexException(
                    f"Error during bulk {object_type} insert: {errors}")
Example #2
def populate_subscriptions_and_roles(self):
    """Populate channel roles and subscriptions for all users and channels"""
    results = celery.group([
        populate_user_subscriptions.si(ids) for ids in chunks(
            User.objects.exclude(
                username=settings.INDEXING_API_USERNAME).exclude(
                    profile__isnull=True).order_by("id").values_list(
                        "id", flat=True),
            chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
        )
    ] + [
        populate_user_roles.si(ids) for ids in chunks(
            Channel.objects.order_by("id").values_list("id", flat=True),
            chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
        )
    ])
    raise self.replace(results)
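
This fan-out shape recurs in most of the examples below, and two celery details carry it: .si() builds an immutable signature, so no upstream result gets prepended to the fixed id batch, and Task.replace() substitutes the dispatching task with the group, so anything waiting on this task waits on the batches instead. A minimal sketch, assuming a configured Celery app (the task names and the chunks import path are illustrative):

from celery import Celery, group

from open_discussions.utils import chunks  # assumption: where the helper lives

app = Celery("sketch")

@app.task
def process_batch(ids):
    return len(ids)

@app.task(bind=True)
def fan_out(self, all_ids):
    # .si() freezes the arguments; process_batch.si([1]).immutable is True
    sigs = [process_batch.si(ids) for ids in chunks(all_ids, chunk_size=100)]
    # replace() raises internally in recent celery versions; the explicit
    # `raise` makes the lack of fall-through obvious, as in the examples here
    raise self.replace(group(sigs))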
Example #3
def test_chunks():
    """
    test for chunks
    """
    input_list = list(range(113))
    output_list = []
    for nums in chunks(input_list):
        output_list += nums
    assert output_list == input_list

    output_list = []
    for nums in chunks(input_list, chunk_size=1):
        output_list += nums
    assert output_list == input_list

    output_list = []
    for nums in chunks(input_list, chunk_size=124):
        output_list += nums
    assert output_list == input_list
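
This test and the one in example #7 below pin down the helper's contract without showing it: a default chunk_size, lists yielded in input order, and support for arbitrary iterables. A minimal implementation consistent with those tests (the default of 20 and the keyword-only signature are assumptions):

import itertools

def chunks(iterable, *, chunk_size=20):
    """Yield successive lists of at most chunk_size items from iterable"""
    iterator = iter(iterable)
    while True:
        chunk = list(itertools.islice(iterator, chunk_size))
        if not chunk:
            return
        yield chunk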
Example #4
def populate_channel_fields(self):
    """
    Populates Channel fields from reddit for all channels
    """
    results = celery.group([
        populate_channel_fields_batch.si(ids) for ids in chunks(
            Channel.objects.values_list("id", flat=True),
            chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
        )
    ])
    raise self.replace(results)
Example #5
def populate_post_and_comment_fields(self):
    """
    Populates Post and Comment fields
    """
    results = celery.group([
        populate_post_and_comment_fields_batch.si(ids) for ids in chunks(
            Post.objects.order_by("id").values_list("id", flat=True),
            chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
        )
    ])
    raise self.replace(results)
Example #6
def send_unsent_email_notifications():
    """
    Send all notifications that haven't been sent yet
    """
    for notification_ids in chunks(
            EmailNotification.objects.filter(
                state=EmailNotification.STATE_PENDING).values_list("id",
                                                                   flat=True),
            chunk_size=settings.NOTIFICATION_SEND_CHUNK_SIZE,
    ):
        EmailNotification.objects.filter(id__in=notification_ids).update(
            state=EmailNotification.STATE_SENDING)
        tasks.send_email_notification_batch.delay(notification_ids)
Example #7
def test_chunks_iterable():
    """
    test that chunks works on non-list iterables too
    """
    count = 113
    input_range = range(count)
    chunk_output = []
    for chunk in chunks(input_range, chunk_size=10):
        chunk_output.append(chunk)
    assert len(chunk_output) == ceil(count / 10)

    range_list = []
    for chunk in chunk_output:
        range_list += chunk
    assert range_list == list(range(count))
Example #8
def _gen_attempt_send_notification_batches(notification_settings):
    """
    Generates the set of attempt_send_notification_batch tasks in a fan-out structure

    Args:
        notification_settings (iterable of NotificationSettings): an iterable of NotificationSettings to attempt the sends for

    Returns:
        celery.group: the celery group of tasks to execute
    """
    return celery.group([
        attempt_send_notification_batch.si(notification_settings_ids)
        for notification_settings_ids in chunks(
            notification_settings,
            chunk_size=settings.NOTIFICATION_ATTEMPT_CHUNK_SIZE,
        )
    ])
Example #9
def test_bulk_index_content_files(mocked_es, mocker, settings, errors,
                                  indexing_func_name, doc):  # pylint: disable=too-many-arguments
    """
    index functions for content files should call bulk with correct arguments
    """
    settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE = 3
    course = CourseFactory.create()
    run = LearningResourceRunFactory.create(content_object=course)
    content_files = ContentFileFactory.create_batch(5, run=run)
    mock_get_aliases = mocker.patch("search.indexing_api.get_active_aliases",
                                    autospec=True,
                                    return_value=["a", "b"])
    bulk_mock = mocker.patch("search.indexing_api.bulk",
                             autospec=True,
                             return_value=(0, errors))
    mocker.patch(
        "search.indexing_api.serialize_content_file_for_bulk",
        autospec=True,
        return_value=doc,
    )
    mocker.patch(
        "search.indexing_api.serialize_content_file_for_bulk_deletion",
        autospec=True,
        return_value=doc,
    )

    index_func = getattr(indexing_api, indexing_func_name)
    if errors:
        with pytest.raises(ReindexException):
            index_func(run.id)
    else:
        index_func(run.id)
        for alias in mock_get_aliases.return_value:
            for chunk in chunks(
                [doc for _ in content_files],
                    chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
            ):
                bulk_mock.assert_any_call(
                    mocked_es.conn,
                    chunk,
                    index=alias,
                    doc_type=GLOBAL_DOC_TYPE,
                    chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
                    routing=gen_course_id(course.platform, course.course_id),
                )
Example #10
def populate_all_posts_and_comments(self):
    """
    Backpopulate all posts and comments
    """
    reddit_api = get_admin_api().reddit

    # fetch and base36 decode the latest post id
    newest_post_id = base36.loads(next(reddit_api.front.new()).id)

    # create a celery chord by batching a backpopulate and merging results
    results = (celery.group(
        populate_posts_and_comments.si(post_ids) for post_ids in chunks(
            range(newest_post_id + 1),
            chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
        ))
               | populate_posts_and_comments_merge_results.s())

    raise self.replace(results)
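
The range(newest_post_id + 1) trick works because reddit post ids are sequential integers rendered in base36, so decoding the newest id gives an upper bound on every id that has ever existed. A quick illustration with the base36 package used above:

import base36

assert base36.loads("af") == 10 * 36 + 15  # "af" decodes to 375
assert base36.dumps(375) == "af"
# every historical post id therefore falls in range(newest_post_id + 1),
# and each integer can be re-encoded with base36.dumps() for the reddit API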
Example #11
def subscribe_all_users_to_channels(self, *, channel_names):
    """
    Subscribes all users to a set of channels

    Args:
        channel_names (list of str): the names of the channels to subscribe to
    """
    chunk_size = settings.OPEN_DISCUSSIONS_DEFAULT_CHANNEL_BACKPOPULATE_BATCH_SIZE
    query = (User.objects.exclude(username=settings.INDEXING_API_USERNAME).
             order_by("username").values_list("username",
                                              flat=True).iterator())

    results = celery.group([
        subscribe_user_range_to_channels.si(channel_names=channel_names,
                                            usernames=usernames)
        for usernames in chunks(query, chunk_size=chunk_size)
    ])

    raise self.replace(results)
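
The .iterator() call matters here: a plain values_list() queryset caches its whole result set on first iteration, while .iterator() streams rows from a server-side database cursor, so chunks() only ever materializes one batch of usernames at a time. A sketch of the difference, assuming the same User model and chunks helper as above:

# without .iterator(): the full username list is cached in queryset memory
cached = User.objects.values_list("username", flat=True)
# with .iterator(): rows stream from the cursor and chunks() pulls them
# lazily, chunk_size usernames at a time
streamed = User.objects.values_list("username", flat=True).iterator()
first_batch = next(chunks(streamed, chunk_size=100))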
Example #12
def test_index_functions(mocked_es, mocker, settings, errors,
                         indexing_func_name, serializing_func_name):  # pylint: disable=too-many-arguments
    """
    index functions should call bulk with correct arguments
    """
    settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE = 3
    documents = ["doc1", "doc2", "doc3", "doc4", "doc5"]
    mock_get_aliases = mocker.patch("search.indexing_api.get_active_aliases",
                                    autospec=True,
                                    return_value=["a", "b"])
    mocker.patch(
        f"search.indexing_api.{serializing_func_name}",
        autospec=True,
        return_value=(doc for doc in documents),
    )
    bulk_mock = mocker.patch("search.indexing_api.bulk",
                             autospec=True,
                             return_value=(0, errors))
    index_func = getattr(indexing_api, indexing_func_name)

    if errors:
        with pytest.raises(ReindexException):
            index_func([1, 2, 3])
    else:
        index_func([1, 2, 3])
        for alias in mock_get_aliases.return_value:
            for chunk in chunks(
                    documents,
                    chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE):
                bulk_mock.assert_any_call(
                    mocked_es.conn,
                    chunk,
                    index=alias,
                    doc_type=GLOBAL_DOC_TYPE,
                    chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
                )
Example #13
def start_recreate_index(self):
    """
    Wipe and recreate index and mapping, and index all items.
    """
    try:
        new_backing_indices = {
            obj_type: api.create_backing_index(obj_type)
            for obj_type in VALID_OBJECT_TYPES
        }

        # Do the indexing on the temp index
        log.info(
            "starting to index all posts, comments, profiles, and course catalog objects..."
        )

        blacklisted_ids = load_course_blacklist()

        index_tasks = celery.group([
            index_posts.si(post_ids) for post_ids in chunks(
                Post.objects.order_by("id").values_list("id", flat=True),
                chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
            )
        ] + [
            index_comments.si(comment_ids) for comment_ids in chunks(
                Comment.objects.order_by("id").values_list("id", flat=True),
                chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
            )
        ] + [
            index_profiles.si(ids) for ids in chunks(
                User.objects.exclude(
                    username=settings.INDEXING_API_USERNAME).exclude(
                        profile__isnull=True).filter(is_active=True).order_by(
                            "id").values_list("profile__id", flat=True),
                chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
            )
        ] + [
            index_courses.si(ids) for ids in chunks(
                Course.objects.filter(published=True).exclude(
                    course_id__in=blacklisted_ids).order_by("id").values_list(
                        "id", flat=True),
                chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
            )
        ] + [
            index_course_content_files.si(ids) for ids in chunks(
                Course.objects.filter(published=True).filter(platform__in=(
                    PlatformType.ocw.value, PlatformType.xpro.value)).exclude(
                        course_id__in=blacklisted_ids).order_by(
                            "id").values_list("id", flat=True),
                chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
            )
        ] + [
            index_bootcamps.si(ids) for ids in chunks(
                Bootcamp.objects.filter(published=True).order_by(
                    "id").values_list("id", flat=True),
                chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
            )
        ] + [
            index_programs.si(ids) for ids in chunks(
                Program.objects.filter(published=True).order_by(
                    "id").values_list("id", flat=True),
                chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
            )
        ] + [
            index_user_lists.si(ids) for ids in chunks(
                UserList.objects.order_by("id").exclude(
                    items=None).values_list("id", flat=True),
                chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
            )
        ] + [
            index_videos.si(ids) for ids in chunks(
                Video.objects.filter(published=True).order_by(
                    "id").values_list("id", flat=True),
                chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
            )
        ])

    except:  # pylint: disable=bare-except
        error = "start_recreate_index threw an error"
        log.exception(error)
        return error

    # Use self.replace so that code waiting on this task will also wait on the indexing and finish tasks
    raise self.replace(
        celery.chain(index_tasks,
                     finish_recreate_index.s(new_backing_indices)))
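
One detail in the final line: chaining a group into a single callback, whether with celery.chain() as here or with | as in example #10, is upgraded by celery to a chord, so finish_recreate_index runs only after every index_* batch completes, with the list of their results prepended to its own arguments. A standalone sketch (task bodies are placeholders):

from celery import Celery, chord, group

app = Celery("sketch")

@app.task
def index_batch(ids):
    return len(ids)

@app.task
def finish(results, backing_indices):
    # invoked as finish([...batch results...], backing_indices)
    return results, backing_indices

workflow = group(index_batch.si([1, 2]), index_batch.si([3])) | finish.s({"post": "idx"})
assert isinstance(workflow, chord)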