Пример #1
0
    def handle(self, *args, **options):
        """
        Reindex course data in Algolia, one celery task per batch of content keys.

        The Algolia index is initialized and configured first. The indexable course
        content_keys are then split into batches of ``TASK_BATCH_SIZE`` and each batch is
        handed to ``index_enterprise_catalog_courses_in_algolia_task``. The command waits
        for each task's result before dispatching the next batch.
        """
        # Initialize and configure the Algolia index; the returned client is not needed here.
        get_initialized_algolia_client()

        # Only ContentMetadata records with a content type of "course" are candidates.
        course_keys = get_indexable_course_keys(content_metadata_with_type_course())

        spawn_message = (
            'Spinning off task index_enterprise_catalog_courses_in_algolia_task (%s) from'
            ' the reindex_algolia command to reindex %d courses in Algolia.'
        )
        success_message = (
            'index_enterprise_catalog_courses_in_algolia_task (%s) from command reindex_algolia finished'
            ' successfully.'
        )

        for keys_chunk in batch(course_keys, batch_size=TASK_BATCH_SIZE):
            task_result = index_enterprise_catalog_courses_in_algolia_task.delay(
                content_keys=keys_chunk,
                algolia_fields=ALGOLIA_FIELDS,
            )
            logger.info(spawn_message, task_result.task_id, len(keys_chunk))

            # Wait (up to TASK_TIMEOUT) for the task to finish. With propagate=True an
            # exception raised inside the task is re-raised here. See
            # https://docs.celeryproject.org/en/stable/reference/celery.result.html#celery.result.AsyncResult.get
            task_result.get(timeout=TASK_TIMEOUT, propagate=True)
            if not task_result.successful():
                continue
            logger.info(success_message, task_result.task_id)
Пример #2
0
def _batched_metadata(json_metadata, sorted_uuids, uuid_key_name, obj_id_fmt, uuid_batch_size=None):
    """
    Build one deep copy of ``json_metadata`` per batch of uuids.

    Each copy gets its own ``objectID`` (derived from the base ``objectID`` and the
    batch index) and carries one batch of uuids under ``uuid_key_name``.

    Arguments:
        json_metadata (dict): Base metadata containing an ``objectID`` key. It is not mutated.
        sorted_uuids: The uuids to spread across the generated records.
        uuid_key_name (str): Key under which each batch of uuids is stored.
        obj_id_fmt (str): ``str.format`` template that receives the base ``objectID``
            and the batch index, in that order.
        uuid_batch_size (int): Value passed to ``batch`` as ``batch_size``. Falls back to
            ``ALGOLIA_UUID_BATCH_SIZE`` when omitted or None, so existing four-argument
            callers keep their behavior.

    Returns:
        list of dict: One record per batch; empty when ``sorted_uuids`` yields no batches.
    """
    if uuid_batch_size is None:
        uuid_batch_size = ALGOLIA_UUID_BATCH_SIZE

    batched_metadata = []
    uuid_batches = batch(sorted_uuids, batch_size=uuid_batch_size)
    for batch_index, uuid_batch in enumerate(uuid_batches):
        # Deep copy so that records never share nested structures with each other
        # or with the caller's json_metadata.
        json_metadata_with_uuids = copy.deepcopy(json_metadata)
        json_metadata_with_uuids.update({
            'objectID': obj_id_fmt.format(json_metadata['objectID'], batch_index),
            uuid_key_name: uuid_batch,
        })
        batched_metadata.append(json_metadata_with_uuids)
    return batched_metadata
Пример #3
0
def associate_content_metadata_with_query(metadata, catalog_query):
    """
    Create or update a ContentMetadata record for every entry of `metadata`, then
    make those records the full set of content associated with `catalog_query`.

    Arguments:
        metadata (list): List of content metadata dictionaries.
        catalog_query (CatalogQuery): CatalogQuery object

    Returns:
        list: The list of content_keys for the metadata associated with the query.
    """
    associated_records = []
    for metadata_chunk in batch(metadata, batch_size=100):
        chunk_keys = [get_content_key(item) for item in metadata_chunk]

        # Look up the already-persisted records for this chunk with a single query.
        records_by_key = {}
        for record in ContentMetadata.objects.filter(content_key__in=chunk_keys):
            records_by_key[record.content_key] = record

        defaults_to_update, defaults_to_create = _partition_content_metadata_defaults(
            metadata_chunk, records_by_key)

        # Existing records are updated first, then the missing ones are created.
        associated_records.extend(
            _update_existing_content_metadata(defaults_to_update, records_by_key))
        associated_records.extend(
            _create_new_content_metadata(defaults_to_create))

    # `clear=True` drops every prior relationship between the CatalogQuery and its
    # ContentMetadata objects before the new relationships are set.
    # https://docs.djangoproject.com/en/2.2/ref/models/relations/#django.db.models.fields.related.RelatedManager.set
    catalog_query.contentmetadata_set.set(associated_records, clear=True)
    return [record.content_key for record in associated_records]
Пример #4
0
def _fetch_courses_by_keys(course_keys):
    """
    Retrieve course data for the given course keys from discovery's /api/v1/courses endpoint.

    Args:
        course_keys (list of str): Content keys for Course ContentMetadata objects.
    Returns:
        list of dict: One dictionary per course, holding the course data returned by discovery.
    """
    client = DiscoveryApiClient()
    fetched_courses = []

    # Request the keys in chunks so that no single request to discovery gets too big.
    for keys_chunk in batch(course_keys, batch_size=DISCOVERY_COURSE_KEY_BATCH_SIZE):
        # Discovery expects the keys param to be in the format ?keys=course1,course2,...
        joined_keys = ','.join(keys_chunk)
        fetched_courses += client.get_courses(query_params={'keys': joined_keys})

    return fetched_courses
Пример #5
0
def _fetch_courses_by_keys(course_keys):
    """
    Fetches course data for the provided course keys, preferring recently updated local data.

    A key whose ContentMetadata record was modified less than
    ``settings.DISCOVERY_COURSE_DATA_CACHE_TIMEOUT`` seconds ago is served from the
    record's ``json_metadata``. Every other key that has a record is requested from
    discovery's /api/v1/courses endpoint. Keys without a ContentMetadata record are skipped.

    Args:
        course_keys (list of str): Content keys for Course ContentMetadata objects.
    Returns:
        list of dict: Returns a list of dictionaries where each dictionary represents the course data,
            either taken from the local record or from discovery.
    """
    courses = []
    course_keys_to_fetch = []
    discovery_client = DiscoveryApiClient()
    cache_window = timedelta(seconds=settings.DISCOVERY_COURSE_DATA_CACHE_TIMEOUT)
    # Use a single reference time so every key is judged against the same instant.
    now = timezone.now()

    # Populate a new list of course keys that haven't been updated recently to request from the Discovery API.
    for key in course_keys:
        content_metadata = ContentMetadata.objects.filter(content_key=key)
        if not content_metadata:
            continue
        record = content_metadata[0]
        # A record is "recent" when its age is SHORTER than the cache window. The previous
        # `>` comparison was inverted: it reused stale records and re-fetched fresh ones.
        if now - record.modified < cache_window:
            courses.append(record.json_metadata)
            logger.info(
                'ContentMetadata with key %s has recently been updated and will not be requested from Discovery API',
                key,
            )
        else:
            course_keys_to_fetch.append(key)

    # Batch the course keys into smaller chunks so that we don't send too big of a request to discovery
    batched_course_keys = batch(course_keys_to_fetch,
                                batch_size=DISCOVERY_COURSE_KEY_BATCH_SIZE)
    for course_keys_chunk in batched_course_keys:
        # Discovery expects the keys param to be in the format ?keys=course1,course2,...
        query_params = {'keys': ','.join(course_keys_chunk)}
        courses.extend(discovery_client.get_courses(query_params=query_params))

    return courses
Пример #6
0
def index_content_keys_in_algolia(content_keys, algolia_client):
    """
    Build the Algolia objects for the given content keys and replace the whole
    index content with them in a single ``replace_all_objects`` call.

    Arguments:
        content_keys (list): List of indexable content_key strings.
        algolia_client: Instance of an Algolia API client
    """
    logger.info(
        'There are {} total content keys to include in the Algolia index.'.format(len(content_keys))
    )
    algolia_course_records = []
    for keys_chunk in batch(content_keys, batch_size=TASK_BATCH_SIZE):
        catalogs_by_course = defaultdict(set)
        queries_by_course = defaultdict(set)
        customers_by_course = defaultdict(set)

        # Match on content_key OR parent_content_key, so the result holds both the
        # courses and their course runs.
        chunk_filter = Q(content_key__in=keys_chunk) | Q(parent_content_key__in=keys_chunk)
        prefetched_queries = CatalogQuery.objects.prefetch_related('enterprise_catalogs')
        chunk_records = ContentMetadata.objects.filter(chunk_filter).prefetch_related(
            Prefetch('catalog_queries', queryset=prefetched_queries),
        )

        # Pass 1: collect the catalog, customer and catalog query uuids of every record
        # (course or course run), keyed by the content_key of the course it belongs to.
        # A `COURSE` is keyed by its own content_key, anything else by parent_content_key.
        for record in chunk_records:
            if record.content_type == COURSE:
                course_key = record.content_key
            else:
                course_key = record.parent_content_key

            linked_queries = list(record.catalog_queries.all())
            linked_catalogs = [
                catalog
                for catalog_query in linked_queries
                for catalog in catalog_query.enterprise_catalogs.all()
            ]

            # Merge into whatever was already gathered for this course.
            catalogs_by_course[course_key].update(
                str(catalog.uuid) for catalog in linked_catalogs)
            customers_by_course[course_key].update(
                str(catalog.enterprise_uuid) for catalog in linked_catalogs)
            queries_by_course[course_key].update(
                str(catalog_query.uuid) for catalog_query in linked_queries)

        # Pass 2: for the courses only, emit duplicated course records: one series per
        # uuid kind (catalog, customer, catalog query). Within a series the uuids are
        # batched by `_batched_metadata` to keep the Algolia object payloads small.
        for course_record in chunk_records.filter(content_type=COURSE):
            course_key = course_record.content_key
            if _was_recently_indexed(course_key):
                continue

            base_metadata = copy.deepcopy(course_record.json_metadata)
            base_metadata['objectID'] = get_algolia_object_id(base_metadata.get('uuid'))

            # enterprise catalog uuids
            algolia_course_records.extend(_batched_metadata(
                base_metadata,
                sorted(catalogs_by_course[course_key]),
                'enterprise_catalog_uuids',
                '{}-catalog-uuids-{}',
            ))

            # enterprise customer uuids
            algolia_course_records.extend(_batched_metadata(
                base_metadata,
                sorted(customers_by_course[course_key]),
                'enterprise_customer_uuids',
                '{}-customer-uuids-{}',
            ))
            _mark_recently_indexed(course_key)

            # enterprise catalog query uuids
            algolia_course_records.extend(_batched_metadata(
                base_metadata,
                sorted(queries_by_course[course_key]),
                'enterprise_catalog_query_uuids',
                '{}-catalog-query-uuids-{}',
            ))

    # Keep only the fields we care about and send everything to the Algolia index.
    algolia_client.replace_all_objects(
        create_algolia_objects_from_courses(algolia_course_records, ALGOLIA_FIELDS))
Пример #7
0
def _update_full_content_metadata(content_keys):
    """
    Given content_keys, finds the associated ContentMetadata records with a type of course and looks up the full
    course metadata from discovery's /api/v1/courses endpoint to pad the ContentMetadata objects with. The course
    metadata is merged with the existing contents of the json_metadata field for each ContentMetadata record.

    Args:
        content_keys (list of str): A list of content keys representing ContentMetadata objects that should have their
            metadata updated with the full Course metadata. This list gets filtered down to only those representing
            Course ContentMetadata objects.

    Returns:
        list of str: Returns the course keys that were updated and should be indexed in Algolia
            by the B2C logic, accumulated across all batches. This is passed to the
            `index_enterprise_catalog_courses_in_algolia_task` from
            the `EnterpriseCatalogRefreshDataFromDiscovery` view.
    """
    indexable_course_keys = []
    for content_keys_batch in batch(content_keys, batch_size=TASK_BATCH_SIZE):
        full_course_dicts = _fetch_courses_by_keys(content_keys_batch)
        if not full_course_dicts:
            logger.info(
                'No courses were retrieved from course-discovery in this batch.'
            )
            continue

        # Build a dictionary of the metadata that corresponds to the fetched keys to avoid a query for every course
        fetched_course_keys = [course['key'] for course in full_course_dicts]
        metadata_records_for_fetched_keys = ContentMetadata.objects.filter(
            content_key__in=fetched_course_keys, )
        metadata_by_key = {
            metadata.content_key: metadata
            for metadata in metadata_records_for_fetched_keys
        }

        # Iterate through the courses to update the json_metadata field,
        # merging the minimal json_metadata retrieved by
        # `/search/all/` with the full json_metadata retrieved by `/courses/`.
        modified_content_metadata_records = []
        for course_metadata_dict in full_course_dicts:
            content_key = course_metadata_dict.get('key')
            metadata_record = metadata_by_key.get(content_key)
            if not metadata_record:
                logger.error(
                    'Could not find ContentMetadata record for content_key %s.',
                    content_key)
                continue

            metadata_record.json_metadata.update(course_metadata_dict)
            modified_content_metadata_records.append(metadata_record)

        ContentMetadata.objects.bulk_update(
            modified_content_metadata_records,
            ['json_metadata'],
            batch_size=10,
        )

        logger.info(
            'Successfully updated %d of %d ContentMetadata records with full metadata from course-discovery.',
            len(modified_content_metadata_records),
            len(full_course_dicts),
        )

        # Record the course keys that were updated and should be indexed in Algolia by the B2C logic.
        # The per-batch result gets its own name: reusing `indexable_course_keys` here would discard
        # the keys gathered from earlier batches and then append the batch's keys to themselves.
        batch_indexable_course_keys, __ = partition_course_keys_for_indexing(
            modified_content_metadata_records)
        indexable_course_keys.extend(batch_indexable_course_keys)

    logger.info(
        '{} total course keys were updated and are ready for indexing in Algolia'
        .format(len(indexable_course_keys)))
    return indexable_course_keys
Пример #8
0
def update_full_content_metadata_task(content_keys):
    """
    Given content_keys, finds the associated ContentMetadata records with a type of course and looks up the full
    course metadata from discovery's /api/v1/courses endpoint to pad the ContentMetadata objects with. The course
    metadata is merged with the existing contents of the json_metadata field for each ContentMetadata record.

    Note: This task increases the maximum ``soft_time_limit`` and ``time_limit`` options since the task traverses large
    portions of course-discovery's /courses/ endpoint, which was previously exceeding the default
    ``CELERY_TASK_SOFT_TIME_LIMIT`` and ``CELERY_TASK_TIME_LIMIT``, causing a SoftTimeLimitExceeded exception.

    Args:
        content_keys (list of str): A list of content keys representing ContentMetadata objects that should have their
            metadata updated with the full Course metadata. This list gets filtered down to only those representing
            Course ContentMetadata objects.

    Returns:
        list of str: Returns the course keys that were updated and should be indexed in Algolia
            by the B2C logic. This is passed to the `index_enterprise_catalog_courses_in_algolia_task` from
            the `EnterpriseCatalogRefreshDataFromDiscovery` view.
    """
    indexable_course_keys = []
    for content_keys_batch in batch(content_keys, batch_size=TASK_BATCH_SIZE):
        course_keys_for_updating = _get_course_keys_for_updating(
            content_keys_batch)

        courses = _fetch_courses_by_keys(course_keys_for_updating)
        if not courses:
            logger.info(
                'No courses were retrieved from course-discovery in this batch.'
            )
            continue
        logger.info(
            'Retrieved %d courses from course-discovery in this batch.',
            len(courses))

        # Iterate through the courses to update the json_metadata field, merging the minimal json_metadata retrieved by
        # /search/all/ with the full json_metadata retrieved by /courses/.
        fetched_course_keys = [course['key'] for course in courses]
        metadata_for_fetched_keys = ContentMetadata.objects.filter(
            content_key__in=fetched_course_keys)
        # Build a dictionary of the metadata that corresponds to the fetched keys to avoid a query for every course
        metadata_by_key = {
            metadata.content_key: metadata
            for metadata in metadata_for_fetched_keys
        }
        updated_metadata = []
        for course_metadata in courses:
            content_key = course_metadata.get('key')
            metadata_record = metadata_by_key.get(content_key)
            # Check the looked-up record, not the whole lookup dict: testing `metadata_by_key`
            # let a missing record through whenever any other record existed, which then
            # failed on `None.json_metadata` below.
            if not metadata_record:
                logger.error(
                    'Could not find ContentMetadata record for content_key %s.',
                    content_key)
                continue

            # merge the original json_metadata with the full course_metadata to ensure
            # we're not removing any critical fields, e.g. "aggregation_key".
            json_metadata = metadata_record.json_metadata.copy()
            json_metadata.update(course_metadata)
            metadata_record.json_metadata = json_metadata
            updated_metadata.append(metadata_record)
        ContentMetadata.objects.bulk_update(updated_metadata,
                                            ['json_metadata'],
                                            batch_size=10)

        logger.info(
            'Successfully updated %d of %d ContentMetadata records with full metadata from course-discovery.',
            len(updated_metadata),
            len(courses),
        )

        # record the course keys that were updated and should be indexed in Algolia by the B2C logic
        indexable_course_keys.extend(
            get_indexable_course_keys(updated_metadata))

    logger.info(
        '{} total course keys were updated and are ready for indexing in Algolia'
        .format(len(indexable_course_keys)))
    return indexable_course_keys
Пример #9
0
def index_enterprise_catalog_courses_in_algolia_task(
    content_keys,
    algolia_fields,
    uuid_batch_size=ALGOLIA_UUID_BATCH_SIZE,
):
    """
    Index course data, enriched with enterprise-related uuids, in Algolia.

    Arguments:
        content_keys (list): A list of content_keys.  It's important that this is the first positional argument,
            so that the passing of return values to the signature of the next chained celery task
            works as expected.
        algolia_fields (list): A list of course fields we want to index in Algolia
        uuid_batch_size (int): The threshold of distinct catalog/customer UUIDs associated with a piece of content,
            at which duplicate course records are created in the index,
            batching the uuids (flattened records) to reduce the payload size of the Algolia objects.
            Defaults to ``ALGOLIA_UUID_BATCH_SIZE``.
    """
    algolia_client = get_initialized_algolia_client()

    if not (algolia_fields and content_keys):
        logger.error(
            'Must provide algolia_fields and content_keys as arguments.')
        return

    # The index is updated one batch of content keys at a time.
    for keys_chunk in batch(content_keys, batch_size=TASK_BATCH_SIZE):
        chunk_courses = []
        catalogs_by_course = defaultdict(set)
        customers_by_course = defaultdict(set)

        # Match on content_key OR parent_content_key, so the result holds both the
        # courses and their course runs.
        chunk_filter = Q(content_key__in=keys_chunk) | Q(parent_content_key__in=keys_chunk)
        prefetched_queries = CatalogQuery.objects.prefetch_related('enterprise_catalogs')
        chunk_records = ContentMetadata.objects.filter(chunk_filter).prefetch_related(
            Prefetch('catalog_queries', queryset=prefetched_queries),
        )

        # Pass 1: collect the catalog and customer uuids of every record (course or
        # course run), keyed by the content_key of the course it belongs to. A `COURSE`
        # is keyed by its own content_key, anything else by its parent_content_key.
        for record in chunk_records:
            if record.content_type == COURSE:
                course_key = record.content_key
            else:
                course_key = record.parent_content_key

            linked_catalogs = [
                catalog
                for catalog_query in record.catalog_queries.all()
                for catalog in catalog_query.enterprise_catalogs.all()
            ]

            # Merge into whatever was already gathered for this course.
            catalogs_by_course[course_key].update(
                str(catalog.uuid) for catalog in linked_catalogs)
            customers_by_course[course_key].update(
                str(catalog.enterprise_uuid) for catalog in linked_catalogs)

        # Pass 2: for the courses only, emit duplicated course records: one series with
        # the catalog uuids, then one with the customer uuids. Within a series the uuids
        # are batched by `uuid_batch_size` to keep the Algolia object payloads small.
        uuid_series = (
            (catalogs_by_course, 'enterprise_catalog_uuids', '{}-catalog-uuids-{}'),
            (customers_by_course, 'enterprise_customer_uuids', '{}-customer-uuids-{}'),
        )
        for course_record in chunk_records.filter(content_type=COURSE):
            course_key = course_record.content_key
            base_metadata = copy.deepcopy(course_record.json_metadata)
            base_metadata['objectID'] = get_algolia_object_id(base_metadata.get('uuid'))

            for uuids_by_course, uuid_field, object_id_template in uuid_series:
                chunk_courses.extend(_batched_metadata(
                    base_metadata,
                    sorted(uuids_by_course[course_key]),
                    uuid_field,
                    object_id_template,
                    uuid_batch_size,
                ))

        # Keep only the requested fields and push this batch to the Algolia index.
        algolia_client.partially_update_index(
            create_algolia_objects_from_courses(chunk_courses, algolia_fields))