Example #1
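These examples mix Django ORM queries against the default database with SQLAlchemy 1.x-style select([...]) queries against a temporary upgrade database reached through a Bridge. They also rely on several module-level names that are not shown here: batch_size, coerce_key, CONTENT_APP_NAME, and renderable_files_presets. The sketch below reconstructs the imports the functions would need; the sqlalchemy and le_utils lines follow directly from the code, while the remaining paths are assumptions based on the identifiers used (they match Kolibri's content utilities):

# Imports assumed by the examples below. The kolibri.* paths are guesses
# from the identifiers in the code, not confirmed by the original source.
from le_utils.constants import content_kinds
from sqlalchemy import and_
from sqlalchemy import func
from sqlalchemy import or_
from sqlalchemy import select

from kolibri.core.content.models import ContentNode
from kolibri.core.content.models import File
from kolibri.core.content.models import LocalFile
from kolibri.core.content.utils.sqlalchemybridge import Bridge
from kolibri.core.content.utils.sqlalchemybridge import filter_by_uuids

batch_size = 1000  # assumed batching constant used throughout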
def count_removed_resources(destination, channel_id):
    """
    Queries the destination db to get the leaf node content_ids.
    Subtract available leaf nodes count on default db by available
    leaf nodes based on destination db leaf node content_ids.
    """
    bridge = Bridge(app_name=CONTENT_APP_NAME, sqlite_file_path=destination)
    connection = bridge.get_connection()
    ContentNodeTable = bridge.get_table(ContentNode)
    resource_node_ids_statement = (
        select([ContentNodeTable.c.id])
        .where(
            and_(
                ContentNodeTable.c.channel_id == channel_id,
                ContentNodeTable.c.kind != content_kinds.TOPIC,
            )
        )
        .limit(batch_size)
    )

    i = 0

    resource_node_ids = [
        coerce_key(cid[0]) for cid in connection.execute(
            resource_node_ids_statement.offset(i)).fetchall()
    ]

    content_ids_after_upgrade = set()

    # Batch the query here, as passing too many uuids into Django could cause
    # a "SQL query too large" error. This happens at around 30,000+ uuids.
    # We could probably batch at 10,000 rather than 1,000, but use 1,000 to
    # be defensive.

    while resource_node_ids:
        content_ids_after_upgrade.update(
            ContentNode.objects.filter_by_uuids(resource_node_ids, validate=False)
            .exclude(kind=content_kinds.TOPIC)
            .filter(available=True, channel_id=channel_id)
            .values_list("content_id", flat=True)
            .distinct()
        )

        i += batch_size
        resource_node_ids = [
            coerce_key(cid[0]) for cid in connection.execute(
                resource_node_ids_statement.offset(i)).fetchall()
        ]

    total_resources_after_upgrade = len(content_ids_after_upgrade)

    return (
        ContentNode.objects.filter(channel_id=channel_id, available=True)
        .exclude(kind=content_kinds.TOPIC)
        .values("content_id")
        .distinct()
        .count()
        - total_resources_after_upgrade
    )
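
A minimal usage sketch for the function above; the database path and channel id are hypothetical placeholders, not values from the original source:

# Hypothetical values for illustration only.
upgrade_db_path = "/tmp/upgrade/channel.sqlite3"
channel_id = "0123456789abcdef0123456789abcdef"

removed = count_removed_resources(upgrade_db_path, channel_id)
print("Resources that would be removed by this upgrade: %d" % removed)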
Example #2
def get_automatically_updated_resources(destination, channel_id):
    """
    Queries the destination db to get the leaf node ids, where local file objects are unavailable.
    Get the available node ids related to those missing file objects.
    """
    bridge = Bridge(app_name=CONTENT_APP_NAME, sqlite_file_path=destination)
    connection = bridge.get_connection()
    ContentNodeTable = bridge.get_table(ContentNode)
    # SQLAlchemy reference to the file table - a mapping from
    # contentnodes to the files that they use
    FileTable = bridge.get_table(File)
    # SQLAlchemy reference to the localfile table, which tracks
    # information about the files on disk, such as availability
    LocalFileTable = bridge.get_table(LocalFile)
    # get unavailable local file ids on the destination db
    unavailable_local_file_ids_statement = select(
        [LocalFileTable.c.id]
    ).where(
        LocalFileTable.c.available == False  # noqa
    )
    # get the ContentNode ids whose File objects point at unavailable
    # local files in the destination db
    contentnode_ids_statement = (
        select([FileTable.c.contentnode_id])
        .where(
            and_(
                FileTable.c.local_file_id.in_(unavailable_local_file_ids_statement),
                FileTable.c.supplementary == False,  # noqa
                or_(*(FileTable.c.preset == preset
                      for preset in renderable_files_presets)),
            )
        )
        .limit(batch_size)
    )

    i = 0

    updated_resource_ids = set()

    updated_resource_content_ids = set()

    contentnode_ids = [
        coerce_key(cid[0]) for cid in connection.execute(
            contentnode_ids_statement.offset(i)).fetchall()
    ]

    while contentnode_ids:
        # Exclude topics from here to prevent erroneous imports of their children
        # This should already be excluded as we are filtering to renderable files
        # so this is more of a sanity check
        for node_id, content_id in (
            ContentNode.objects.filter_by_uuids(contentnode_ids, validate=False)
            .filter(available=True, channel_id=channel_id)
            .exclude(kind=content_kinds.TOPIC)
            .values_list("id", "content_id")
        ):
            updated_resource_ids.add(node_id)
            updated_resource_content_ids.add(content_id)

        i += batch_size

        contentnode_ids = [
            coerce_key(cid[0]) for cid in connection.execute(
                contentnode_ids_statement.offset(i)).fetchall()
        ]

    # Do this after we have fetched all the ids and made them unique;
    # otherwise, because we are getting our ids from the File table, we
    # could end up counting file sizes more than once

    updated_resources_total_size = 0

    i = 0

    # Coerce to lists
    updated_resource_ids = list(updated_resource_ids)
    updated_resource_content_ids = list(updated_resource_content_ids)

    ids_batch = updated_resource_ids[i:i + batch_size]

    while ids_batch:

        contentnode_filter_expression = filter_by_uuids(
            ContentNodeTable.c.id, ids_batch, vendor=bridge.engine.name)

        # This does the first step in the many to many lookup for File
        # and LocalFile
        updated_resources_total_size += (
            connection.execute(
                select([func.sum(LocalFileTable.c.file_size)]).where(
                    LocalFileTable.c.id.in_(
                        select([LocalFileTable.c.id])
                        .select_from(
                            LocalFileTable.join(
                                FileTable.join(
                                    ContentNodeTable,
                                    FileTable.c.contentnode_id
                                    == ContentNodeTable.c.id,
                                ),
                                # This does the actual correlation between
                                # file and local file
                                FileTable.c.local_file_id == LocalFileTable.c.id,
                            )
                        )
                        .where(
                            and_(
                                # Filter only for files that are unavailable
                                # so we show the import size
                                LocalFileTable.c.available == False,  # noqa
                                contentnode_filter_expression,
                            )
                        )
                    )
                )
            ).fetchone()[0]
            # Guard against a NULL sum when no rows match, so we never
            # add None to the running total
            or 0
        )

        i += batch_size

        ids_batch = updated_resource_ids[i:i + batch_size]

    return (
        updated_resource_ids,
        updated_resource_content_ids,
        updated_resources_total_size,
    )
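
Both examples above share the same offset-based batching pattern: a SELECT with limit(batch_size) applied is re-executed at increasing offsets until it returns no rows. A stripped-down sketch of that loop as a reusable generator, assuming a statement that already has the limit applied:

def iterate_statement_in_batches(connection, limited_statement, batch_size=1000):
    # Re-execute the LIMITed statement at increasing offsets, yielding each
    # row, and stop as soon as a batch comes back empty.
    offset = 0
    while True:
        rows = connection.execute(limited_statement.offset(offset)).fetchall()
        if not rows:
            return
        for row in rows:
            yield row
        offset += batch_size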
Example #3
def get_new_resources_available_for_import(destination, channel_id):
    """
    Queries the destination db to get leaf nodes.
    Subtract total number of leaf nodes by the count of leaf nodes on default db to get the number of new resources.
    """
    bridge = Bridge(app_name=CONTENT_APP_NAME, sqlite_file_path=destination)
    # SQLAlchemy reference to the content node table
    ContentNodeTable = bridge.get_table(ContentNode)
    # SQLAlchemy reference to the file table - a mapping from
    # contentnodes to the files that they use
    FileTable = bridge.get_table(File)
    # SQLAlchemy reference to the localfile table, which tracks
    # information about the files on disk, such as availability
    LocalFileTable = bridge.get_table(LocalFile)
    connection = bridge.get_connection()

    # To efficiently get the node ids of all new nodes in the channel
    # we are going to iterate over the currently existing nodes for the
    # channel in the default database, and cache their existence in the
    # temporary upgrade database by flagging them as 'available' in there
    # We can then read out all of the unavailable ContentNodes in order
    # to get a complete list of the newly available ids.
    # We wrap this all in a transaction so that we can roll it back afterwards.
    # This is mostly just to avoid leaving the upgrade DB in a messy state,
    # and could be removed if it becomes a performance concern.

    # Create a queryset for the node ids of resources currently in this channel.
    # We will slice this later in a while loop in order to process it efficiently,
    # as otherwise we would end up querying tens of thousands of node ids for a
    # large channel, which would be impossible to pass into an update query for
    # the temporary upgrade DB without producing an excessively large query,
    # greater than 1MB, the default max for SQLite.
    current_resource_node_id_queryset = (ContentNode.objects.filter(
        channel_id=channel_id).exclude(kind=content_kinds.TOPIC).values_list(
            "id", flat=True))

    i = 0

    # start a transaction

    trans = connection.begin()

    # Set everything to False to start with
    connection.execute(ContentNodeTable.update().where(
        ContentNodeTable.c.channel_id == channel_id).values(available=False))

    node_ids = current_resource_node_id_queryset[i:i + batch_size]
    while node_ids:
        # Then flag the nodes that already exist in the default db as available
        connection.execute(ContentNodeTable.update().where(
            and_(
                filter_by_uuids(ContentNodeTable.c.id,
                                node_ids,
                                vendor=bridge.engine.name),
                ContentNodeTable.c.channel_id == channel_id,
            )).values(available=True))
        i += batch_size
        node_ids = current_resource_node_id_queryset[i:i + batch_size]

    renderable_contentnodes = (
        select([FileTable.c.contentnode_id])
        .where(FileTable.c.supplementary == False)  # noqa
        .where(
            or_(*(FileTable.c.preset == preset
                  for preset in renderable_files_presets))
        )
    )

    contentnode_filter_expression = and_(
        ContentNodeTable.c.channel_id == channel_id,
        ContentNodeTable.c.kind != content_kinds.TOPIC,
        ContentNodeTable.c.available == False,  # noqa
        ContentNodeTable.c.id.in_(renderable_contentnodes),
    )

    # This does the first step in the many to many lookup for File
    # and LocalFile
    new_resource_nodes_total_size = (
        connection.execute(
            select([func.sum(LocalFileTable.c.file_size)]).where(
                LocalFileTable.c.id.in_(
                    select([LocalFileTable.c.id])
                    .select_from(
                        LocalFileTable.join(
                            FileTable.join(
                                ContentNodeTable,
                                FileTable.c.contentnode_id
                                == ContentNodeTable.c.id,
                            ),
                            # This does the actual correlation between
                            # file and local file
                            FileTable.c.local_file_id == LocalFileTable.c.id,
                        )
                    )
                    .where(
                        and_(
                            # Filter only for files that are unavailable
                            # so we show the import size
                            LocalFileTable.c.available == False,  # noqa
                            contentnode_filter_expression,
                        )
                    )
                )
            )
        ).fetchone()[0]
        or 0
    )

    new_resource_node_ids_statement = select([ContentNodeTable.c.id]).where(
        and_(
            ContentNodeTable.c.channel_id == channel_id,
            ContentNodeTable.c.kind != content_kinds.TOPIC,
            ContentNodeTable.c.available == False,  # noqa
        ))

    new_resource_node_ids = list(
        coerce_key(c[0]) for c in connection.execute(
            new_resource_node_ids_statement).fetchall())

    trans.rollback()

    # Create a queryset for the content_ids of resources currently in this
    # channel. As above, we slice this in a while loop to avoid passing an
    # excessively large set of ids into a single update query against the
    # temporary upgrade DB.
    current_resource_content_id_queryset = (ContentNode.objects.filter(
        channel_id=channel_id).exclude(kind=content_kinds.TOPIC).values_list(
            "content_id", flat=True))

    i = 0

    # start a transaction

    trans = connection.begin()

    # Set everything to False to start with
    connection.execute(ContentNodeTable.update().where(
        ContentNodeTable.c.channel_id == channel_id).values(available=False))

    content_ids = current_resource_content_id_queryset[i:i + batch_size]
    while content_ids:
        # Then flag the content ids that already exist in the default db as available
        connection.execute(ContentNodeTable.update().where(
            and_(
                filter_by_uuids(
                    ContentNodeTable.c.content_id,
                    content_ids,
                    vendor=bridge.engine.name,
                ),
                ContentNodeTable.c.channel_id == channel_id,
            )).values(available=True))
        i += batch_size
        content_ids = current_resource_content_id_queryset[i:i + batch_size]

    new_resource_content_ids_statement = (
        select([ContentNodeTable.c.content_id]).where(
            and_(
                ContentNodeTable.c.channel_id == channel_id,
                ContentNodeTable.c.kind != content_kinds.TOPIC,
                ContentNodeTable.c.available == False,  # noqa
            )).distinct())

    new_resource_content_ids = list(
        coerce_key(c[0]) for c in connection.execute(
            new_resource_content_ids_statement).fetchall())

    trans.rollback()

    return (
        new_resource_node_ids,
        new_resource_content_ids,
        new_resource_nodes_total_size,
    )
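
The flag-and-rollback trick in this example, marking the rows we already have as available inside a transaction, reading back the still-unavailable remainder, and then rolling everything back, can be shown in isolation. A minimal sketch against a throwaway in-memory SQLite database; the table and values are invented for the illustration:

from sqlalchemy import Boolean, Column, MetaData, String, Table, create_engine, select

engine = create_engine("sqlite://")  # throwaway in-memory database
metadata = MetaData()
nodes = Table(
    "nodes",
    metadata,
    Column("id", String, primary_key=True),
    Column("available", Boolean),
)
metadata.create_all(engine)

connection = engine.connect()
connection.execute(
    nodes.insert(),
    [{"id": "existing", "available": False}, {"id": "brand_new", "available": False}],
)

trans = connection.begin()
# Flag the rows we already have locally as available...
connection.execute(
    nodes.update().where(nodes.c.id == "existing").values(available=True)
)
# ...so the unavailable remainder is exactly the set of new rows.
new_ids = [
    row[0]
    for row in connection.execute(
        select([nodes.c.id]).where(nodes.c.available == False)  # noqa
    ).fetchall()
]
# Roll back so the scratch database is left exactly as we found it.
trans.rollback()
print(new_ids)  # ['brand_new']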