def update_offset(name, offset):
    """
    Update current offset to the database
    """
    with scoped_session() as db:
        sync_status = _get_sync_status(db, name)
        sync_status.offset = offset
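# NOTE: update_offset, finish_sync_progress and get_sync_status below all use
# a private _get_sync_status helper that is not part of this excerpt. A
# minimal get-or-create sketch of what it presumably looks like, assuming a
# SyncStatus model with 'name', 'offset', 'start_sync_date' and
# 'prev_start_sync_date' columns:
def _get_sync_status(db, name):
    """
    Hypothetical sketch: fetch the SyncStatus row for 'name', creating it on
    first use so callers can always read and update it.
    """
    sync_status = (
        db.query(SyncStatus)
        .filter(SyncStatus.name == name)
        .one_or_none()
    )

    if not sync_status:
        # Assumed defaults for a fresh synchronization state
        sync_status = SyncStatus(name=name, offset=0)
        db.add(sync_status)

    return sync_status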
def freeze_running_object(object_id, sip_id, freeze_reason):
    """
    Cancel and freeze a MuseumObject that is currently in the workflow,
    and mark the SIP as cancelled if one was created.
    """
    with scoped_session() as db:
        museum_object = (db.query(MuseumObject).filter(
            MuseumObject.id == object_id).one())

        museum_object.frozen = True
        museum_object.freeze_reason = freeze_reason
        museum_object.freeze_source = FreezeSource.AUTOMATIC

        is_same_package = (museum_object.latest_package
                           and museum_object.latest_package.sip_id == sip_id)

        # If package was created, cancel it
        if is_same_package:
            museum_object.latest_package.cancelled = True

        # Copy log files to the archive if they were created
        try:
            museum_package = MuseumObjectPackage.from_path_sync(
                Path(PACKAGE_DIR) / str(object_id), sip_id=sip_id)
            museum_package.copy_log_files_to_archive(ARCHIVE_DIR)
        except FileNotFoundError:
            # No object directory and/or log files were created for this
            # package yet
            pass

        try:
            shutil.rmtree(Path(PACKAGE_DIR) / str(object_id))
        except FileNotFoundError:
            # Object directory didn't exist yet
            pass
def update_sip(sip, sftp, queue):
    """
    Update a single SIP by downloading its ingest reports and enqueueing the
    final task to confirm it
    """
    with scoped_session() as db:
        db_museum_package = (db.query(MuseumPackage).join(
            MuseumObject,
            MuseumObject.id == MuseumPackage.museum_object_id).filter(
                and_(MuseumPackage.sip_filename == sip.sip_filename,
                     MuseumPackage.preserved == False,
                     MuseumPackage.rejected == False)).one_or_none())

        if not db_museum_package:
            return

        if sip.status == "accepted":
            # Package was accepted
            db_museum_package.preserved = True
        elif sip.status == "rejected":
            db_museum_package.rejected = True

        object_id = db_museum_package.museum_object.id
        package_dir = Path(PACKAGE_DIR) / str(object_id)
        museum_package = MuseumObjectPackage.from_path_sync(package_dir)

        xml_temp_path = museum_package.log_dir / "ingest-report.xml.download"
        xml_report_path = museum_package.log_dir / "ingest-report.xml"

        # An HTML version of the report exists at the same remote path, just
        # with a different suffix
        html_remote_path = sip.report_path.with_suffix(".html")

        html_temp_path = museum_package.log_dir / "ingest-report.html.download"
        html_report_path = museum_package.log_dir / "ingest-report.html"

        # Download ingest report to the log directory
        sftp.get(str(sip.report_path), str(xml_temp_path))
        os.rename(xml_temp_path, xml_report_path)

        sftp.get(str(html_remote_path), str(html_temp_path))
        os.rename(html_temp_path, html_report_path)

        # Remove the directory containing the rejected SIP so that the DPRES
        # service does not store the package unnecessarily
        if sip.status == "rejected":
            sftp_rmtree(sftp, sip.transfer_path)

        # Write the status for use by the 'confirm_sip' task
        (package_dir / f"{sip.sip_filename}.status").write_text(sip.status)

        # Enqueue the final task
        queue.enqueue(confirm_sip,
                      kwargs={
                          "object_id": object_id,
                          "sip_id": db_museum_package.sip_id
                      },
                      job_id=f"confirm_sip_{object_id}")
def unfreeze_objects(reason=None, object_ids=None, enqueue=False):
    """
    Unfreeze objects with the given reason and/or object IDs.

    This allows them to be preserved again.

    :param str reason: Unfreeze objects with this reason
    :param list object_ids: Objects to unfreeze.
    :param bool enqueue: Whether to enqueue the unfrozen objects immediately.
                         Default is False.
    """
    connect_db()

    if not reason and not object_ids:
        raise ValueError("Either 'reason' or 'object_ids' has to be provided")

    with lock_queues():
        with scoped_session() as db:
            query = (
                db.query(MuseumObject)
                .outerjoin(
                    MuseumPackage,
                    MuseumPackage.id == MuseumObject.latest_package_id
                )
                .filter(MuseumObject.frozen == True)
            )

            if reason:
                query = query.filter(MuseumObject.freeze_reason == reason)
            if object_ids:
                object_ids = [int(object_id) for object_id in object_ids]
                query = query.filter(MuseumObject.id.in_(object_ids))

            museum_objects = list(query)
            for museum_object in museum_objects:
                museum_object.frozen = False
                museum_object.freeze_reason = None
                museum_object.freeze_source = None

                # Remove the latest package if it was *not* successfully
                # preserved to ensure the object is eligible for preservation
                remove_latest_package = (
                    museum_object.latest_package
                    and not museum_object.latest_package.preserved
                )

                if remove_latest_package:
                    museum_object.latest_package = None

                if enqueue:
                    enqueue_object(object_id=museum_object.id)

            return len(museum_objects)
def submit_sip(object_id, sip_id):
    """
    Submit SIP to the DPRES service.

    The next workflow task will be enqueued by 'sync_processed_sips' which
    periodically checks the processed SIPs
    """
    object_id = int(object_id)
    connect_db()

    package_dir = Path(PACKAGE_DIR) / str(object_id)

    # Retrieve the latest SIP filename
    museum_package = MuseumObjectPackage.from_path_sync(package_dir,
                                                        sip_id=sip_id)
    filename = museum_package.sip_filename

    with scoped_session() as db:
        package_uploaded = db.query(exists().where(
            and_(MuseumPackage.sip_filename == museum_package.sip_filename,
                 MuseumPackage.uploaded == True))).scalar()
        if package_uploaded:
            raise RuntimeError(f"Package {filename} already uploaded")

    print(f"Submitting {filename} for Object {object_id}")

    museum_package = main(object_id=object_id,
                          package_dir=PACKAGE_DIR,
                          sip_id=sip_id)

    print(f"Package {filename} submitted, removing local file")

    with scoped_session() as db:
        db_museum_package = db.query(MuseumPackage).filter_by(
            sip_filename=museum_package.sip_filename).one()
        db_museum_package.uploaded = True

    # Delete the generated SIP to free space
    os.remove(museum_package.sip_archive_path)
def finish_sync_progress(name):
    """
    Finish the current synchronization run.

    This ensures the next synchronization run will only iterate a subset
    of entries from MuseumPlus, improving performance.
    """
    with scoped_session() as db:
        sync_status = _get_sync_status(db, name)

        # Next synchronization will start from beginning
        sync_status.offset = 0
        sync_status.prev_start_sync_date = sync_status.start_sync_date
        sync_status.start_sync_date = None
def get_sync_status(name):
    """
    Load the SyncStatus instance and return it for reading
    """
    with scoped_session() as db:
        sync_status = _get_sync_status(db, name)

        # Return a read-only copy of the sync status to prevent having to deal
        # with a SQLAlchemy session that's not used for anything
        # TODO: Can we do this without having to use a namedtuple?
        return SyncStatusReadOnly(
            name=sync_status.name,
            start_sync_date=sync_status.start_sync_date,
            prev_start_sync_date=sync_status.prev_start_sync_date,
            offset=sync_status.offset)
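# NOTE: 'SyncStatusReadOnly' is referenced above but not defined in this
# excerpt. Given the TODO and the fields copied in get_sync_status, it is
# presumably just a namedtuple along these lines:
from collections import namedtuple

SyncStatusReadOnly = namedtuple(
    "SyncStatusReadOnly",
    ["name", "start_sync_date", "prev_start_sync_date", "offset"]
)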
def enqueue_objects(object_count, random=False, object_ids=None):
    """
    Enqueue a given number of objects into the preservation workflow.

    :param int object_count: How many objects to enqueue at most
    :param bool random: Whether to enqueue objects at random instead
                        of in-order.
    :param list object_ids: Object IDs to enqueue. If provided, 'object_count'
                            and 'random' are ignored.
    """
    if object_ids:
        object_count = len(object_ids)

    with lock_queues():
        connect_db()
        enqueued_object_ids = get_enqueued_object_ids()

        new_job_count = 0

        with scoped_session() as db:
            object_query = (db.query(MuseumObject).with_transformation(
                MuseumObject.filter_preservation_pending).yield_per(500))

            if object_ids:
                object_query = object_query.filter(
                    MuseumObject.id.in_(object_ids))

            if random:
                object_query = object_query.order_by(func.random())

            for museum_object in object_query:
                if museum_object.id not in enqueued_object_ids:
                    enqueue_object(museum_object.id)
                    new_job_count += 1
                    print(f"Enqueued download_object_{museum_object.id}")

                if new_job_count >= object_count:
                    break

    print(f"{new_job_count} object(s) enqueued for download")

    return new_job_count
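# NOTE: the real entry point is not included in this excerpt; a hypothetical
# CLI wrapper around enqueue_objects could look roughly like this:
def _enqueue_objects_cli():
    """Hypothetical CLI wrapper, shown for illustration only."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Enqueue objects into the preservation workflow")
    parser.add_argument("--object-count", type=int, default=10)
    parser.add_argument("--random", action="store_true")
    parser.add_argument("object_ids", nargs="*", type=int)
    args = parser.parse_args()

    enqueue_objects(
        object_count=args.object_count,
        random=args.random,
        object_ids=args.object_ids or None
    )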
def reenqueue_object(object_id: int):
    """
    Re-enqueue a rejected object into the workflow
    """
    object_id = int(object_id)
    connect_db()

    queue = get_queue(QueueType.DOWNLOAD_OBJECT)

    with scoped_session() as db:
        museum_object = (
            db.query(MuseumObject)
            .join(
                MuseumPackage,
                MuseumObject.latest_package_id == MuseumPackage.id
            )
            .filter(MuseumObject.id == object_id)
            .one()
        )

        if museum_object.latest_package and \
                not museum_object.latest_package.rejected:
            raise ValueError(
                f"Latest package {museum_object.latest_package.sip_filename} "
                f"wasn't rejected"
            )

        object_ids = get_enqueued_object_ids()

        if object_id in object_ids:
            raise ValueError(
                f"Object {object_id} is still in the workflow and can't be "
                "re-enqueued"
            )

        museum_object.latest_package = None

        delete_jobs_for_object_id(object_id)

        queue.enqueue(
            download_object, kwargs={"object_id": object_id},
            job_id=f"download_object_{object_id}"
        )
def confirm_sip(object_id, sip_id):
    """
    Confirm SIP that was either preserved or rejected by the DPRES service.
    This is the last step in the preservation workflow.
    """
    object_id = int(object_id)
    connect_db()

    package_dir = Path(PACKAGE_DIR) / str(object_id)

    museum_package = MuseumObjectPackage.from_path_sync(package_dir,
                                                        sip_id=sip_id)
    # '.status' file contains either the text 'accepted' or 'rejected'
    status = (museum_package.path /
              f"{museum_package.sip_filename}.status").read_text()

    if status not in ("accepted", "rejected"):
        raise ValueError(f"Invalid preservation status: {status}")

    print(f"Confirming SIP {museum_package.sip_filename}")
    main(object_id=object_id,
         package_dir=PACKAGE_DIR,
         archive_dir=ARCHIVE_DIR,
         sip_id=sip_id,
         status=status)

    with scoped_session() as db:
        db.query(MuseumPackage).filter_by(
            sip_filename=museum_package.sip_filename
        ).update({
            MuseumPackage.preserved: status == "accepted",
            MuseumPackage.rejected: status == "rejected"
        })

        if status == "accepted":
            db.query(MuseumObject).filter_by(id=object_id).update(
                {MuseumObject.preserved: True})

    print(f"SIP {museum_package.sip_filename} confirmed")
def get_confirmed_sip_filenames(days: int) -> set:
    """
    Get a set of SIP filenames that have already been marked as preserved or
    rejected in the workflow.

    The SIPs can be safely skipped as they're either already confirmed or the
    corresponding workflow job has been enqueued.
    """
    # Find packages that are at most (days + 2) days old. The extra two days
    # account for packages that took longer than usual to get processed for
    # whatever reason.
    cutoff = (datetime.datetime.now(datetime.timezone.utc) -
              datetime.timedelta(days=days + 2))

    with scoped_session() as db:
        query = (select([MuseumPackage.sip_filename
                         ]).where(MuseumPackage.created_date > cutoff).where(
                             or_(MuseumPackage.preserved,
                                 MuseumPackage.rejected)))
        results = db.execute(query)
        results = {result[0] for result in results}

    return results
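# NOTE: a hypothetical illustration of how get_confirmed_sip_filenames might
# be used by the polling code (e.g. 'sync_processed_sips'); the 'sips'
# iterable of SIP entries is assumed here.
def _iterate_unconfirmed_sips(sips, days=30):
    """Yield only SIPs whose ingest reports still need to be handled."""
    confirmed_filenames = get_confirmed_sip_filenames(days=days)

    for sip in sips:
        if sip.sip_filename in confirmed_filenames:
            # Already preserved or rejected in the workflow; skip it
            continue

        yield sip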
def reset_workflow():
    """
    Reset workflow after a PostgreSQL backup restoration by removing in-process
    packages that were not submitted to the DPRES service but were still
    in the workflow at the time the backup was initiated.
    """
    with lock_queues():
        connect_db()

        with scoped_session() as db:
            # Get objects that have been downloaded or packaged, but which
            # haven't been uploaded yet
            objects = (db.query(MuseumObject).join(
                MuseumPackage,
                MuseumPackage.id == MuseumObject.latest_package_id).filter(
                    MuseumPackage.uploaded == False,
                    or_(MuseumPackage.downloaded, MuseumPackage.packaged)))
            objects = list(objects)

            print(f"Found {len(objects)} dangling objects")

            for mus_object in objects:
                mus_package = mus_object.latest_package

                # Remove the lingering package from the MuseumObject to make
                # the object eligible for preservation again.
                mus_object.latest_package = None
                db.delete(mus_package)

                try:
                    shutil.rmtree(Path(PACKAGE_DIR) / str(mus_object.id))
                except OSError:
                    # Directory does not exist; ignore
                    pass

    print("Done!")
def create_sip(object_id, sip_id):
    """
    Create a SIP from a downloaded object and enqueue the task 'submit_sip'
    once the object is packaged into a SIP
    """
    object_id = int(object_id)
    connect_db()

    # Are we creating a SIP for the first time or updating a preserved
    # package?
    created_date, modified_date = None, None
    with scoped_session() as db:
        last_preserved_package = (
            db.query(MuseumPackage)
            .filter(MuseumPackage.museum_object_id == object_id)
            .filter(MuseumPackage.preserved == True)
            .order_by(MuseumPackage.created_date.desc())
            .first()
        )
        current_package = (
            db.query(MuseumObject)
            .join(
                MuseumPackage,
                MuseumObject.latest_package_id == MuseumPackage.id
            )
            .filter(MuseumObject.id == object_id)
            .one()
            .latest_package
        )

        if not last_preserved_package:
            # We haven't created a preserved SIP yet
            print(f"Creating submission SIP for Object {object_id}")
            created_date = current_package.created_date
        else:
            # We are updating an existing package
            print(f"Creating update SIP for Object {object_id}")
            created_date = last_preserved_package.created_date
            modified_date = current_package.created_date

    # Run the 'create_sip' script
    try:
        museum_package = main(
            object_id=object_id, package_dir=PACKAGE_DIR, sip_id=sip_id,
            create_date=created_date, modify_date=modified_date,
            update=bool(modified_date)
        )
    except PreservationError as exc:
        # If a PreservationError was raised, freeze the object and prevent
        # the object from going further in the workflow.
        freeze_running_object(
            object_id=object_id,
            sip_id=sip_id,
            freeze_reason=exc.error
        )
        return
    except OSError as exc:
        if exc.errno == errno.ENOSPC:
            raise OSError(
                errno.ENOSPC,
                "Ran out of disk space. This may have happened because the "
                "package directory ran out of space while downloading a "
                "large attachment. Try removing packages from the directory "
                "and retrying while processing fewer packages at the same "
                "time."
            )

        raise

    filename = museum_package.sip_filename

    print(f"Created SIP for Object {object_id}, updating database")

    with scoped_session() as db:
        db_package = db.query(MuseumPackage).filter(
            MuseumPackage.sip_filename == filename
        ).one()
        db_package.packaged = True
        db.query(MuseumObject).filter(
            MuseumObject.id == object_id
        ).update({MuseumObject.latest_package_id: db_package.id})

        queue = get_queue(QueueType.SUBMIT_SIP)
        queue.enqueue(
            submit_sip, kwargs={"object_id": object_id, "sip_id": sip_id},
            job_id=f"submit_sip_{object_id}"
        )
async def sync_attachments(offset=0, limit=None, save_progress=False):
    """
    Synchronize attachment metadata from MuseumPlus to determine which
    objects have changed and need to be updated in the DPRES service. This
    is followed by 'sync_hashes'.

    :param int offset: Offset to start synchronizing from
    :param int limit: How many attachments to sync before stopping.
        Default is None, meaning all available attachments
        are synchronized.
    :param bool save_progress: Whether to save synchronization progress
                               and continue from the last run. Offset and limit
                               are ignored if enabled.
    """
    modify_date_gte = None

    if save_progress:
        limit = None

        sync_status = get_sync_status("sync_attachments")
        offset = sync_status.offset
        # Start synchronization from attachments that changed since the last
        # sync
        modify_date_gte = sync_status.prev_start_sync_date
        print(f"Continuing synchronization from {offset}")

    # TODO: This is pretty much an inverse version of 'sync_objects'.
    # This process should be made more generic if possible.
    museum_session = await get_museum_session()
    multimedia_iter = iterate_multimedia(
        session=museum_session, offset=offset,
        modify_date_gte=modify_date_gte
    )
    all_iterated = False
    index = offset
    processed = 0

    while True:
        results = []

        all_iterated = True
        async for result in multimedia_iter:
            all_iterated = False
            results.append(result)
            index += 1

            if len(results) >= CHUNK_SIZE:
                break

        attachments = {result["id"]: result for result in results}
        attachment_ids = list(attachments.keys())

        inserts, updates = 0, 0

        with scoped_session() as db:
            existing_attachment_ids = set([
                result.id for result in
                db.query(MuseumAttachment).options(load_only("id"))
                  .filter(MuseumAttachment.id.in_(attachment_ids))
            ])

            attachment_id2object_id = defaultdict(set)
            object_ids = set()

            update_params = []

            # Update existing attachments, create the rest
            for result in attachments.values():
                attachment_id = int(result["id"])
                filename = result["filename"]
                modified_date = result["modified_date"]
                created_date = result["created_date"]
                xml_hash = result["xml_hash"]

                attachment_id2object_id[attachment_id].update(
                    result["object_ids"]
                )
                object_ids.update(result["object_ids"])

                if attachment_id in existing_attachment_ids:
                    # Update
                    update_params.append({
                        "_id": attachment_id,
                        "_filename": filename,
                        "_modified_date": modified_date,
                        "_created_date": created_date,
                        "_metadata_hash": xml_hash
                    })
                    updates += 1
                else:
                    # Create
                    mus_attachment = MuseumAttachment(
                        id=attachment_id,
                        filename=filename,
                        modified_date=modified_date,
                        created_date=created_date,
                        metadata_hash=xml_hash
                    )
                    db.add(mus_attachment)
                    inserts += 1

                processed += 1

                if limit is not None and processed == limit:
                    all_iterated = True
                    break

            if update_params:
                # Perform updates in bulk
                stmt = (
                    MuseumAttachment.__table__.update()
                    .where(MuseumAttachment.id == bindparam("_id"))
                    .values({
                        "filename": bindparam("_filename"),
                        "created_date": bindparam("_created_date"),
                        "modified_date": bindparam("_modified_date"),
                        "metadata_hash": bindparam("_metadata_hash")
                    })
                )
                db.execute(stmt, update_params)

            # Create/update MuseumObjects with references
            # to the newly updated MuseumAttachments.
            # For performance reasons update references for a batch
            # of objects at once
            attachments = (
                db.query(MuseumAttachment)
                .filter(MuseumAttachment.id.in_(attachment_ids))
            )
            objects = bulk_create_or_get(db, MuseumObject, object_ids)
            objects_by_id = {
                mus_object.id: mus_object for mus_object in objects
            }

            for attachment in attachments:
                attachment.museum_objects = [
                    objects_by_id[object_id] for object_id
                    in attachment_id2object_id[attachment.id]
                ]

                for museum_object in attachment.museum_objects:
                    # Set the modification date of MuseumObject to the same
                    # as the attachment's if it's newer.
                    # This is because we want to know if the museum object OR
                    # one of its attachments has been changed.
                    object_date_needs_update = (
                        not museum_object.modified_date
                        or museum_object.modified_date < attachment.modified_date
                    )

                    if object_date_needs_update:
                        museum_object.modified_date = attachment.modified_date

        results = []

        print(
            f"Updated, {inserts} inserts, {updates} "
            f"updates. Updating from offset: {index}"
        )

        # Submit heartbeat after each successful iteration instead of once
        # at the end. This is because this script is designed to be stopped
        # before it has finished iterating everything.
        submit_heartbeat(HeartbeatSource.SYNC_ATTACHMENTS)

        if save_progress:
            update_offset("sync_attachments", offset=index)

        if all_iterated:
            if save_progress:
                finish_sync_progress("sync_attachments")

            break

    await museum_session.close()
def sync_hashes():
    """
    Update object entries with latest metadata hashes to determine which
    objects have been changed. This is done after 'sync_objects' and
    'sync_attachments'.
    """
    updated = 0
    skipped = 0
    total = 0

    with scoped_session() as db:
        query = iterate_museum_objects_and_attachments(db)

        all_iterated = False

        while True:
            results = []
            for i in range(0, CHUNK_SIZE):
                try:
                    results.append(next(query))
                except StopIteration:
                    all_iterated = True
                    break

            update_params = []

            for museum_object, museum_attachments in results:
                total += 1

                # Calculate the attachment metadata hash
                if museum_attachments:
                    # Don't calculate the hash if some attachments are
                    # incomplete
                    metadata_incomplete = any(attach.metadata_hash is None
                                              for attach in museum_attachments)

                    if metadata_incomplete:
                        skipped += 1
                        continue

                    attachment_metadata_hash = get_metadata_hash_for_attachments(
                        museum_attachments)
                else:
                    attachment_metadata_hash = ""

                if museum_object.attachment_metadata_hash \
                        == attachment_metadata_hash:
                    # Attachment hash hasn't changed, no need to update
                    continue

                updated += 1

                update_params.append({
                    "_id": museum_object.id,
                    "_attachment_metadata_hash": attachment_metadata_hash
                })

            if update_params:
                update_stmt = (
                    MuseumObject.__table__.update()
                    .where(MuseumObject.id == bindparam("_id"))
                    .values({
                        "attachment_metadata_hash":
                            bindparam("_attachment_metadata_hash")
                    })
                )
                db.execute(update_stmt, update_params)

            print(f"{total} iterated, {updated} updated and {skipped} skipped "
                  "so far")

            if all_iterated:
                break

    submit_heartbeat(HeartbeatSource.SYNC_HASHES)
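# NOTE: 'get_metadata_hash_for_attachments' is not included in this excerpt.
# One plausible implementation, assuming it only needs a stable digest over
# the attachments' individual metadata hashes:
import hashlib


def get_metadata_hash_for_attachments(attachments):
    """
    Hypothetical sketch: combine per-attachment metadata hashes into a single
    digest. Sorting by id keeps the result independent of query order.
    """
    combined = "".join(
        attachment.metadata_hash
        for attachment in sorted(attachments, key=lambda a: a.id)
    )
    return hashlib.sha256(combined.encode("utf-8")).hexdigest()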
def freeze_objects(object_ids, reason, source, delete_jobs=True):
    """
    Freeze objects to prevent them from being included in the preservation
    workflow

    :returns: (freeze_count, cancel_count) tuple for how many objects were
              frozen and how many packages were cancelled as a result
    """
    object_ids = [int(object_id) for object_id in object_ids]
    source = FreezeSource(source)

    with lock_queues():
        # Are there object IDs that we're about to freeze but that are
        # still running?
        running_object_ids = get_running_object_ids()
        conflicting_object_ids = set(object_ids) & set(running_object_ids)

        if conflicting_object_ids:
            raise WorkflowJobRunningError(
                "The following object IDs have running jobs and can't be "
                f"frozen: {', '.join([str(o) for o in sorted(conflicting_object_ids)])}"
            )

        connect_db()
        with scoped_session() as db:
            freeze_count = (db.query(MuseumObject).filter(
                MuseumObject.id.in_(object_ids)).update(
                    {
                        MuseumObject.frozen: True,
                        MuseumObject.freeze_reason: reason,
                        MuseumObject.freeze_source: source
                    },
                    synchronize_session=False))

            packages_to_cancel = list(
                db.query(MuseumPackage).join(
                    MuseumObject,
                    MuseumObject.latest_package_id == MuseumPackage.id).filter(
                        MuseumPackage.museum_object_id.in_(object_ids),
                        MuseumPackage.preserved == False,
                        MuseumPackage.rejected == False,
                        MuseumPackage.cancelled == False))

            for package in packages_to_cancel:
                package.cancelled = True

                try:
                    museum_package = MuseumObjectPackage.from_path_sync(
                        Path(PACKAGE_DIR) / str(package.museum_object_id),
                        sip_id=package.sip_id)
                    museum_package.copy_log_files_to_archive(ARCHIVE_DIR)
                except FileNotFoundError:
                    # If the SIP doesn't exist, just skip it
                    pass

            # Delete any pending jobs for each object ID if enabled
            if delete_jobs:
                for object_id in object_ids:
                    delete_jobs_for_object_id(object_id)

                    # Delete the museum package directory
                    try:
                        shutil.rmtree(Path(PACKAGE_DIR) / str(object_id))
                    except OSError:
                        # Directory does not exist
                        pass

        return freeze_count, len(packages_to_cancel)
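# NOTE: illustrative usage only; the object IDs and reason are made up, and
# FreezeSource.AUTOMATIC is used simply because it appears elsewhere in this
# excerpt.
def _freeze_example():
    frozen_count, cancelled_count = freeze_objects(
        object_ids=[1337, 4001],
        reason="Attachment could not be validated",
        source=FreezeSource.AUTOMATIC
    )
    print(f"{frozen_count} object(s) frozen, "
          f"{cancelled_count} package(s) cancelled")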
def download_object(object_id):
    """
    Download an object from MuseumPlus and enqueue the task 'create_sip'
    once the object is downloaded
    """
    object_id = int(object_id)
    connect_db()

    # Create a SIP id from the current time
    sip_id = datetime.datetime.now(
        datetime.timezone.utc).strftime("%Y%m%d-%H%M%S")

    try:
        museum_package = main(
            object_id=int(object_id),
            package_dir=PACKAGE_DIR,
            # 'sip_id' is optional, but giving it as a kwarg ensures the
            # filename of the SIP is correct before it is created.
            sip_id=sip_id)
    except PreservationError as exc:
        # If a PreservationError was raised, freeze the object
        freeze_running_object(object_id=object_id,
                              sip_id=sip_id,
                              freeze_reason=exc.error)
        return
    except OSError as exc:
        if exc.errno == errno.ENOSPC:
            raise OSError(
                errno.ENOSPC,
                "Ran out of disk space. This may have happened because the "
                "package directory ran out of space while downloading a "
                "large attachment. Try removing packages from the directory "
                "and retrying while processing fewer packages at the same "
                "time.")

        raise

    filename = museum_package.sip_filename

    with scoped_session() as db:
        db_museum_object = db.query(MuseumObject).filter(
            MuseumObject.id == object_id).one()

        db_package = db.query(MuseumPackage).filter_by(
            sip_filename=filename).first()

        # Get the attachments that currently exist for this object
        # and add them to the new MuseumPackage
        attachment_ids = museum_package.museum_object.attachment_ids
        db_attachments = bulk_create_or_get(db, MuseumAttachment,
                                            attachment_ids)

        if not db_package:
            db_package = MuseumPackage(
                sip_filename=filename,
                sip_id=sip_id,
                object_modified_date=(
                    museum_package.museum_object.modified_date),
                downloaded=True,
                metadata_hash=db_museum_object.metadata_hash,
                attachment_metadata_hash=(
                    db_museum_object.attachment_metadata_hash),
                attachments=db_attachments)
            db_package.museum_object = db_museum_object
        else:
            raise EnvironmentError(
                f"Package with filename {filename} already exists")

        db_museum_object.latest_package = db_package

        queue = get_queue(QueueType.CREATE_SIP)
        queue.enqueue(create_sip,
                      kwargs={
                          "object_id": object_id,
                          "sip_id": sip_id
                      },
                      job_id=f"create_sip_{object_id}")
async def sync_objects(offset=0, limit=None, save_progress=False):
    """
    Synchronize object metadata from MuseumPlus to determine which
    objects have changed and need to be updated in the DPRES service. This
    is followed by 'sync_hashes'.

    :param int offset: Offset to start synchronizing from
    :param int limit: How many objects to sync before stopping.
        Default is None, meaning all available objects are synchronized.
    :param bool save_progress: Whether to save synchronization progress
                               and continue from the last run. Offset and limit
                               are ignored if enabled.
    """
    modify_date_gte = None

    if save_progress:
        limit = None

        sync_status = get_sync_status("sync_objects")
        offset = sync_status.offset
        # Start synchronization from objects that changed since the last
        # sync
        modify_date_gte = sync_status.prev_start_sync_date
        print(f"Continuing synchronization from {offset}")

    museum_session = await get_museum_session()
    object_iter = iterate_objects(session=museum_session,
                                  offset=offset,
                                  modify_date_gte=modify_date_gte)
    all_iterated = False
    index = offset
    processed = 0

    while True:
        results = []

        all_iterated = True
        async for result in object_iter:
            all_iterated = False
            results.append(result)
            index += 1

            if len(results) >= CHUNK_SIZE:
                break

        objects = {result["id"]: result for result in results}
        object_ids = list(objects.keys())

        inserts, updates = 0, 0

        with scoped_session() as db:
            existing_object_ids = set([
                result.id for result in db.query(MuseumObject).options(
                    load_only("id")).filter(MuseumObject.id.in_(object_ids))
            ])

            object_id2attachment_id = defaultdict(set)
            attachment_ids = set()

            update_params = []

            # Update existing objects, create the rest
            for result in objects.values():
                object_id = int(result["id"])
                title = result["title"]
                modified_date = result["modified_date"]
                created_date = result["created_date"]
                multimedia_ids = result["multimedia_ids"]
                xml_hash = result["xml_hash"]

                object_id2attachment_id[object_id].update(multimedia_ids)
                attachment_ids.update(multimedia_ids)

                if object_id in existing_object_ids:
                    # Don't run the update query instantly; instead,
                    # set the parameters and run them all together later
                    # in bulk
                    update_params.append({
                        "_id": object_id,
                        "_title": title,
                        "_modified_date": modified_date,
                        "_metadata_hash": xml_hash
                    })
                    updates += 1
                else:
                    # Create
                    mus_object = MuseumObject(id=object_id,
                                              title=title,
                                              modified_date=modified_date,
                                              created_date=created_date,
                                              metadata_hash=xml_hash)
                    db.add(mus_object)
                    inserts += 1

                processed += 1

                if limit is not None and processed == limit:
                    all_iterated = True
                    break

            if update_params:
                # Perform updates in bulk
                stmt_a = (
                    MuseumObject.__table__.update()
                    .where(MuseumObject.id == bindparam("_id"))
                    .values({
                        "title": bindparam("_title"),
                        "metadata_hash": bindparam("_metadata_hash")
                    })
                )
                # Only move 'modified_date' forwards, never backwards
                stmt_b = (
                    MuseumObject.__table__.update()
                    .where(
                        and_(
                            MuseumObject.id == bindparam("_id"),
                            or_(
                                MuseumObject.modified_date == None,
                                MuseumObject.modified_date <
                                bindparam("_modified_date")
                            )
                        )
                    )
                    .values({"modified_date": bindparam("_modified_date")})
                )
                db.execute(stmt_a, update_params)
                db.execute(stmt_b, update_params)

            # Create/update MuseumAttachments with references
            # to the newly updated MuseumObjects.
            # For performance reasons update references for a batch
            # of objects at once
            objects = (db.query(MuseumObject).filter(
                MuseumObject.id.in_(object_ids)))
            attachments = bulk_create_or_get(db, MuseumAttachment,
                                             attachment_ids)
            attachments_by_id = {
                attachment.id: attachment
                for attachment in attachments
            }

            for museum_object in objects:
                museum_object.attachments = [
                    attachments_by_id[attachment_id] for attachment_id in
                    object_id2attachment_id[museum_object.id]
                ]

        results = []

        print(f"Updated, {inserts} inserts, {updates} "
              f"updates. Updating from offset: {index}")

        # Submit heartbeat after each successful iteration instead of once
        # at the end. This is because this script is designed to be stopped
        # before it has finished iterating everything.
        submit_heartbeat(HeartbeatSource.SYNC_OBJECTS)

        if save_progress:
            update_offset("sync_objects", offset=index)

        if all_iterated:
            if save_progress:
                finish_sync_progress("sync_objects")

            break

    await museum_session.close()
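# NOTE: sync_objects and sync_attachments are coroutines; how they are
# scheduled (cron, a worker process, etc.) is not part of this excerpt. A
# hypothetical runner might look like this:
import asyncio


def run_full_sync():
    """Hypothetical runner chaining the synchronization steps."""
    connect_db()

    # Resume from the previously saved offsets and record new progress
    asyncio.run(sync_objects(save_progress=True))
    asyncio.run(sync_attachments(save_progress=True))

    # Recompute the combined attachment hashes once both syncs have finished
    sync_hashes()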