Example #1
def sync_processed_sips(days):
    """
    Synchronize processed SIPs from the DPRES service, mark the corresponding
    packages as either preserved or rejected, and clean up the remaining files
    """
    connect_db()

    confirmed_sip_filenames = get_confirmed_sip_filenames(days)

    with connect_dpres_sftp() as sftp:
        accepted_sips = get_processed_sips(
            sftp,
            status="accepted",
            days=days,
            confirmed_sip_filenames=confirmed_sip_filenames)
        print(f"Found {len(accepted_sips)} accepted SIPs")

        rejected_sips = get_processed_sips(
            sftp,
            status="rejected",
            days=days,
            confirmed_sip_filenames=confirmed_sip_filenames)
        print(f"Found {len(rejected_sips)} rejected SIPs")

        completed_sips = combine_results(accepted_sips, rejected_sips)

        update_sips(completed_sips, sftp=sftp)

        submit_heartbeat(HeartbeatSource.SYNC_PROCESSED_SIPS)
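A minimal invocation sketch (hypothetical): the function only needs the
look-back window in days, so a scheduled wrapper could call it directly.

if __name__ == "__main__":
    # Sync SIPs the DPRES service has processed during the last 7 days
    sync_processed_sips(days=7)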
Example #2
def cli(offset, limit, save_progress):
    connect_db()

    loop = asyncio.get_event_loop()
    loop.run_until_complete(
        sync_attachments(
            offset=offset, limit=limit, save_progress=save_progress
        )
    )
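On Python 3.7+, the event loop boilerplate above can also be written with
asyncio.run, which creates and closes the loop itself. A sketch of the
equivalent call (not a drop-in change if the application manages its own loop
elsewhere):

asyncio.run(
    sync_attachments(offset=offset, limit=limit, save_progress=save_progress)
)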
Example #3
def unfreeze_objects(reason=None, object_ids=None, enqueue=False):
    """
    Unfreeze objects with the given reason and/or object IDs.

    This allows them to be preserved again.

    :param str reason: Unfreeze objects with this reason
    :param list object_ids: Objects to unfreeze.
    :param bool enqueue: Whether to enqueue the unfrozen objects immediately.
                         Default is False.
    """
    connect_db()

    if not reason and not object_ids:
        raise ValueError("Either 'reason' or 'object_ids' has to be provided")

    with lock_queues():
        with scoped_session() as db:
            query = (
                db.query(MuseumObject)
                .outerjoin(
                    MuseumPackage,
                    MuseumPackage.id == MuseumObject.latest_package_id
                )
                .filter(MuseumObject.frozen == True)
            )

            if reason:
                query = query.filter(MuseumObject.freeze_reason == reason)
            if object_ids:
                object_ids = [int(object_id) for object_id in object_ids]
                query = query.filter(MuseumObject.id.in_(object_ids))

            museum_objects = list(query)
            for museum_object in museum_objects:
                museum_object.frozen = False
                museum_object.freeze_reason = None
                museum_object.freeze_source = None

                # Remove the latest package if it was *not* successfully
                # preserved to ensure the object is eligible for preservation
                remove_latest_package = (
                    museum_object.latest_package
                    and not museum_object.latest_package.preserved
                )

                if remove_latest_package:
                    museum_object.latest_package = None

                if enqueue:
                    enqueue_object(object_id=museum_object.id)

            return len(museum_objects)
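A usage sketch with hypothetical IDs, assuming unfreeze_objects above is
imported: unfreeze two specific objects and push them straight back into the
preservation queue.

count = unfreeze_objects(object_ids=[123, 456], enqueue=True)
print(f"Unfroze {count} object(s)")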
Example #4
def cli():
    """
    Start a REPL session with active DB session and DB models
    """
    connect_db()
    db = DBSession()

    console = code.InteractiveConsole(locals={"db": db})
    console.runsource("from passari_workflow.db.models import *")
    console.interact(
        "SQLAlchemy database session (`db`) and Passari models are "
        "available in this console.\n"
        "\n"
        "For example, you can run the following command:\n"
        "> non_preserved_objects = "
        "db.query(MuseumObject).filter_by(preserved=False)"
    )
Example #5
def enqueue_objects(object_count, random=False, object_ids=None):
    """
    Enqueue the given number of objects to the preservation workflow.

    :param int object_count: How many objects to enqueue at most
    :param bool random: Whether to enqueue objects at random instead
                        of in-order.
    :param list object_ids: Object IDs to enqueue. If provided, 'object_count'
                            and 'random' are ignored.
    """
    if object_ids:
        object_count = len(object_ids)

    with lock_queues():
        connect_db()
        enqueued_object_ids = get_enqueued_object_ids()

        new_job_count = 0

        with scoped_session() as db:
            object_query = (db.query(MuseumObject).with_transformation(
                MuseumObject.filter_preservation_pending).yield_per(500))

            if object_ids:
                object_query = object_query.filter(
                    MuseumObject.id.in_(object_ids))

            if random:
                object_query = object_query.order_by(func.random())

            for museum_object in object_query:
                if museum_object.id not in enqueued_object_ids:
                    enqueue_object(museum_object.id)
                    new_job_count += 1
                    print(f"Enqueued download_object_{museum_object.id}")

                if new_job_count >= object_count:
                    break

    print(f"{new_job_count} object(s) enqueued for download")

    return new_job_count
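A usage sketch (hypothetical count), assuming enqueue_objects above is
imported: enqueue at most ten pending objects, picked at random instead of
in-order.

new_jobs = enqueue_objects(object_count=10, random=True)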
Example #6
def reenqueue_object(object_id: int):
    """
    Re-enqueue a rejected object into the workflow
    """
    object_id = int(object_id)
    connect_db()

    queue = get_queue(QueueType.DOWNLOAD_OBJECT)

    with scoped_session() as db:
        museum_object = (
            db.query(MuseumObject)
            .join(
                MuseumPackage,
                MuseumObject.latest_package_id == MuseumPackage.id
            )
            .filter(MuseumObject.id == object_id)
            .one()
        )

        if museum_object.latest_package and \
                not museum_object.latest_package.rejected:
            raise ValueError(
                f"Latest package {museum_object.latest_package.sip_filename} "
                f"wasn't rejected"
            )

        object_ids = get_enqueued_object_ids()

        if object_id in object_ids:
            raise ValueError(
                f"Object is still in the workflow and can't be re-enqueued"
            )

        museum_object.latest_package = None

        delete_jobs_for_object_id(object_id)

        queue.enqueue(
            download_object, kwargs={"object_id": object_id},
            job_id=f"download_object_{object_id}"
        )
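A usage sketch with a hypothetical object ID. Both failure modes seen above
(latest package not rejected, object still in the workflow) raise ValueError:

try:
    reenqueue_object(1234)
except ValueError as exc:
    print(f"Cannot re-enqueue: {exc}")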
Example #7
def confirm_sip(object_id, sip_id):
    """
    Confirm SIP that was either preserved or rejected by the DPRES service.
    This is the last step in the preservation workflow.
    """
    object_id = int(object_id)
    connect_db()

    package_dir = Path(PACKAGE_DIR) / str(object_id)

    museum_package = MuseumObjectPackage.from_path_sync(package_dir,
                                                        sip_id=sip_id)
    # '.status' file contains either the text 'accepted' or 'rejected'
    status = (museum_package.path /
              f"{museum_package.sip_filename}.status").read_text()

    if status not in ("accepted", "rejected"):
        raise ValueError(f"Invalid preservation status: {status}")

    print(f"Confirming SIP {museum_package.sip_filename}")
    main(object_id=object_id,
         package_dir=PACKAGE_DIR,
         archive_dir=ARCHIVE_DIR,
         sip_id=sip_id,
         status=status)

    with scoped_session() as db:
        db.query(MuseumPackage).filter_by(
            sip_filename=museum_package.sip_filename).update({
                MuseumPackage.preserved:
                bool(status == "accepted"),
                MuseumPackage.rejected:
                bool(status == "rejected")
            })

        if status == "accepted":
            db.query(MuseumObject).filter_by(id=object_id).update(
                {MuseumObject.preserved: True})

    print(f"SIP {museum_package.sip_filename} confirmed")
Example #8
def submit_sip(object_id, sip_id):
    """
    Submit SIP to the DPRES service.

    The next workflow task will be enqueued by 'sync_processed_sips' which
    periodically checks the processed SIPs
    """
    object_id = int(object_id)
    connect_db()

    package_dir = Path(PACKAGE_DIR) / str(object_id)

    # Retrieve the latest SIP filename
    museum_package = MuseumObjectPackage.from_path_sync(package_dir,
                                                        sip_id=sip_id)
    filename = museum_package.sip_filename

    with scoped_session() as db:
        package_uploaded = db.query(exists().where(
            and_(MuseumPackage.sip_filename == museum_package.sip_filename,
                 MuseumPackage.uploaded == True))).scalar()
        if package_uploaded:
            raise RuntimeError(f"Package {filename} already uploaded")

    print(f"Submitting {filename} for Object {object_id}")

    museum_package = main(object_id=object_id,
                          package_dir=PACKAGE_DIR,
                          sip_id=sip_id)

    print(f"Package {filename} submitted, removing local file")

    with scoped_session() as db:
        db_museum_package = db.query(MuseumPackage).filter_by(
            sip_filename=museum_package.sip_filename).one()
        db_museum_package.uploaded = True

    # Delete the generated SIP to free space
    os.remove(museum_package.sip_archive_path)
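A usage sketch with hypothetical values; submitting a package that was already
uploaded raises RuntimeError, as seen above:

try:
    submit_sip(object_id=1234, sip_id="20240101-120000")
except RuntimeError:
    print("Package was already uploaded")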
Example #9
def reset_workflow():
    """
    Reset workflow after a PostgreSQL backup restoration by removing in-process
    packages that were not submitted to the DPRES service but were still
    in the workflow at the time the backup was initiated.
    """
    with lock_queues():
        connect_db()

        with scoped_session() as db:
            # Get objects that have been downloaded or packaged, but which
            # haven't been uploaded yet
            objects = (db.query(MuseumObject).join(
                MuseumPackage,
                MuseumPackage.id == MuseumObject.latest_package_id).filter(
                    MuseumPackage.uploaded == False,
                    or_(MuseumPackage.downloaded, MuseumPackage.packaged)))
            objects = list(objects)

            print(f"Found {len(objects)} dangling objects")

            for mus_object in objects:
                mus_package = mus_object.latest_package

                # Remove the lingering package from the MuseumObject to make
                # the object eligible for preservation again.
                mus_object.latest_package = None
                db.delete(mus_package)

                try:
                    shutil.rmtree(Path(PACKAGE_DIR) / str(mus_object.id))
                except OSError:
                    # Directory does not exist; ignore
                    pass

    print("Done!")
Example #10
def engine(database, monkeypatch):
    monkeypatch.setitem(CONFIG["db"], "user", database.user)
    monkeypatch.setitem(
        CONFIG["db"],
        "password",
        # Password authentication is used when running tests under Docker
        os.environ.get("POSTGRES_PASSWORD", ""))
    monkeypatch.setitem(CONFIG["db"], "host", database.host)
    monkeypatch.setitem(CONFIG["db"], "port", database.port)
    monkeypatch.setitem(CONFIG["db"], "name", "passari_test")

    engine = connect_db()
    engine.echo = True

    # pg_trgm extension must exist
    engine.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm")

    Base.metadata.create_all(engine)
    yield engine
    Base.metadata.drop_all(engine)
Example #11
def engine(database, monkeypatch):
    """
    Fixture for creating an empty database on each test run
    """
    monkeypatch.setitem(WORKFLOW_CONFIG["db"], "user", database.user)
    monkeypatch.setitem(
        WORKFLOW_CONFIG["db"],
        "password",
        # Password authentication is used when running tests using Docker
        os.environ.get("POSTGRES_PASSWORD", ""))
    monkeypatch.setitem(WORKFLOW_CONFIG["db"], "host", database.host)
    monkeypatch.setitem(WORKFLOW_CONFIG["db"], "port", database.port)
    monkeypatch.setitem(WORKFLOW_CONFIG["db"], "name", "passari_test")

    engine = connect_db()

    # pg_trgm extension must exist
    engine.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm")

    Base.metadata.create_all(engine)
    AuthBase.metadata.create_all(engine)
    yield engine
    Base.metadata.drop_all(engine)
    AuthBase.metadata.drop_all(engine)
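A sketch of how a test could consume this fixture (hypothetical test; assumes
pytest can resolve both the fixture above and its 'database' dependency, and
the table name below is a guess):

def test_tables_exist(engine):
    # The fixture yields a live SQLAlchemy 1.x engine bound to 'passari_test'
    # with all model tables created
    assert "museum_objects" in engine.table_names()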
Example #12
def download_object(object_id):
    """
    Download an object from MuseumPlus and enqueue the task 'create_sip'
    once the object is downloaded
    """
    object_id = int(object_id)
    connect_db()

    # Create a SIP id from the current time
    sip_id = datetime.datetime.now(
        datetime.timezone.utc).strftime("%Y%m%d-%H%M%S")

    try:
        museum_package = main(
            object_id=int(object_id),
            package_dir=PACKAGE_DIR,
            # 'sip_id' is optional, but giving it as a kwarg ensures the
            # filename of the SIP is correct before it is created.
            sip_id=sip_id)
    except PreservationError as exc:
        # If a PreservationError was raised, freeze the object
        freeze_running_object(object_id=object_id,
                              sip_id=sip_id,
                              freeze_reason=exc.error)
        return
    except OSError as exc:
        if exc.errno == errno.ENOSPC:
            raise OSError(
                errno.ENOSPC,
                "Ran out of disk space. This may have happened because the "
                "package directory ran out of space while downloading a "
                "large attachment. Try removing packages from the directory "
                "and trying again by processing less packages at the same "
                "time.")

        raise

    filename = museum_package.sip_filename

    with scoped_session() as db:
        db_museum_object = db.query(MuseumObject).filter(
            MuseumObject.id == object_id).one()

        db_package = db.query(MuseumPackage).filter_by(
            sip_filename=filename).first()

        # Get the attachments that currently exist for this object
        # and add them to the new MuseumPackage
        attachment_ids = museum_package.museum_object.attachment_ids
        db_attachments = bulk_create_or_get(db, MuseumAttachment,
                                            attachment_ids)

        if not db_package:
            db_package = MuseumPackage(
                sip_filename=filename,
                sip_id=sip_id,
                object_modified_date=(
                    museum_package.museum_object.modified_date),
                downloaded=True,
                metadata_hash=db_museum_object.metadata_hash,
                attachment_metadata_hash=(
                    db_museum_object.attachment_metadata_hash),
                attachments=db_attachments)
            db_package.museum_object = db_museum_object
        else:
            raise EnvironmentError(
                f"Package with filename {filename} already exists")

        db_museum_object.latest_package = db_package

        queue = get_queue(QueueType.CREATE_SIP)
        queue.enqueue(create_sip,
                      kwargs={
                          "object_id": object_id,
                          "sip_id": sip_id
                      },
                      job_id=f"create_sip_{object_id}")
Example #13
def create_sip(object_id, sip_id):
    """
    Create a SIP from a downloaded object and enqueue the task 'submit_sip'
    once the object is packaged into a SIP
    """
    object_id = int(object_id)
    connect_db()

    # Are we creating a SIP for the first time or updating a preserved
    # package?
    created_date, modified_date = None, None
    with scoped_session() as db:
        last_preserved_package = (
            db.query(MuseumPackage)
            .filter(MuseumPackage.museum_object_id == object_id)
            .filter(MuseumPackage.preserved == True)
            .order_by(MuseumPackage.created_date.desc())
            .first()
        )
        current_package = (
            db.query(MuseumObject)
            .join(
                MuseumPackage,
                MuseumObject.latest_package_id == MuseumPackage.id
            )
            .filter(MuseumObject.id == object_id)
            .one()
            .latest_package
        )

        if not last_preserved_package:
            # We haven't created a preserved SIP yet
            print(f"Creating submission SIP for Object {object_id}")
            created_date = current_package.created_date
        else:
            # We are updating an existing package
            print(f"Creating update SIP for Object {object_id}")
            created_date = last_preserved_package.created_date
            modified_date = current_package.created_date

    # Run the 'create_sip' script
    try:
        museum_package = main(
            object_id=object_id, package_dir=PACKAGE_DIR, sip_id=sip_id,
            create_date=created_date, modify_date=modified_date,
            update=bool(modified_date)
        )
    except PreservationError as exc:
        # If a PreservationError was raised, freeze the object and prevent
        # the object from going further in the workflow.
        freeze_running_object(
            object_id=object_id,
            sip_id=sip_id,
            freeze_reason=exc.error
        )
        return
    except OSError as exc:
        if exc.errno == errno.ENOSPC:
            raise OSError(
                errno.ENOSPC,
                "Ran out of disk space. This may have happened because the "
                "package directory ran out of space while downloading a "
                "large attachment. Try removing packages from the directory "
                "and trying again by processing less packages at the same "
                "time."
            )

        raise

    filename = museum_package.sip_filename

    print(f"Created SIP for Object {object_id}, updating database")

    with scoped_session() as db:
        db_package = db.query(MuseumPackage).filter(
            MuseumPackage.sip_filename == filename
        ).one()
        db_package.packaged = True
        db.query(MuseumObject).filter(
            MuseumObject.id == object_id
        ).update({MuseumObject.latest_package_id: db_package.id})

        queue = get_queue(QueueType.SUBMIT_SIP)
        queue.enqueue(
            submit_sip, kwargs={"object_id": object_id, "sip_id": sip_id},
            job_id=f"submit_sip_{object_id}"
        )
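Like 'download_object', this task is enqueued as an RQ job; a sketch mirroring
the enqueue call at the end of Example #12, with hypothetical values:

queue = get_queue(QueueType.CREATE_SIP)
queue.enqueue(
    create_sip, kwargs={"object_id": 1234, "sip_id": "20240101-120000"},
    job_id="create_sip_1234"
)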
Example #14
def cli():
    connect_db()
    sync_hashes()
Example #15
def freeze_objects(object_ids, reason, source, delete_jobs=True):
    """
    Freeze objects to prevent them from being included in the preservation
    workflow

    :returns: (freeze_count, cancel_count) tuple for how many objects were
              frozen and how many packages were cancelled as a result
    """
    object_ids = [int(object_id) for object_id in object_ids]
    source = FreezeSource(source)

    with lock_queues():
        # Are there object IDs that we're about to freeze but that are
        # still running?
        running_object_ids = get_running_object_ids()
        conflicting_object_ids = set(object_ids) & set(running_object_ids)

        if conflicting_object_ids:
            raise WorkflowJobRunningError(
                "The following object IDs have running jobs and can't be "
                f"frozen: {', '.join([str(o) for o in sorted(conflicting_object_ids)])}"
            )

        connect_db()
        with scoped_session() as db:
            freeze_count = (db.query(MuseumObject).filter(
                MuseumObject.id.in_(object_ids)).update(
                    {
                        MuseumObject.frozen: True,
                        MuseumObject.freeze_reason: reason,
                        MuseumObject.freeze_source: source
                    },
                    synchronize_session=False))

            packages_to_cancel = list(
                db.query(MuseumPackage).join(
                    MuseumObject,
                    MuseumObject.latest_package_id == MuseumPackage.id).filter(
                        MuseumPackage.museum_object_id.in_(object_ids),
                        MuseumPackage.preserved == False,
                        MuseumPackage.rejected == False,
                        MuseumPackage.cancelled == False))

            for package in packages_to_cancel:
                package.cancelled = True

                try:
                    museum_package = MuseumObjectPackage.from_path_sync(
                        Path(PACKAGE_DIR) / str(package.museum_object_id),
                        sip_id=package.sip_id)
                    museum_package.copy_log_files_to_archive(ARCHIVE_DIR)
                except FileNotFoundError:
                    # If the SIP doesn't exist, just skip it
                    pass

            # Cancel any jobs for each object ID if enabled
            if delete_jobs:
                for object_id in object_ids:
                    delete_jobs_for_object_id(object_id)

                    # Delete the museum package directory
                    try:
                        shutil.rmtree(Path(PACKAGE_DIR) / str(object_id))
                    except OSError:
                        # Directory does not exist
                        pass

        return freeze_count, len(packages_to_cancel)
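A usage sketch with hypothetical values; the 'source' string must map to a
FreezeSource member, and WorkflowJobRunningError is raised if any object still
has running jobs:

try:
    frozen, cancelled = freeze_objects(
        object_ids=[123, 456], reason="Invalid metadata",
        source="user"  # assumed FreezeSource value
    )
    print(f"Froze {frozen} object(s), cancelled {cancelled} package(s)")
except WorkflowJobRunningError:
    print("Objects still have running jobs; retry once they finish")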