def sync_processed_sips(days):
    """
    Synchronize processed SIPs from the DPRES service, mark the corresponding
    packages as either preserved or rejected and clean up the remaining files
    """
    connect_db()

    confirmed_sip_filenames = get_confirmed_sip_filenames(days)

    with connect_dpres_sftp() as sftp:
        accepted_sips = get_processed_sips(
            sftp, status="accepted", days=days,
            confirmed_sip_filenames=confirmed_sip_filenames)
        print(f"Found {len(accepted_sips)} accepted SIPs")

        rejected_sips = get_processed_sips(
            sftp, status="rejected", days=days,
            confirmed_sip_filenames=confirmed_sip_filenames)
        print(f"Found {len(rejected_sips)} rejected SIPs")

        completed_sips = combine_results(accepted_sips, rejected_sips)
        update_sips(completed_sips, sftp=sftp)

    submit_heartbeat(HeartbeatSource.SYNC_PROCESSED_SIPS)
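
# Illustrative usage (not part of the original module): 'sync_processed_sips'
# is meant to be run periodically, e.g. from a scheduler. The 30-day window
# below is an arbitrary example value.
def _example_sync_processed_sips():
    # Check SIPs the DPRES service has processed within the last 30 days
    sync_processed_sips(days=30)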
def cli(offset, limit, save_progress):
    """
    Run the attachment synchronization
    """
    connect_db()

    # 'sync_attachments' is a coroutine, so run it to completion on the
    # event loop
    loop = asyncio.get_event_loop()
    loop.run_until_complete(
        sync_attachments(
            offset=offset, limit=limit, save_progress=save_progress
        )
    )
def unfreeze_objects(reason=None, object_ids=None, enqueue=False):
    """
    Unfreeze objects with the given reason and/or object IDs. This allows
    them to be preserved again.

    :param str reason: Unfreeze objects with this reason
    :param list object_ids: Objects to unfreeze.
    :param bool enqueue: Whether to enqueue the unfrozen objects
                         immediately. Default is False.
    """
    connect_db()

    if not reason and not object_ids:
        raise ValueError("Either 'reason' or 'object_ids' has to be provided")

    with lock_queues():
        with scoped_session() as db:
            query = (
                db.query(MuseumObject)
                .outerjoin(
                    MuseumPackage,
                    MuseumPackage.id == MuseumObject.latest_package_id
                )
                .filter(MuseumObject.frozen == True)
            )
            if reason:
                query = query.filter(MuseumObject.freeze_reason == reason)
            if object_ids:
                object_ids = [int(object_id) for object_id in object_ids]
                query = query.filter(MuseumObject.id.in_(object_ids))

            museum_objects = list(query)

            for museum_object in museum_objects:
                museum_object.frozen = False
                museum_object.freeze_reason = None
                museum_object.freeze_source = None

                # Remove the latest package if it was *not* successfully
                # preserved to ensure the object is eligible for preservation
                remove_latest_package = (
                    museum_object.latest_package
                    and not museum_object.latest_package.preserved
                )
                if remove_latest_package:
                    museum_object.latest_package = None

                if enqueue:
                    enqueue_object(object_id=museum_object.id)

    return len(museum_objects)
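
# Illustrative usage (not part of the original module): unfreeze every object
# frozen with a given reason and push them straight back into the workflow.
# The reason string is a hypothetical example.
def _example_unfreeze_objects():
    count = unfreeze_objects(
        reason="JPEG file could not be validated", enqueue=True)
    print(f"Unfroze {count} object(s)")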
def cli():
    """
    Start a REPL session with active DB session and DB models
    """
    connect_db()
    db = DBSession()

    console = code.InteractiveConsole(locals={"db": db})
    console.runsource("from passari_workflow.db.models import *")
    console.interact(
        "SQLAlchemy database session (`db`) and Passari models are "
        "available in this console.\n"
        "\n"
        "For example, you can run the following command:\n"
        "> non_preserved_objects = "
        "db.query(MuseumObject).filter_by(preserved=False)"
    )
def enqueue_objects(object_count, random=False, object_ids=None):
    """
    Enqueue given number of objects to the preservation workflow.

    :param int object_count: How many objects to enqueue at most
    :param bool random: Whether to enqueue objects at random instead of
                        in-order.
    :param list object_ids: Object IDs to enqueue. If provided,
                            'object_count' and 'random' are ignored.
    """
    if object_ids:
        object_count = len(object_ids)

    with lock_queues():
        connect_db()
        enqueued_object_ids = get_enqueued_object_ids()

        new_job_count = 0

        with scoped_session() as db:
            object_query = (
                db.query(MuseumObject)
                .with_transformation(MuseumObject.filter_preservation_pending)
                .yield_per(500)
            )
            if object_ids:
                object_query = object_query.filter(
                    MuseumObject.id.in_(object_ids))
            if random:
                object_query = object_query.order_by(func.random())

            for museum_object in object_query:
                if museum_object.id not in enqueued_object_ids:
                    enqueue_object(museum_object.id)
                    new_job_count += 1
                    print(f"Enqueued download_object_{museum_object.id}")

                if new_job_count >= object_count:
                    break

    print(f"{new_job_count} object(s) enqueued for download")
    return new_job_count
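
# Illustrative usage (not part of the original module): enqueue up to 100
# pending objects in random order, or enqueue two specific objects by ID
# (in which case 'object_count' is overridden). The object IDs are
# hypothetical examples.
def _example_enqueue_objects():
    enqueue_objects(object_count=100, random=True)
    enqueue_objects(object_count=0, object_ids=[123456, 654321])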
def reenqueue_object(object_id: int):
    """
    Re-enqueue rejected object into the workflow
    """
    object_id = int(object_id)

    connect_db()
    queue = get_queue(QueueType.DOWNLOAD_OBJECT)

    with scoped_session() as db:
        museum_object = (
            db.query(MuseumObject)
            .join(
                MuseumPackage,
                MuseumObject.latest_package_id == MuseumPackage.id
            )
            .filter(MuseumObject.id == object_id)
            .one()
        )
        if museum_object.latest_package and \
                not museum_object.latest_package.rejected:
            raise ValueError(
                f"Latest package {museum_object.latest_package.sip_filename} "
                f"wasn't rejected"
            )

        object_ids = get_enqueued_object_ids()
        if object_id in object_ids:
            raise ValueError(
                "Object is still in the workflow and can't be re-enqueued"
            )

        museum_object.latest_package = None

        delete_jobs_for_object_id(object_id)

        queue.enqueue(
            download_object,
            kwargs={"object_id": object_id},
            job_id=f"download_object_{object_id}"
        )
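
# Illustrative usage (not part of the original module): re-enqueue an object
# whose latest package the DPRES service rejected. The call raises ValueError
# if the latest package wasn't rejected or if the object is still in the
# workflow. The object ID is a hypothetical example.
def _example_reenqueue_object():
    reenqueue_object(123456)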
def confirm_sip(object_id, sip_id):
    """
    Confirm SIP that was either preserved or rejected by the DPRES service.
    This is the last step in the preservation workflow.
    """
    object_id = int(object_id)
    connect_db()

    package_dir = Path(PACKAGE_DIR) / str(object_id)
    museum_package = MuseumObjectPackage.from_path_sync(
        package_dir, sip_id=sip_id)

    # The '.status' file contains either the text 'accepted' or 'rejected'
    status = (
        museum_package.path / f"{museum_package.sip_filename}.status"
    ).read_text()
    if status not in ("accepted", "rejected"):
        raise ValueError(f"Invalid preservation status: {status}")

    print(f"Confirming SIP {museum_package.sip_filename}")
    main(
        object_id=object_id, package_dir=PACKAGE_DIR,
        archive_dir=ARCHIVE_DIR, sip_id=sip_id, status=status
    )

    with scoped_session() as db:
        db.query(MuseumPackage).filter_by(
            sip_filename=museum_package.sip_filename
        ).update({
            MuseumPackage.preserved: status == "accepted",
            MuseumPackage.rejected: status == "rejected"
        })
        if status == "accepted":
            db.query(MuseumObject).filter_by(id=object_id).update(
                {MuseumObject.preserved: True})

    print(f"SIP {museum_package.sip_filename} confirmed")
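
# Illustrative usage (not part of the original module): confirm a processed
# SIP once an earlier step (presumably the processed-SIP sync) has written
# the '<sip_filename>.status' file. The object ID and SIP ID are hypothetical
# examples; the SIP ID format matches the timestamp assigned in
# 'download_object'.
def _example_confirm_sip():
    confirm_sip(object_id=123456, sip_id="20240101-120000")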
def submit_sip(object_id, sip_id):
    """
    Submit SIP to the DPRES service. The next workflow task will be enqueued
    by 'sync_processed_sips', which periodically checks the processed SIPs
    """
    object_id = int(object_id)
    connect_db()

    package_dir = Path(PACKAGE_DIR) / str(object_id)

    # Retrieve the latest SIP filename
    museum_package = MuseumObjectPackage.from_path_sync(
        package_dir, sip_id=sip_id)
    filename = museum_package.sip_filename

    with scoped_session() as db:
        package_uploaded = db.query(
            exists().where(and_(
                MuseumPackage.sip_filename == museum_package.sip_filename,
                MuseumPackage.uploaded == True
            ))
        ).scalar()
        if package_uploaded:
            raise RuntimeError(f"Package {filename} already uploaded")

    print(f"Submitting {filename} for Object {object_id}")
    museum_package = main(
        object_id=object_id, package_dir=PACKAGE_DIR, sip_id=sip_id)
    print(f"Package {filename} submitted, removing local file")

    with scoped_session() as db:
        db_museum_package = db.query(MuseumPackage).filter_by(
            sip_filename=museum_package.sip_filename).one()
        db_museum_package.uploaded = True

    # Delete the generated SIP to free space
    os.remove(museum_package.sip_archive_path)
def reset_workflow():
    """
    Reset workflow after a PostgreSQL backup restoration by removing
    in-process packages that were not submitted to the DPRES service but
    were still in the workflow at the time the backup was initiated.
    """
    with lock_queues():
        connect_db()

        with scoped_session() as db:
            # Get objects that have been downloaded or packaged, but which
            # haven't been uploaded yet
            objects = (
                db.query(MuseumObject)
                .join(
                    MuseumPackage,
                    MuseumPackage.id == MuseumObject.latest_package_id
                )
                .filter(
                    MuseumPackage.uploaded == False,
                    or_(MuseumPackage.downloaded, MuseumPackage.packaged)
                )
            )
            objects = list(objects)
            print(f"Found {len(objects)} dangling objects")

            for mus_object in objects:
                mus_package = mus_object.latest_package

                # Remove the lingering package from the MuseumObject to make
                # the object eligible for preservation again.
                mus_object.latest_package = None
                db.delete(mus_package)

                try:
                    shutil.rmtree(Path(PACKAGE_DIR) / str(mus_object.id))
                except OSError:
                    # Directory does not exist; ignore
                    pass

    print("Done!")
def engine(database, monkeypatch):
    """
    Fixture for creating an empty database on each test run
    """
    monkeypatch.setitem(CONFIG["db"], "user", database.user)
    monkeypatch.setitem(
        CONFIG["db"], "password",
        # Password authentication is used when running tests under Docker
        os.environ.get("POSTGRES_PASSWORD", ""))
    monkeypatch.setitem(CONFIG["db"], "host", database.host)
    monkeypatch.setitem(CONFIG["db"], "port", database.port)
    monkeypatch.setitem(CONFIG["db"], "name", "passari_test")

    engine = connect_db()
    engine.echo = True

    # The pg_trgm extension must exist
    engine.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm")

    Base.metadata.create_all(engine)
    yield engine
    Base.metadata.drop_all(engine)
def engine(database, monkeypatch):
    """
    Fixture for creating an empty database on each test run
    """
    monkeypatch.setitem(WORKFLOW_CONFIG["db"], "user", database.user)
    monkeypatch.setitem(
        WORKFLOW_CONFIG["db"], "password",
        # Password authentication is used when running tests using Docker
        os.environ.get("POSTGRES_PASSWORD", ""))
    monkeypatch.setitem(WORKFLOW_CONFIG["db"], "host", database.host)
    monkeypatch.setitem(WORKFLOW_CONFIG["db"], "port", database.port)
    monkeypatch.setitem(WORKFLOW_CONFIG["db"], "name", "passari_test")

    engine = connect_db()

    # The pg_trgm extension must exist
    engine.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm")

    Base.metadata.create_all(engine)
    AuthBase.metadata.create_all(engine)
    yield engine
    Base.metadata.drop_all(engine)
    AuthBase.metadata.drop_all(engine)
def download_object(object_id):
    """
    Download an object from MuseumPlus and enqueue the task 'create_sip'
    once the object is downloaded
    """
    object_id = int(object_id)
    connect_db()

    # Create a SIP ID from the current time
    sip_id = datetime.datetime.now(
        datetime.timezone.utc).strftime("%Y%m%d-%H%M%S")

    try:
        museum_package = main(
            object_id=object_id, package_dir=PACKAGE_DIR,
            # 'sip_id' is optional, but giving it as a kwarg ensures the
            # filename of the SIP is correct before it is created.
            sip_id=sip_id)
    except PreservationError as exc:
        # If a PreservationError was raised, freeze the object
        freeze_running_object(
            object_id=object_id, sip_id=sip_id, freeze_reason=exc.error)
        return
    except OSError as exc:
        if exc.errno == errno.ENOSPC:
            raise OSError(
                errno.ENOSPC,
                "Ran out of disk space. This may have happened because the "
                "package directory ran out of space while downloading a "
                "large attachment. Try removing packages from the directory "
                "and trying again while processing fewer packages at the "
                "same time.")
        raise

    filename = museum_package.sip_filename

    with scoped_session() as db:
        db_museum_object = db.query(MuseumObject).filter(
            MuseumObject.id == object_id).one()
        db_package = db.query(MuseumPackage).filter_by(
            sip_filename=filename).first()

        # Get the attachments that currently exist for this object
        # and add them to the new MuseumPackage
        attachment_ids = museum_package.museum_object.attachment_ids
        db_attachments = bulk_create_or_get(
            db, MuseumAttachment, attachment_ids)

        if not db_package:
            db_package = MuseumPackage(
                sip_filename=filename,
                sip_id=sip_id,
                object_modified_date=(
                    museum_package.museum_object.modified_date),
                downloaded=True,
                metadata_hash=db_museum_object.metadata_hash,
                attachment_metadata_hash=(
                    db_museum_object.attachment_metadata_hash),
                attachments=db_attachments)
            db_package.museum_object = db_museum_object
        else:
            raise EnvironmentError(
                f"Package with filename {filename} already exists")

        db_museum_object.latest_package = db_package

    queue = get_queue(QueueType.CREATE_SIP)
    queue.enqueue(
        create_sip,
        kwargs={"object_id": object_id, "sip_id": sip_id},
        job_id=f"create_sip_{object_id}")
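
# Illustrative sketch (not part of the original module): the SIP ID assigned
# above is a UTC timestamp, so IDs sort chronologically and stay unique per
# object as long as two downloads don't start within the same second.
def _example_sip_id():
    import datetime
    # Returns e.g. '20240101-120000'
    return datetime.datetime.now(
        datetime.timezone.utc).strftime("%Y%m%d-%H%M%S")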
def create_sip(object_id, sip_id):
    """
    Create SIP from a downloaded object and enqueue the task 'submit_sip'
    once the object is packaged into a SIP
    """
    object_id = int(object_id)
    connect_db()

    # Are we creating a SIP for the first time or updating a preserved
    # package?
    created_date, modified_date = None, None

    with scoped_session() as db:
        last_preserved_package = (
            db.query(MuseumPackage)
            .filter(MuseumPackage.museum_object_id == object_id)
            .filter(MuseumPackage.preserved == True)
            .order_by(MuseumPackage.created_date.desc())
            .first()
        )
        current_package = (
            db.query(MuseumObject)
            .join(
                MuseumPackage,
                MuseumObject.latest_package_id == MuseumPackage.id
            )
            .filter(MuseumObject.id == object_id)
            .one()
            .latest_package
        )

        if not last_preserved_package:
            # We haven't created a preserved SIP yet
            print(f"Creating submission SIP for Object {object_id}")
            created_date = current_package.created_date
        else:
            # We are updating an existing package
            print(f"Creating update SIP for Object {object_id}")
            created_date = last_preserved_package.created_date
            modified_date = current_package.created_date

    # Run the 'create_sip' script
    try:
        museum_package = main(
            object_id=object_id,
            package_dir=PACKAGE_DIR,
            sip_id=sip_id,
            create_date=created_date,
            modify_date=modified_date,
            update=bool(modified_date)
        )
    except PreservationError as exc:
        # If a PreservationError was raised, freeze the object and prevent
        # it from going further in the workflow.
        freeze_running_object(
            object_id=object_id,
            sip_id=sip_id,
            freeze_reason=exc.error
        )
        return
    except OSError as exc:
        if exc.errno == errno.ENOSPC:
            raise OSError(
                errno.ENOSPC,
                "Ran out of disk space. This may have happened because the "
                "package directory ran out of space while downloading a "
                "large attachment. Try removing packages from the directory "
                "and trying again while processing fewer packages at the "
                "same time."
            )
        raise

    filename = museum_package.sip_filename
    print(f"Created SIP for Object {object_id}, updating database")

    with scoped_session() as db:
        db_package = db.query(MuseumPackage).filter(
            MuseumPackage.sip_filename == filename
        ).one()
        db_package.packaged = True
        db.query(MuseumObject).filter(
            MuseumObject.id == object_id
        ).update({MuseumObject.latest_package_id: db_package.id})

    queue = get_queue(QueueType.SUBMIT_SIP)
    queue.enqueue(
        submit_sip,
        kwargs={"object_id": object_id, "sip_id": sip_id},
        job_id=f"submit_sip_{object_id}"
    )
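
# Illustrative summary (not part of the original module) of the date logic
# above: a first-time submission SIP carries only a creation date, while an
# update SIP keeps the original creation date and uses the new package's
# creation date as its modification date.
def _example_sip_dates(last_preserved_package, current_package):
    if last_preserved_package is None:
        # Submission SIP: no modification date
        return current_package.created_date, None
    # Update SIP: original creation date plus a modification date
    return (last_preserved_package.created_date,
            current_package.created_date)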
def cli():
    connect_db()
    sync_hashes()
def freeze_objects(object_ids, reason, source, delete_jobs=True):
    """
    Freeze objects to prevent them from being included in the preservation
    workflow

    :returns: (freeze_count, cancel_count) tuple for how many objects were
              frozen and how many packages were cancelled as a result
    """
    object_ids = [int(object_id) for object_id in object_ids]
    source = FreezeSource(source)

    with lock_queues():
        # Are there object IDs that we're about to freeze but that are
        # still running?
        running_object_ids = get_running_object_ids()
        conflicting_object_ids = set(object_ids) & set(running_object_ids)
        if conflicting_object_ids:
            raise WorkflowJobRunningError(
                "The following object IDs have running jobs and can't be "
                "frozen: "
                f"{', '.join([str(o) for o in sorted(conflicting_object_ids)])}"
            )

        connect_db()
        with scoped_session() as db:
            freeze_count = (
                db.query(MuseumObject)
                .filter(MuseumObject.id.in_(object_ids))
                .update(
                    {
                        MuseumObject.frozen: True,
                        MuseumObject.freeze_reason: reason,
                        MuseumObject.freeze_source: source
                    },
                    synchronize_session=False
                )
            )

            packages_to_cancel = list(
                db.query(MuseumPackage)
                .join(
                    MuseumObject,
                    MuseumObject.latest_package_id == MuseumPackage.id
                )
                .filter(
                    MuseumPackage.museum_object_id.in_(object_ids),
                    MuseumPackage.preserved == False,
                    MuseumPackage.rejected == False,
                    MuseumPackage.cancelled == False
                )
            )
            for package in packages_to_cancel:
                package.cancelled = True
                try:
                    museum_package = MuseumObjectPackage.from_path_sync(
                        Path(PACKAGE_DIR) / str(package.museum_object_id),
                        sip_id=package.sip_id
                    )
                    museum_package.copy_log_files_to_archive(ARCHIVE_DIR)
                except FileNotFoundError:
                    # If the SIP doesn't exist, just skip it
                    pass

        # Cancel any jobs for each object ID if enabled
        if delete_jobs:
            for object_id in object_ids:
                delete_jobs_for_object_id(object_id)

                # Delete the museum package directory
                try:
                    shutil.rmtree(Path(PACKAGE_DIR) / str(object_id))
                except OSError:
                    # Directory does not exist
                    pass

    return freeze_count, len(packages_to_cancel)
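
# Illustrative usage (not part of the original module): freeze two objects so
# the workflow skips them. The object IDs and reason are hypothetical
# examples, and "user" is assumed here to be a valid FreezeSource value.
def _example_freeze_objects():
    freeze_count, cancel_count = freeze_objects(
        object_ids=[123456, 654321],
        reason="Waiting for copyright clearance",
        source="user"
    )
    print(
        f"Froze {freeze_count} object(s), "
        f"cancelled {cancel_count} package(s)"
    )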