def update_offset(name, offset):
    """
    Update the current offset in the database
    """
    with scoped_session() as db:
        sync_status = _get_sync_status(db, name)
        sync_status.offset = offset

def freeze_running_object(object_id, sip_id, freeze_reason):
    """
    Cancel and freeze a MuseumObject that is currently in the workflow,
    and mark the SIP as cancelled if one was created.
    """
    with scoped_session() as db:
        museum_object = (
            db.query(MuseumObject)
            .filter(MuseumObject.id == object_id)
            .one()
        )
        museum_object.frozen = True
        museum_object.freeze_reason = freeze_reason
        museum_object.freeze_source = FreezeSource.AUTOMATIC

        is_same_package = (
            museum_object.latest_package
            and museum_object.latest_package.sip_id == sip_id
        )
        # If a package was created, cancel it
        if is_same_package:
            museum_object.latest_package.cancelled = True

        # Copy log files to the archive if they were created
        try:
            museum_package = MuseumObjectPackage.from_path_sync(
                Path(PACKAGE_DIR) / str(object_id), sip_id=sip_id
            )
            museum_package.copy_log_files_to_archive(ARCHIVE_DIR)
        except FileNotFoundError:
            # No object directory and/or log files were created for this
            # package yet
            pass

        try:
            shutil.rmtree(Path(PACKAGE_DIR) / str(object_id))
        except FileNotFoundError:
            # Object directory didn't exist yet
            pass

def update_sip(sip, sftp, queue):
    """
    Update a single SIP by downloading its ingest reports and enqueueing the
    final task to confirm it
    """
    with scoped_session() as db:
        db_museum_package = (
            db.query(MuseumPackage)
            .join(
                MuseumObject,
                MuseumObject.id == MuseumPackage.museum_object_id
            )
            .filter(
                and_(
                    MuseumPackage.sip_filename == sip.sip_filename,
                    MuseumPackage.preserved == False,
                    MuseumPackage.rejected == False
                )
            )
            .one_or_none()
        )

        if not db_museum_package:
            return

        if sip.status == "accepted":
            # Package was accepted
            db_museum_package.preserved = True
        elif sip.status == "rejected":
            db_museum_package.rejected = True

        object_id = db_museum_package.museum_object.id

        package_dir = Path(PACKAGE_DIR) / str(object_id)
        museum_package = MuseumObjectPackage.from_path_sync(package_dir)

        xml_temp_path = museum_package.log_dir / "ingest-report.xml.download"
        xml_report_path = museum_package.log_dir / "ingest-report.xml"

        # The HTML report exists at the same path and name, but with a
        # different suffix
        html_remote_path = sip.report_path.with_suffix(".html")
        html_temp_path = (
            museum_package.log_dir / "ingest-report.html.download"
        )
        html_report_path = museum_package.log_dir / "ingest-report.html"

        # Download the ingest reports to the log directory
        sftp.get(str(sip.report_path), str(xml_temp_path))
        os.rename(xml_temp_path, xml_report_path)

        sftp.get(str(html_remote_path), str(html_temp_path))
        os.rename(html_temp_path, html_report_path)

        # Remove the directory containing the rejected SIP so that the DPRES
        # service does not store the package unnecessarily
        if sip.status == "rejected":
            sftp_rmtree(sftp, sip.transfer_path)

        # Write the status for use by the 'confirm_sip' task
        (package_dir / f"{sip.sip_filename}.status").write_text(sip.status)

        # Enqueue the final task
        queue.enqueue(
            confirm_sip,
            kwargs={
                "object_id": object_id,
                "sip_id": db_museum_package.sip_id
            },
            job_id=f"confirm_sip_{object_id}"
        )

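# Note on the handshake above: the status file is written next to the package
# as <PACKAGE_DIR>/<object_id>/<sip_filename>.status and contains the literal
# text "accepted" or "rejected". 'confirm_sip' later reads that same file
# (assuming 'museum_package.path' resolves to the same per-object directory)
# to decide how to finish the workflow.
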
def unfreeze_objects(reason=None, object_ids=None, enqueue=False):
    """
    Unfreeze objects with the given reason and/or object IDs. This allows
    them to be preserved again.

    :param str reason: Unfreeze objects with this reason
    :param list object_ids: Objects to unfreeze.
    :param bool enqueue: Whether to enqueue the unfrozen objects immediately.
                         Default is False.
    """
    connect_db()

    if not reason and not object_ids:
        raise ValueError("Either 'reason' or 'object_ids' has to be provided")

    with lock_queues():
        with scoped_session() as db:
            query = (
                db.query(MuseumObject)
                .outerjoin(
                    MuseumPackage,
                    MuseumPackage.id == MuseumObject.latest_package_id
                )
                .filter(MuseumObject.frozen == True)
            )
            if reason:
                query = query.filter(MuseumObject.freeze_reason == reason)
            if object_ids:
                object_ids = [int(object_id) for object_id in object_ids]
                query = query.filter(MuseumObject.id.in_(object_ids))

            museum_objects = list(query)

            for museum_object in museum_objects:
                museum_object.frozen = False
                museum_object.freeze_reason = None
                museum_object.freeze_source = None

                # Remove the latest package if it was *not* successfully
                # preserved to ensure the object is eligible for preservation
                remove_latest_package = (
                    museum_object.latest_package
                    and not museum_object.latest_package.preserved
                )
                if remove_latest_package:
                    museum_object.latest_package = None

                if enqueue:
                    enqueue_object(object_id=museum_object.id)

    return len(museum_objects)

def submit_sip(object_id, sip_id):
    """
    Submit SIP to the DPRES service. The next workflow task will be enqueued
    by 'sync_processed_sips', which periodically checks the processed SIPs
    """
    object_id = int(object_id)
    connect_db()

    package_dir = Path(PACKAGE_DIR) / str(object_id)

    # Retrieve the latest SIP filename
    museum_package = MuseumObjectPackage.from_path_sync(
        package_dir, sip_id=sip_id
    )
    filename = museum_package.sip_filename

    with scoped_session() as db:
        package_uploaded = db.query(
            exists().where(
                and_(
                    MuseumPackage.sip_filename == museum_package.sip_filename,
                    MuseumPackage.uploaded == True
                )
            )
        ).scalar()

        if package_uploaded:
            raise RuntimeError(f"Package {filename} already uploaded")

    print(f"Submitting {filename} for Object {object_id}")

    museum_package = main(
        object_id=object_id, package_dir=PACKAGE_DIR, sip_id=sip_id
    )

    print(f"Package {filename} submitted, removing local file")

    with scoped_session() as db:
        db_museum_package = db.query(MuseumPackage).filter_by(
            sip_filename=museum_package.sip_filename
        ).one()
        db_museum_package.uploaded = True

    # Delete the generated SIP to free space
    os.remove(museum_package.sip_archive_path)

def finish_sync_progress(name):
    """
    Finish the current synchronization run. This ensures the next
    synchronization run will only iterate a subset of entries from MuseumPlus,
    improving performance.
    """
    with scoped_session() as db:
        sync_status = _get_sync_status(db, name)

        # Next synchronization will start from the beginning
        sync_status.offset = 0
        sync_status.prev_start_sync_date = sync_status.start_sync_date
        sync_status.start_sync_date = None

def get_sync_status(name):
    """
    Load the SyncStatus instance and return it for reading
    """
    with scoped_session() as db:
        sync_status = _get_sync_status(db, name)

        # Return a read-only copy of the sync status to prevent having to
        # deal with a SQLAlchemy session that's not used for anything
        # TODO: Can we do this without having to use a namedtuple?
        return SyncStatusReadOnly(
            name=sync_status.name,
            start_sync_date=sync_status.start_sync_date,
            prev_start_sync_date=sync_status.prev_start_sync_date,
            offset=sync_status.offset
        )

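# Neither 'SyncStatusReadOnly' nor '_get_sync_status' is defined in this
# section. The following is a minimal, illustrative sketch of what they are
# assumed to look like based on how they are used above: a plain namedtuple
# snapshot plus a get-or-create lookup keyed by the synchronization name. The
# actual definitions (and the 'SyncStatus' model's columns) may differ.
import collections

SyncStatusReadOnly = collections.namedtuple(
    "SyncStatusReadOnly",
    ["name", "start_sync_date", "prev_start_sync_date", "offset"]
)


def _get_sync_status(db, name):
    """
    Illustrative sketch: fetch the SyncStatus row for 'name', creating it
    with default values if it does not exist yet.
    """
    sync_status = db.query(SyncStatus).filter_by(name=name).one_or_none()
    if sync_status is None:
        # Assumed defaults: synchronization starts from offset 0 with no
        # recorded sync dates yet.
        sync_status = SyncStatus(
            name=name, offset=0,
            start_sync_date=None, prev_start_sync_date=None
        )
        db.add(sync_status)
    return sync_status
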
def enqueue_objects(object_count, random=False, object_ids=None):
    """
    Enqueue the given number of objects to the preservation workflow.

    :param int object_count: How many objects to enqueue at most
    :param bool random: Whether to enqueue objects at random instead of
                        in-order.
    :param list object_ids: Object IDs to enqueue. If provided,
                            'object_count' and 'random' are ignored.
    """
    if object_ids:
        object_count = len(object_ids)

    with lock_queues():
        connect_db()
        enqueued_object_ids = get_enqueued_object_ids()

        new_job_count = 0

        with scoped_session() as db:
            object_query = (
                db.query(MuseumObject)
                .with_transformation(MuseumObject.filter_preservation_pending)
                .yield_per(500)
            )
            if object_ids:
                object_query = object_query.filter(
                    MuseumObject.id.in_(object_ids)
                )
            if random:
                object_query = object_query.order_by(func.random())

            for museum_object in object_query:
                if museum_object.id not in enqueued_object_ids:
                    enqueue_object(museum_object.id)
                    new_job_count += 1
                    print(f"Enqueued download_object_{museum_object.id}")

                if new_job_count >= object_count:
                    break

    print(f"{new_job_count} object(s) enqueued for download")

    return new_job_count

def reenqueue_object(object_id: int):
    """
    Re-enqueue a rejected object into the workflow
    """
    object_id = int(object_id)
    connect_db()

    queue = get_queue(QueueType.DOWNLOAD_OBJECT)

    with scoped_session() as db:
        museum_object = (
            db.query(MuseumObject)
            .join(
                MuseumPackage,
                MuseumObject.latest_package_id == MuseumPackage.id
            )
            .filter(MuseumObject.id == object_id)
            .one()
        )

        if museum_object.latest_package and \
                not museum_object.latest_package.rejected:
            raise ValueError(
                f"Latest package "
                f"{museum_object.latest_package.sip_filename} "
                f"wasn't rejected"
            )

        object_ids = get_enqueued_object_ids()
        if object_id in object_ids:
            raise ValueError(
                "Object is still in the workflow and can't be re-enqueued"
            )

        museum_object.latest_package = None

        delete_jobs_for_object_id(object_id)

        queue.enqueue(
            download_object,
            kwargs={"object_id": object_id},
            job_id=f"download_object_{object_id}"
        )

def confirm_sip(object_id, sip_id):
    """
    Confirm SIP that was either preserved or rejected by the DPRES service.
    This is the last step in the preservation workflow.
    """
    object_id = int(object_id)
    connect_db()

    package_dir = Path(PACKAGE_DIR) / str(object_id)
    museum_package = MuseumObjectPackage.from_path_sync(
        package_dir, sip_id=sip_id
    )

    # The '.status' file contains either the text 'accepted' or 'rejected'
    status = (
        museum_package.path / f"{museum_package.sip_filename}.status"
    ).read_text()
    if status not in ("accepted", "rejected"):
        raise ValueError(f"Invalid preservation status: {status}")

    print(f"Confirming SIP {museum_package.sip_filename}")

    main(
        object_id=object_id,
        package_dir=PACKAGE_DIR,
        archive_dir=ARCHIVE_DIR,
        sip_id=sip_id,
        status=status
    )

    with scoped_session() as db:
        db.query(MuseumPackage).filter_by(
            sip_filename=museum_package.sip_filename
        ).update({
            MuseumPackage.preserved: bool(status == "accepted"),
            MuseumPackage.rejected: bool(status == "rejected")
        })
        if status == "accepted":
            db.query(MuseumObject).filter_by(id=object_id).update(
                {MuseumObject.preserved: True}
            )

    print(f"SIP {museum_package.sip_filename} confirmed")

def get_confirmed_sip_filenames(days: int) -> set:
    """
    Get a set of SIP filenames that have already been marked as preserved or
    rejected in the workflow. These SIPs can be safely skipped as they're
    either already confirmed or the corresponding workflow job has been
    enqueued.
    """
    # Find packages that are at most (days + 2) days old. The extra two days
    # account for SIPs that took longer to get processed for whatever reason.
    cutoff = (
        datetime.datetime.now(datetime.timezone.utc)
        - datetime.timedelta(days=days + 2)
    )

    with scoped_session() as db:
        query = (
            select([MuseumPackage.sip_filename])
            .where(MuseumPackage.created_date > cutoff)
            .where(or_(MuseumPackage.preserved, MuseumPackage.rejected))
        )
        results = db.execute(query)
        results = {result[0] for result in results}

    return results

def reset_workflow():
    """
    Reset the workflow after a PostgreSQL backup restoration by removing
    in-process packages that were not submitted to the DPRES service but were
    still in the workflow at the time the backup was initiated.
    """
    with lock_queues():
        connect_db()

        with scoped_session() as db:
            # Get objects that have been downloaded or packaged, but which
            # haven't been uploaded yet
            objects = (
                db.query(MuseumObject)
                .join(
                    MuseumPackage,
                    MuseumPackage.id == MuseumObject.latest_package_id
                )
                .filter(
                    MuseumPackage.uploaded == False,
                    or_(MuseumPackage.downloaded, MuseumPackage.packaged)
                )
            )
            objects = list(objects)

            print(f"Found {len(objects)} dangling objects")

            for mus_object in objects:
                mus_package = mus_object.latest_package

                # Remove the lingering package from the MuseumObject to make
                # the object eligible for preservation again.
                mus_object.latest_package = None
                db.delete(mus_package)

                try:
                    shutil.rmtree(Path(PACKAGE_DIR) / str(mus_object.id))
                except OSError:
                    # Directory does not exist; ignore
                    pass

    print("Done!")

def create_sip(object_id, sip_id):
    """
    Create a SIP from a downloaded object and enqueue the task 'submit_sip'
    once the object is packaged into a SIP
    """
    object_id = int(object_id)
    connect_db()

    # Are we creating a SIP for the first time or updating a preserved
    # package?
    created_date, modified_date = None, None

    with scoped_session() as db:
        last_preserved_package = (
            db.query(MuseumPackage)
            .filter(MuseumPackage.museum_object_id == object_id)
            .filter(MuseumPackage.preserved == True)
            .order_by(MuseumPackage.created_date.desc())
            .first()
        )
        current_package = (
            db.query(MuseumObject)
            .join(
                MuseumPackage,
                MuseumObject.latest_package_id == MuseumPackage.id
            )
            .filter(MuseumObject.id == object_id)
            .one()
            .latest_package
        )

        if not last_preserved_package:
            # We haven't created a preserved SIP yet
            print(f"Creating submission SIP for Object {object_id}")
            created_date = current_package.created_date
        else:
            # We are updating an existing package
            print(f"Creating update SIP for Object {object_id}")
            created_date = last_preserved_package.created_date
            modified_date = current_package.created_date

    # Run the 'create_sip' script
    try:
        museum_package = main(
            object_id=object_id,
            package_dir=PACKAGE_DIR,
            sip_id=sip_id,
            create_date=created_date,
            modify_date=modified_date,
            update=bool(modified_date)
        )
    except PreservationError as exc:
        # If a PreservationError was raised, freeze the object and prevent
        # the object from going further in the workflow.
        freeze_running_object(
            object_id=object_id,
            sip_id=sip_id,
            freeze_reason=exc.error
        )
        return
    except OSError as exc:
        if exc.errno == errno.ENOSPC:
            raise OSError(
                errno.ENOSPC,
                "Ran out of disk space. This may have happened because the "
                "package directory ran out of space while downloading a "
                "large attachment. Try removing packages from the directory "
                "and trying again, processing fewer packages at the same "
                "time."
            )
        raise

    filename = museum_package.sip_filename

    print(f"Created SIP for Object {object_id}, updating database")

    with scoped_session() as db:
        db_package = db.query(MuseumPackage).filter(
            MuseumPackage.sip_filename == filename
        ).one()
        db_package.packaged = True

        db.query(MuseumObject).filter(
            MuseumObject.id == object_id
        ).update({MuseumObject.latest_package_id: db_package.id})

    queue = get_queue(QueueType.SUBMIT_SIP)
    queue.enqueue(
        submit_sip,
        kwargs={"object_id": object_id, "sip_id": sip_id},
        job_id=f"submit_sip_{object_id}"
    )

async def sync_attachments(offset=0, limit=None, save_progress=False):
    """
    Synchronize attachment metadata from MuseumPlus to determine which
    objects have changed and need to be updated in the DPRES service. This is
    followed by 'sync_hashes'.

    :param int offset: Offset to start synchronizing from
    :param int limit: How many attachments to sync before stopping. Default
                      is None, meaning all available attachments are
                      synchronized.
    :param bool save_progress: Whether to save synchronization progress and
                               continue from the last run. Offset and limit
                               are ignored if enabled.
    """
    modify_date_gte = None

    if save_progress:
        limit = None
        sync_status = get_sync_status("sync_attachments")
        offset = sync_status.offset
        # Start synchronization from attachments that changed since the last
        # sync
        modify_date_gte = sync_status.prev_start_sync_date
        print(f"Continuing synchronization from {offset}")

    # TODO: This is pretty much an inverse version of 'sync_objects'.
    # This process should be made more generic if possible.
    museum_session = await get_museum_session()
    multimedia_iter = iterate_multimedia(
        session=museum_session,
        offset=offset,
        modify_date_gte=modify_date_gte
    )

    all_iterated = False
    index = offset
    processed = 0

    while True:
        results = []

        all_iterated = True
        async for result in multimedia_iter:
            all_iterated = False
            results.append(result)
            index += 1

            if len(results) >= CHUNK_SIZE:
                break

        attachments = {result["id"]: result for result in results}
        attachment_ids = list(attachments.keys())

        inserts, updates = 0, 0

        with scoped_session() as db:
            existing_attachment_ids = set([
                result.id for result in
                db.query(MuseumAttachment)
                .options(load_only("id"))
                .filter(MuseumAttachment.id.in_(attachment_ids))
            ])

            attachment_id2object_id = defaultdict(set)
            object_ids = set()
            update_params = []

            # Update existing attachments, create the rest
            for result in attachments.values():
                attachment_id = int(result["id"])
                filename = result["filename"]
                modified_date = result["modified_date"]
                created_date = result["created_date"]
                xml_hash = result["xml_hash"]

                attachment_id2object_id[attachment_id].update(
                    result["object_ids"]
                )
                object_ids.update(result["object_ids"])

                if attachment_id in existing_attachment_ids:
                    # Update
                    update_params.append({
                        "_id": attachment_id,
                        "_filename": filename,
                        "_modified_date": modified_date,
                        "_created_date": created_date,
                        "_metadata_hash": xml_hash
                    })
                    updates += 1
                else:
                    # Create
                    mus_attachment = MuseumAttachment(
                        id=attachment_id,
                        filename=filename,
                        modified_date=modified_date,
                        created_date=created_date,
                        metadata_hash=xml_hash
                    )
                    db.add(mus_attachment)
                    inserts += 1

                processed += 1
                if limit is not None and processed == limit:
                    all_iterated = True
                    break

            if update_params:
                # Perform updates in bulk
                stmt = (
                    MuseumAttachment.__table__.update()
                    .where(MuseumAttachment.id == bindparam("_id"))
                    .values({
                        "filename": bindparam("_filename"),
                        "created_date": bindparam("_created_date"),
                        "modified_date": bindparam("_modified_date"),
                        "metadata_hash": bindparam("_metadata_hash")
                    })
                )
                db.execute(stmt, update_params)

            # Create/update MuseumObjects with references to the newly
            # updated MuseumAttachments.
            # For performance reasons, update references for a batch of
            # objects at once
            attachments = (
                db.query(MuseumAttachment)
                .filter(MuseumAttachment.id.in_(attachment_ids))
            )
            objects = bulk_create_or_get(db, MuseumObject, object_ids)
            objects_by_id = {
                mus_object.id: mus_object for mus_object in objects
            }

            for attachment in attachments:
                attachment.museum_objects = [
                    objects_by_id[object_id]
                    for object_id in attachment_id2object_id[attachment.id]
                ]

                for museum_object in attachment.museum_objects:
                    # Set the modification date of the MuseumObject to the
                    # attachment's if the attachment's is newer.
                    # This is because we want to know if the museum object OR
                    # one of its attachments has been changed.
                    object_date_needs_update = (
                        not museum_object.modified_date
                        or museum_object.modified_date
                        < attachment.modified_date
                    )
                    if object_date_needs_update:
                        museum_object.modified_date = \
                            attachment.modified_date

        results = []

        print(
            f"Updated, {inserts} inserts, {updates} updates. "
            f"Updating from offset: {index}"
        )

        # Submit a heartbeat after each successful iteration instead of once
        # at the end. This is because this script is designed to be stopped
        # before it has finished iterating everything.
        submit_heartbeat(HeartbeatSource.SYNC_ATTACHMENTS)

        if save_progress:
            update_offset("sync_attachments", offset=index)

        if all_iterated:
            if save_progress:
                finish_sync_progress("sync_attachments")
            break

    await museum_session.close()

def sync_hashes():
    """
    Update object entries with the latest metadata hashes to determine which
    objects have been changed. This is done after 'sync_objects' and
    'sync_attachments'.
    """
    updated = 0
    skipped = 0
    total = 0

    with scoped_session() as db:
        query = iterate_museum_objects_and_attachments(db)

        all_iterated = False

        while True:
            results = []
            for i in range(0, CHUNK_SIZE):
                try:
                    results.append(next(query))
                except StopIteration:
                    all_iterated = True
                    break

            update_params = []

            for museum_object, museum_attachments in results:
                total += 1

                # Calculate the attachment metadata hash
                if museum_attachments:
                    # Don't calculate the hash if some attachments are
                    # incomplete
                    metadata_incomplete = any(
                        attach.metadata_hash is None
                        for attach in museum_attachments
                    )
                    if metadata_incomplete:
                        skipped += 1
                        continue

                    attachment_metadata_hash = \
                        get_metadata_hash_for_attachments(museum_attachments)
                else:
                    attachment_metadata_hash = ""

                if museum_object.attachment_metadata_hash \
                        == attachment_metadata_hash:
                    # Attachment hash hasn't changed, no need to update
                    continue

                updated += 1
                update_params.append({
                    "_id": museum_object.id,
                    "_attachment_metadata_hash": attachment_metadata_hash
                })

            if update_params:
                update_stmt = (
                    MuseumObject.__table__.update()
                    .where(MuseumObject.id == bindparam("_id"))
                    .values({
                        "attachment_metadata_hash":
                            bindparam("_attachment_metadata_hash")
                    })
                )
                db.execute(update_stmt, update_params)

            print(
                f"{total} iterated, {updated} updated and {skipped} skipped "
                "so far"
            )

            if all_iterated:
                break

    submit_heartbeat(HeartbeatSource.SYNC_HASHES)

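# 'get_metadata_hash_for_attachments' is not defined in this section. Below
# is a minimal, illustrative sketch of what it is assumed to do, based on its
# usage above: fold the per-attachment metadata hashes into one deterministic
# digest, so that a change in any attachment also changes the object-level
# 'attachment_metadata_hash'. The ordering key and hash algorithm are
# assumptions, not taken from the source.
import hashlib


def get_metadata_hash_for_attachments(attachments):
    """
    Illustrative sketch: combine attachment metadata hashes into one digest.
    """
    combined = "".join(
        attachment.metadata_hash
        for attachment in sorted(attachments, key=lambda a: a.id)
    )
    return hashlib.sha256(combined.encode("utf-8")).hexdigest()
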
def freeze_objects(object_ids, reason, source, delete_jobs=True):
    """
    Freeze objects to prevent them from being included in the preservation
    workflow

    :returns: (freeze_count, cancel_count) tuple for how many objects were
              frozen and how many packages were cancelled as a result
    """
    object_ids = [int(object_id) for object_id in object_ids]
    source = FreezeSource(source)

    with lock_queues():
        # Are there object IDs that we're about to freeze but that are
        # still running?
        running_object_ids = get_running_object_ids()
        conflicting_object_ids = set(object_ids) & set(running_object_ids)
        if conflicting_object_ids:
            conflicting = ", ".join(
                [str(o) for o in sorted(conflicting_object_ids)]
            )
            raise WorkflowJobRunningError(
                "The following object IDs have running jobs and can't be "
                f"frozen: {conflicting}"
            )

        connect_db()

        with scoped_session() as db:
            freeze_count = (
                db.query(MuseumObject)
                .filter(MuseumObject.id.in_(object_ids))
                .update(
                    {
                        MuseumObject.frozen: True,
                        MuseumObject.freeze_reason: reason,
                        MuseumObject.freeze_source: source
                    },
                    synchronize_session=False
                )
            )

            packages_to_cancel = list(
                db.query(MuseumPackage)
                .join(
                    MuseumObject,
                    MuseumObject.latest_package_id == MuseumPackage.id
                )
                .filter(
                    MuseumPackage.museum_object_id.in_(object_ids),
                    MuseumPackage.preserved == False,
                    MuseumPackage.rejected == False,
                    MuseumPackage.cancelled == False
                )
            )

            for package in packages_to_cancel:
                package.cancelled = True

                try:
                    museum_package = MuseumObjectPackage.from_path_sync(
                        Path(PACKAGE_DIR) / str(package.museum_object_id),
                        sip_id=package.sip_id
                    )
                    museum_package.copy_log_files_to_archive(ARCHIVE_DIR)
                except FileNotFoundError:
                    # If the SIP doesn't exist, just skip it
                    pass

        # Cancel any jobs for each object ID if enabled
        if delete_jobs:
            for object_id in object_ids:
                delete_jobs_for_object_id(object_id)

                # Delete the museum package directory
                try:
                    shutil.rmtree(Path(PACKAGE_DIR) / str(object_id))
                except OSError:
                    # Directory does not exist
                    pass

    return freeze_count, len(packages_to_cancel)

def download_object(object_id):
    """
    Download an object from MuseumPlus and enqueue the task 'create_sip' once
    the object is downloaded
    """
    object_id = int(object_id)
    connect_db()

    # Create a SIP id from the current time
    sip_id = datetime.datetime.now(
        datetime.timezone.utc
    ).strftime("%Y%m%d-%H%M%S")

    try:
        museum_package = main(
            object_id=int(object_id),
            package_dir=PACKAGE_DIR,
            # 'sip_id' is optional, but giving it as a kwarg ensures the
            # filename of the SIP is correct before it is created.
            sip_id=sip_id
        )
    except PreservationError as exc:
        # If a PreservationError was raised, freeze the object
        freeze_running_object(
            object_id=object_id,
            sip_id=sip_id,
            freeze_reason=exc.error
        )
        return
    except OSError as exc:
        if exc.errno == errno.ENOSPC:
            raise OSError(
                errno.ENOSPC,
                "Ran out of disk space. This may have happened because the "
                "package directory ran out of space while downloading a "
                "large attachment. Try removing packages from the directory "
                "and trying again, processing fewer packages at the same "
                "time."
            )
        raise

    filename = museum_package.sip_filename

    with scoped_session() as db:
        db_museum_object = db.query(MuseumObject).filter(
            MuseumObject.id == object_id
        ).one()
        db_package = db.query(MuseumPackage).filter_by(
            sip_filename=filename
        ).first()

        # Get the attachments that currently exist for this object
        # and add them to the new MuseumPackage
        attachment_ids = museum_package.museum_object.attachment_ids
        db_attachments = bulk_create_or_get(
            db, MuseumAttachment, attachment_ids
        )

        if not db_package:
            db_package = MuseumPackage(
                sip_filename=filename,
                sip_id=sip_id,
                object_modified_date=(
                    museum_package.museum_object.modified_date
                ),
                downloaded=True,
                metadata_hash=db_museum_object.metadata_hash,
                attachment_metadata_hash=(
                    db_museum_object.attachment_metadata_hash
                ),
                attachments=db_attachments
            )
            db_package.museum_object = db_museum_object
        else:
            raise EnvironmentError(
                f"Package with filename {filename} already exists"
            )

        db_museum_object.latest_package = db_package

    queue = get_queue(QueueType.CREATE_SIP)
    queue.enqueue(
        create_sip,
        kwargs={"object_id": object_id, "sip_id": sip_id},
        job_id=f"create_sip_{object_id}"
    )

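# The SIP id generated above is a plain UTC timestamp, e.g. "20240131-154502"
# (example value only), so repeated packaging runs for the same object get
# distinct, chronologically sortable identifiers. The same value is also
# reflected in the SIP filename via the 'sip_id' keyword argument passed to
# 'main'.
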
async def sync_objects(offset=0, limit=None, save_progress=False):
    """
    Synchronize object metadata from MuseumPlus to determine which objects
    have changed and need to be updated in the DPRES service. This is
    followed by 'sync_hashes'.

    :param int offset: Offset to start synchronizing from
    :param int limit: How many objects to sync before stopping. Default is
                      None, meaning all available objects are synchronized.
    :param bool save_progress: Whether to save synchronization progress and
                               continue from the last run. Offset and limit
                               are ignored if enabled.
    """
    modify_date_gte = None

    if save_progress:
        limit = None
        sync_status = get_sync_status("sync_objects")
        offset = sync_status.offset
        # Start synchronization from objects that changed since the last sync
        modify_date_gte = sync_status.prev_start_sync_date
        print(f"Continuing synchronization from {offset}")

    museum_session = await get_museum_session()
    object_iter = iterate_objects(
        session=museum_session,
        offset=offset,
        modify_date_gte=modify_date_gte
    )

    all_iterated = False
    index = offset
    processed = 0

    while True:
        results = []

        all_iterated = True
        async for result in object_iter:
            all_iterated = False
            results.append(result)
            index += 1

            if len(results) >= CHUNK_SIZE:
                break

        objects = {result["id"]: result for result in results}
        object_ids = list(objects.keys())

        inserts, updates = 0, 0

        with scoped_session() as db:
            existing_object_ids = set([
                result.id for result in
                db.query(MuseumObject)
                .options(load_only("id"))
                .filter(MuseumObject.id.in_(object_ids))
            ])

            object_id2attachment_id = defaultdict(set)
            attachment_ids = set()
            update_params = []

            # Update existing objects, create the rest
            for result in objects.values():
                object_id = int(result["id"])
                title = result["title"]
                modified_date = result["modified_date"]
                created_date = result["created_date"]
                multimedia_ids = result["multimedia_ids"]
                xml_hash = result["xml_hash"]

                object_id2attachment_id[object_id].update(multimedia_ids)
                attachment_ids.update(multimedia_ids)

                if object_id in existing_object_ids:
                    # Don't run the update query instantly; instead, set the
                    # parameters and run them all together later in bulk
                    update_params.append({
                        "_id": object_id,
                        "_title": title,
                        "_modified_date": modified_date,
                        "_metadata_hash": xml_hash
                    })
                    updates += 1
                else:
                    # Create
                    mus_object = MuseumObject(
                        id=object_id,
                        title=title,
                        modified_date=modified_date,
                        created_date=created_date,
                        metadata_hash=xml_hash
                    )
                    db.add(mus_object)
                    inserts += 1

                processed += 1
                if limit is not None and processed == limit:
                    all_iterated = True
                    break

            if update_params:
                # Perform updates in bulk
                stmt_a = (
                    MuseumObject.__table__.update()
                    .where(MuseumObject.id == bindparam("_id"))
                    .values({
                        "title": bindparam("_title"),
                        "metadata_hash": bindparam("_metadata_hash")
                    })
                )
                stmt_b = (
                    MuseumObject.__table__.update()
                    .where(
                        and_(
                            MuseumObject.id == bindparam("_id"),
                            or_(
                                MuseumObject.modified_date == None,
                                MuseumObject.modified_date
                                < bindparam("_modified_date")
                            )
                        )
                    )
                    .values({"modified_date": bindparam("_modified_date")})
                )
                db.execute(stmt_a, update_params)
                db.execute(stmt_b, update_params)

            # Create/update MuseumAttachments with references to the newly
            # updated MuseumObjects.
            # For performance reasons, update references for a batch of
            # objects at once
            objects = (
                db.query(MuseumObject)
                .filter(MuseumObject.id.in_(object_ids))
            )
            attachments = bulk_create_or_get(
                db, MuseumAttachment, attachment_ids
            )
            attachments_by_id = {
                attachment.id: attachment for attachment in attachments
            }

            for museum_object in objects:
                museum_object.attachments = [
                    attachments_by_id[attachment_id]
                    for attachment_id in
                    object_id2attachment_id[museum_object.id]
                ]

        results = []

        print(
            f"Updated, {inserts} inserts, {updates} updates. "
            f"Updating from offset: {index}"
        )

        # Submit a heartbeat after each successful iteration instead of once
        # at the end. This is because this script is designed to be stopped
        # before it has finished iterating everything.
        submit_heartbeat(HeartbeatSource.SYNC_OBJECTS)

        if save_progress:
            update_offset("sync_objects", offset=index)

        if all_iterated:
            if save_progress:
                finish_sync_progress("sync_objects")
            break

    await museum_session.close()
