def test_create_avalon_dip_success(self):
    """
    Test full DIP creation for an Avalon-type DIP:
    - AIP download
    - AIP extraction
    - DIP folder creation
    - Avalon manifest CSV update (and METS file removal)
    - Assets folder creation with the expected files
    """
    with TmpDir(TMP_DIR):
        # Download the AIP first
        aip_path = amclient.AMClient(
            aip_uuid=AVALON_AIP_UUID,
            ss_url=SS_URL,
            ss_user_name=SS_USER_NAME,
            ss_api_key=SS_API_KEY,
            directory=TMP_DIR,
        ).download_aip()
        # Extract it
        aip_dir = create_dip.extract_aip(aip_path, AVALON_AIP_UUID, TMP_DIR)
        # Test DIP creation
        avalon_dip_dir = create_dip.create_dip(
            aip_dir, AVALON_AIP_UUID, OUTPUT_DIR, "atom", "avalon-manifest"
        )
        # Check DIP structure
        assert avalon_dip_dir == "{}/{}/{}".format(
            OUTPUT_DIR, TRANSFER_NAME, AVALON_AIP_UUID
        )
        assert os.path.isdir(avalon_dip_dir)
        # Check that the CSV and assets folder are present and the METS file is removed
        assert sorted(os.listdir(avalon_dip_dir)) == sorted(
            ["Demo_Manifest.csv", "assets"]
        )
        # Check that the contents of the CSV have been updated
        csv_path = "{}/Demo_Manifest.csv".format(avalon_dip_dir)
        is_in_file = False
        with open(csv_path, "rt") as c:
            demo_manifest = csv.reader(c, delimiter=",")
            for row in demo_manifest:
                if AVALON_AIP_UUID in row:
                    is_in_file = True
        assert is_in_file
        # Check that the expected asset files are present
        avalon_files = os.listdir("{}/assets".format(avalon_dip_dir))
        assets = [
            "agz3068a.wav",
            "lunchroom_manners_512kb.mp4",
            "lunchroom_manners_512kb.mp4.structure.xml",
            "lunchroom_manners_512kb.mp4.vtt",
            "OrganClip.high.mp4",
            "OrganClip.low.mp4",
            "OrganClip.medium.mp4",
        ]
        assert sorted(assets) == sorted(avalon_files)
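# The test above relies on a TmpDir helper that is not shown in this excerpt.
# A minimal sketch of such a context manager follows, assuming it only needs
# to provide a clean working directory and remove it afterwards; the name and
# behavior are inferred from the test, not confirmed by the source.
import os
import shutil


class TmpDir:
    """Context manager that creates a clean temporary directory and removes it on exit."""

    def __init__(self, tmp_dir_path):
        self.tmp_dir_path = tmp_dir_path

    def __enter__(self):
        # Start from an empty directory
        if os.path.isdir(self.tmp_dir_path):
            shutil.rmtree(self.tmp_dir_path)
        os.makedirs(self.tmp_dir_path)
        return self.tmp_dir_path

    def __exit__(self, exc_type, exc_value, traceback):
        # Clean up; returning False propagates any exception from the body
        if os.path.isdir(self.tmp_dir_path):
            shutil.rmtree(self.tmp_dir_path)
        return False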
import logging
import os
import shutil
import sys
import time

import amclient
import requests

import settings

# Initialize the Archivematica Python client
am = amclient.AMClient()

# Import AM API info from settings.py
am.am_url = settings.am_url
am.am_api_key = settings.am_api_key
am.am_user_name = settings.am_user_name
am.ss_url = settings.ss_url
am.ss_api_key = settings.ss_api_key
am.ss_user_name = settings.ss_user_name

# Set institution code from user input
institution = sys.argv[1]
# institution = settings.INSTITUTION[institution]

# Import institutional variables from settings.py
am.transfer_source = settings.INSTITUTION[institution]['transfer_source']
am.transfer_type = settings.INSTITUTION[institution]['transfer_type']
am.processing_config = settings.INSTITUTION[institution]['processing_config']

# Directory to be watched for new transfers
transfer_folder = '/' + institution + 'islandora/transfer/'
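# A minimal sketch of how the watched directory might be polled and new
# transfers started. It assumes the installed amclient version provides the
# create_package() wrapper for the package endpoint; the polling interval,
# logging setup, and "seen" bookkeeping are illustrative additions, not part
# of the original script.
logging.basicConfig(level=logging.INFO)

seen = set()
while True:
    for entry in sorted(os.listdir(transfer_folder)):
        if entry in seen:
            continue
        seen.add(entry)
        # Point the client at the new folder and start a transfer
        am.transfer_directory = os.path.join(transfer_folder, entry)
        am.transfer_name = entry
        logging.info("Starting transfer for %s", entry)
        am.create_package()
    time.sleep(60)  # check the watched directory once a minute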
def main(
    ss_url,
    ss_user,
    ss_api_key,
    aip_uuid,
    tmp_dir,
    output_dir,
    mets_type="atom",
    dip_type="zipped-objects",
):
    LOGGER.info("Starting DIP creation from AIP: %s", aip_uuid)

    if not os.path.isdir(tmp_dir):
        LOGGER.error("%s is not a valid temporary directory", tmp_dir)
        return 1

    if not os.path.isdir(output_dir):
        LOGGER.error("%s is not a valid output directory", output_dir)
        return 2

    # Create empty workspace directory
    tmp_dir = os.path.join(tmp_dir, aip_uuid)
    if os.path.exists(tmp_dir):
        LOGGER.warning("Workspace directory already exists, overwriting")
        shutil.rmtree(tmp_dir)
    try:
        os.makedirs(tmp_dir)
    except OSError:
        LOGGER.error("Could not create workspace directory: %s", tmp_dir)
        return 3

    LOGGER.info("Downloading AIP from Storage Service")
    am_client = amclient.AMClient(
        aip_uuid=aip_uuid,
        ss_url=ss_url,
        ss_user_name=ss_user,
        ss_api_key=ss_api_key,
        directory=tmp_dir,
    )
    aip_file = am_client.download_aip()
    if not aip_file:
        LOGGER.error("Unable to download AIP")
        return 4

    LOGGER.info("Extracting AIP")
    aip_dir = extract_aip(aip_file, aip_uuid, tmp_dir)
    if not aip_dir:
        return 5

    LOGGER.info("Creating DIP")
    dip_dir = create_dip(aip_dir, aip_uuid, output_dir, mets_type, dip_type)
    if not dip_dir:
        LOGGER.error("Unable to create DIP")
        return 6

    # Remove workspace directory
    shutil.rmtree(tmp_dir)

    LOGGER.info("DIP created in: %s", dip_dir)
    return dip_dir
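# An illustrative command-line entry point for main() above; the flag names
# mirror the function signature but are assumptions, not the original
# script's documented interface.
if __name__ == "__main__":
    import argparse
    import sys

    parser = argparse.ArgumentParser(description="Create a DIP from a stored AIP")
    parser.add_argument("--ss-url", required=True)
    parser.add_argument("--ss-user", required=True)
    parser.add_argument("--ss-api-key", required=True)
    parser.add_argument("--aip-uuid", required=True)
    parser.add_argument("--tmp-dir", default="/tmp")
    parser.add_argument("--output-dir", default=".")
    parser.add_argument("--mets-type", default="atom")
    parser.add_argument("--dip-type", default="zipped-objects")
    args = parser.parse_args()

    result = main(
        ss_url=args.ss_url,
        ss_user=args.ss_user,
        ss_api_key=args.ss_api_key,
        aip_uuid=args.aip_uuid,
        tmp_dir=args.tmp_dir,
        output_dir=args.output_dir,
        mets_type=args.mets_type,
        dip_type=args.dip_type,
    )
    # main() returns an int error code on failure and the DIP path on success
    sys.exit(result if isinstance(result, int) else 0)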
def process_dip(
    ss_url,
    ss_user,
    ss_api_key,
    dip,
    pipeline_uuid,
    processing_uuid,
    s3_uuid,
    shared_directory,
    dip_path,
):
    # Remove the ContentDM file
    contentdm = os.path.join(shared_directory, dip_path, dip, "objects/compound.txt")
    try:
        os.remove(contentdm)
    except Exception as e:
        LOGGER.warning("Unable to remove ContentDM file: %s", e)

    # The AIP UUID is the last 36 characters of the DIP name
    aip_uuid = dip[-36:]

    # Use the AM client to get info on the AIP
    am_client = amclient.AMClient(
        package_uuid=aip_uuid,
        ss_url=ss_url,
        ss_user_name=ss_user,
        ss_api_key=ss_api_key,
    )
    try:
        aip_details = am_client.get_package_details()
    except Exception as e:
        LOGGER.error("Unable to locate valid AIP package: %s", e)
        return 2

    # Get the file lists in the DIP
    object_list = []
    thumbnail_list = []
    object_path = os.path.join(shared_directory, dip_path, dip, "objects")
    for root, _, files in os.walk(object_path):
        for name in files:
            rel_dir = os.path.relpath(
                root, os.path.join(shared_directory, dip_path, dip)
            )
            object_list.append(os.path.join(rel_dir, name))
    if not object_list:
        LOGGER.error("Unable to find any access files in the DIP.")
        return 2
    thumbnail_path = os.path.join(shared_directory, dip_path, dip, "thumbnails")
    for root, _, files in os.walk(thumbnail_path):
        for name in files:
            rel_dir = os.path.relpath(
                root, os.path.join(shared_directory, dip_path, dip)
            )
            thumbnail_list.append(os.path.join(rel_dir, name))

    # Load the METS file
    mets_name = "METS." + aip_uuid + ".xml"
    try:
        mets = metsrw.METSDocument.fromfile(
            os.path.join(shared_directory, dip_path, dip, mets_name)
        )
    except Exception as e:
        LOGGER.error("Unable to extract and load METS file: %s", e)
        return 2

    # Compile data for upload to S3
    size = 0
    for dirpath, _, filenames in os.walk(
        os.path.join(shared_directory, dip_path, dip)
    ):
        for filename in filenames:
            file_path = os.path.join(dirpath, filename)
            size += os.path.getsize(file_path)
    dip_data = {
        "origin_pipeline": "/api/v2/pipeline/" + pipeline_uuid + "/",
        "origin_location": "/api/v2/location/" + processing_uuid + "/",
        "origin_path": os.path.join(dip_path, dip) + "/",
        "current_location": "/api/v2/location/" + s3_uuid + "/",
        "current_path": dip,
        "package_type": "DIP",
        "aip_subtype": "Archival Information Package",  # same as in AM
        "size": size,
        "related_package_uuid": aip_uuid,
    }

    LOGGER.info("Storing DIP in S3 location.")
    url = ss_url + "/api/v2/file/"
    headers = {"Authorization": "ApiKey " + ss_user + ":" + ss_api_key}
    response = requests.post(url, headers=headers, json=dip_data, timeout=86400)
    if response.status_code != requests.codes.created:
        LOGGER.error("Could not store DIP in S3 location: %s", response.text)
        return 2
    LOGGER.info("DIP stored in S3 location.")
    ret = response.json()
    if "uuid" in ret:
        dip_uuid = ret["uuid"]
        LOGGER.info("Storage Service DIP UUID: %s", ret["uuid"])
    else:
        LOGGER.error("Storage Service didn't return the DIP UUID")
        return 2

    # Use the AM client to get info on the DIP
    LOGGER.info("Compiling DIP info.")
    am_client = amclient.AMClient(
        package_uuid=dip_uuid,
        ss_url=ss_url,
        ss_user_name=ss_user,
        ss_api_key=ss_api_key,
    )
    dip_details = am_client.get_package_details()
    dip_info = {}
    dip_info["dip-uuid"] = dip_details["uuid"]
    dip_info["dip-path"] = dip_details["current_full_path"]

    # Get the DIP's bucket and region
    location_url = ss_url + dip_details["current_location"]
    location_response = requests.get(location_url, headers=headers, timeout=86400)
    space_url = ss_url + location_response.json()["space"]
    space_response = requests.get(space_url, headers=headers, timeout=86400)
dip_info["dip-bucket"] = space_response.json().get('bucket', "") dip_info["dip-region"] = space_response.json().get('region', "") dip_info["object-list"] = object_list dip_info["thumbnail-list"] = thumbnail_list dip_info["aip-uuid"] = aip_uuid # get related AIP package info dip_info["aip-path"] = aip_details["current_full_path"] # get bucket and region location_url = ss_url + aip_details["current_location"] headers = {"Authorization": "ApiKey " + ss_user + ":" + ss_api_key + ""} location_response = requests.get(location_url, headers=headers, timeout=86400) space_url = ss_url + location_response.json()['space'] space_response = requests.get(space_url, headers=headers, timeout=86400) dip_info["aip-bucket"] = space_response.json().get('bucket', "") dip_info["aip-region"] = space_response.json().get('region', "") # GET REPLICATED AIP PACKAGE INFO if aip_details["replicas"]: replica_uuid = os.path.basename(aip_details["replicas"][0][:-1]) am_client = amclient.AMClient( package_uuid=replica_uuid, ss_url=ss_url, ss_user_name=ss_user, ss_api_key=ss_api_key, ) replica_details = am_client.get_package_details() dip_info["replica-uuid"] = replica_uuid dip_info["replica-path"] = replica_details["current_full_path"] # get bucket and region location_url = ss_url + replica_details["current_location"] headers = { "Authorization": "ApiKey " + ss_user + ":" + ss_api_key + "" } location_response = requests.get(location_url, headers=headers, timeout=86400) space_url = ss_url + location_response.json()['space'] space_response = requests.get(space_url, headers=headers, timeout=86400) dip_info["replica-bucket"] = space_response.json().get('bucket', "") dip_info["replica-region"] = space_response.json().get('region', "") else: dip_info["replica-uuid"] = "" dip_info["replica-bucket"] = "" dip_info["replica-region"] = "" # Return the data return dip_info, mets
def main(
    ss_url,
    ss_user,
    ss_api_key,
    location_uuid,
    tmp_dir,
    output_dir,
    database_file,
    delete_local_copy,
    upload_type,
    pipeline_uuid,
    cp_location_uuid,
    ds_location_uuid,
    shared_directory,
    atom_url,
    atom_email,
    atom_password,
    atom_slug,
    rsync_target,
):
    LOGGER.info("Processing AIPs in SS location: %s", location_uuid)

    # Idempotently create database and Aip table and create session
    try:
        session = models.init(database_file)
    except IOError:
        LOGGER.error("Could not create database in: %s", database_file)
        return 1

    # Get UPLOADED and VERIFIED AIPs from the SS
    try:
        am_client = amclient.AMClient(
            ss_url=ss_url, ss_user_name=ss_user, ss_api_key=ss_api_key
        )
        # There is an issue in the SS API that prevents
        # filtering the results by location. See:
        # https://github.com/artefactual/archivematica-storage-service/issues/298
        aips = am_client.aips({"status__in": "UPLOADED,VERIFIED"})
    except Exception as e:
        LOGGER.error(e)
        return 2

    # Get only the AIPs from the specified location
    aip_uuids = filter_aips(aips, location_uuid)

    # Create DIPs for those AIPs
    for uuid in aip_uuids:
        try:
            # To avoid race conditions while checking for an existing AIP
            # and saving it, create the row directly and check for an
            # integrity error exception (the uuid is a unique column)
            db_aip = models.Aip(uuid=uuid)
            session.add(db_aip)
            session.commit()
        except exc.IntegrityError:
            session.rollback()
            LOGGER.debug("Skipping AIP (already processed/processing): %s", uuid)
            continue

        mets_type = "atom"
        if upload_type == "ss-upload":
            mets_type = "storage-service"

        dip_path = create_dip.main(
            ss_url=ss_url,
            ss_user=ss_user,
            ss_api_key=ss_api_key,
            aip_uuid=uuid,
            tmp_dir=tmp_dir,
            output_dir=output_dir,
            mets_type=mets_type,
        )

        # Do not try to upload on creation error
        if isinstance(dip_path, int):
            LOGGER.error("Could not create DIP from AIP: %s", uuid)
            continue

        if upload_type == "ss-upload":
            storage_service_upload.main(
                ss_url=ss_url,
                ss_user=ss_user,
                ss_api_key=ss_api_key,
                pipeline_uuid=pipeline_uuid,
                cp_location_uuid=cp_location_uuid,
                ds_location_uuid=ds_location_uuid,
                shared_directory=shared_directory,
                dip_path=dip_path,
                aip_uuid=uuid,
                delete_local_copy=delete_local_copy,
            )
        elif upload_type == "atom-upload":
            atom_upload.main(
                atom_url=atom_url,
                atom_email=atom_email,
                atom_password=atom_password,
                atom_slug=atom_slug,
                rsync_target=rsync_target,
                dip_path=dip_path,
                delete_local_copy=delete_local_copy,
            )

    LOGGER.info("All AIPs have been processed")
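# main() above depends on a filter_aips() helper that is not part of this
# excerpt. A minimal sketch follows, assuming each AIP dict carries a
# "current_location" resource URI as returned by the Storage Service API;
# the exact matching logic is an inference from the call site.
def filter_aips(aips, location_uuid):
    """Return the UUIDs of the AIPs stored in the given location."""
    location = "/api/v2/location/{}/".format(location_uuid)
    return [
        aip["uuid"]
        for aip in aips
        if aip.get("current_location") == location and "uuid" in aip
    ]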