def find_aip_dirname(mets_root):
    """Find name of AIP directory within AIP METS document.

    :param mets_root: AIP METS document root.
    :returns: AIP dirname or None.
    """
    div = xml_find_premis(mets_root, "mets:structMap/mets:div")
    # Guard against a missing structMap div so we return None as the
    # docstring promises instead of raising AttributeError.
    if div is None:
        return None
    return div.get("LABEL")
def find_aic_mets_filename(mets_root):
    """Find name of AIC METS file within AIP METS document.

    :param mets_root: AIP METS document root.
    :returns: AIC METS filename or None.
    """
    flocat = xml_find_premis(
        mets_root,
        "mets:fileSec/mets:fileGrp[@USE='metadata']/mets:file/mets:FLocat",
    )
    # Return None (as documented) when the FLocat element is absent
    # instead of raising AttributeError.
    if flocat is None:
        return None
    return flocat.get("{" + NSMAP["xlink"] + "}href")
def get_aips_in_aic(mets_root, archive_path, temp_dir):
    """Return the number of AIPs in the AIC, extracted from AIC METS file.

    :param mets_root: AIP METS document root.
    :param archive_path: path to the stored AIP archive on disk.
    :param temp_dir: directory into which the AIC METS file is extracted.
    :returns: count of AIPs in the AIC as a string, or None on any failure.
    """
    # Find name of AIC METS file and the AIP directory name.
    try:
        # aic_mets_filename includes metadata/
        aic_mets_filename = ns.xml_find_premis(
            mets_root,
            "mets:fileSec/mets:fileGrp[@USE='metadata']/mets:file/mets:FLocat",
        ).get("{" + ns.NSMAP["xlink"] + "}href")
        aip_dirname = ns.xml_find_premis(mets_root, "mets:structMap/mets:div").get(
            "LABEL"
        )
    except Exception:
        # Deliberately broad: any parsing error means the count is unknown.
        return None
    # Extract AIC METS file from the stored archive.
    aic_mets_path = extract_file(
        archive_path=archive_path,
        destination_dir=temp_dir,
        relative_path=os.path.join(aip_dirname, "data", aic_mets_filename),
    )
    # Parse for number of AIPs from the dcterms:extent element.
    aic_root = etree.parse(aic_mets_path)
    extent = ns.xml_find_premis(
        aic_root,
        "mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore/dcterms:extent",
    )
    try:
        # Raw string avoids the invalid "\d" escape-sequence warning.
        aips_in_aic = re.search(r"\d+", extent.text).group()
    except AttributeError:
        # Probably because extent was None
        # Or the search returned None
        return None
    return aips_in_aic
def find_aips_in_aic(aic_root):
    """Find extent of AIPs in AIC within AIC METS document.

    :param aic_root: AIC METS document root.
    :returns: Count of AIPs in AIC (as a string) or None.
    """
    extent = xml_find_premis(
        aic_root,
        "mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore/dcterms:extent",
    )
    try:
        # Raw string avoids the invalid "\d" escape-sequence warning.
        return re.search(r"\d+", extent.text).group()
    except AttributeError:
        # extent was None, or the text contained no digits.
        return None
def processAIPThenDeleteMETSFile(path, temp_dir, es_client, delete_existing_data=False): archive_file = os.path.basename(path) # Regex match the UUID - AIP might end with .7z, .tar.bz2, or # something else. match = re.search( r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", archive_file) if match is not None: aip_uuid = match.group() else: return -1 print("Processing AIP", aip_uuid) if delete_existing_data is True: print("Deleting AIP", aip_uuid, "from aips/aip and aips/aipfile.") elasticSearchFunctions.delete_aip(es_client, aip_uuid) elasticSearchFunctions.delete_aip_files(es_client, aip_uuid) # AIP filenames are <name>-<uuid><extension> # Index of match end is right before the extension subdir = archive_file[:match.end()] aip_name = subdir[:-37] mets_file = "METS." + aip_uuid + ".xml" mets_file_relative_path = os.path.join("data", mets_file) if os.path.isfile(path): mets_file_relative_path = os.path.join(subdir, mets_file_relative_path) path_to_mets = extract_file( archive_path=path, destination_dir=temp_dir, relative_path=mets_file_relative_path, ) # If AIC, need to extract number of AIPs in AIC to index as well aips_in_aic = None root = etree.parse(path_to_mets) try: aip_type = ns.xml_find_premis( root, "mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore/dcterms:type" ).text except AttributeError: pass else: if aip_type == "Archival Information Collection": aips_in_aic = get_aips_in_aic(root, path, temp_dir) aip_info = storage_service.get_file_info(uuid=aip_uuid) if not aip_info: print("Information not found in Storage Service for AIP UUID: ", aip_uuid) return 1 return elasticSearchFunctions.index_aip_and_files( client=es_client, uuid=aip_uuid, aip_stored_path=path, mets_staging_path=path_to_mets, name=aip_name, aip_size=aip_info[0]["size"], aips_in_aic=aips_in_aic, identifiers=[], # TODO get these )
def _index_aip_files(client, uuid, mets_path, name, identifiers=None, printfn=print):
    """Index AIP files from AIP with UUID `uuid` and METS at path `mets_path`.

    :param client: The ElasticSearch client.
    :param uuid: The UUID of the AIP we're indexing.
    :param mets_path: path on disk where the AIP's METS file is located.
    :param name: AIP name.
    :param identifiers: optional additional identifiers (MODS, Islandora, etc.).
    :param printfn: optional print function.
    :return: number of files indexed.
    """
    # None sentinel avoids the shared-mutable-default-argument pitfall.
    if identifiers is None:
        identifiers = []
    # Parse XML
    tree = ElementTree.parse(mets_path)
    root = tree.getroot()
    # TODO: Add a conditional to toggle this
    _remove_tool_output_from_mets(tree)
    # Extract isPartOf (for AIPs) or identifier (for AICs) from DublinCore
    dublincore = ns.xml_find_premis(
        root, "mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore")
    aic_identifier = None
    is_part_of = None
    if dublincore is not None:
        aip_type = ns.xml_findtext_premis(
            dublincore, "dc:type") or ns.xml_findtext_premis(
                dublincore, "dcterms:type")
        if aip_type == "Archival Information Collection":
            aic_identifier = ns.xml_findtext_premis(
                dublincore, "dc:identifier") or ns.xml_findtext_premis(
                    dublincore, "dcterms:identifier")
        elif aip_type == "Archival Information Package":
            is_part_of = ns.xml_findtext_premis(dublincore, "dcterms:isPartOf")
    # Establish structure to be indexed for each file item
    fileData = {
        "archivematicaVersion": version.get_version(),
        "AIPUUID": uuid,
        "sipName": name,
        "FILEUUID": "",
        "indexedAt": time.time(),
        "filePath": "",
        "fileExtension": "",
        "isPartOf": is_part_of,
        "AICID": aic_identifier,
        "METS": {
            "dmdSec": {},
            "amdSec": {}
        },
        "origin": get_dashboard_uuid(),
        "identifiers": identifiers,
        "transferMetadata": _extract_transfer_metadata(root),
    }
    # Index all files in a fileGrp with USE='original' or USE='metadata'
    original_files = ns.xml_findall_premis(
        root, "mets:fileSec/mets:fileGrp[@USE='original']/mets:file")
    metadata_files = ns.xml_findall_premis(
        root, "mets:fileSec/mets:fileGrp[@USE='metadata']/mets:file")
    files = original_files + metadata_files
    # Index AIC METS file if it exists
    for file_ in files:
        # Shallow copy: the nested "METS" dicts are shared with fileData,
        # which is why they are reset explicitly at the end of the loop.
        indexData = fileData.copy()
        # Get file UUID. If an ADMID exists, look in the amdSec for the UUID,
        # otherwise parse it out of the file ID.
        # 'Original' files have ADMIDs, 'Metadata' files don't
        admID = file_.attrib.get("ADMID", None)
        if admID is None:
            # Parse UUID from file ID
            fileUUID = None
            uuix_regex = r"\w{8}-?\w{4}-?\w{4}-?\w{4}-?\w{12}"
            uuids = re.findall(uuix_regex, file_.attrib["ID"])
            # Multiple UUIDs may be returned - if they are all identical, use
            # that UUID, otherwise use None.
            # To determine all UUIDs are identical, use the size of the set
            if len(set(uuids)) == 1:
                fileUUID = uuids[0]
        else:
            amdSecInfo = ns.xml_find_premis(
                root, "mets:amdSec[@ID='{}']".format(admID))
            fileUUID = ns.xml_findtext_premis(
                amdSecInfo,
                "mets:techMD/mets:mdWrap/mets:xmlData/premis:object/premis:objectIdentifier/premis:objectIdentifierValue",
            )
            # Index amdSec information
            xml = ElementTree.tostring(amdSecInfo)
            indexData["METS"]["amdSec"] = _rename_dict_keys_with_child_dicts(
                _normalize_dict_values(xmltodict.parse(xml)))
        # Get the parent division for the file pointer
        # by searching the physical structural map section (structMap)
        file_id = file_.attrib.get("ID", None)
        file_pointer_division = ns.xml_find_premis(
            root,
            "mets:structMap[@TYPE='physical']//mets:fptr[@FILEID='{}']/..".
            format(file_id),
        )
        if file_pointer_division is not None:
            # If the parent division has a DMDID attribute then index
            # its data from the descriptive metadata section (dmdSec)
            dmd_section_id = file_pointer_division.attrib.get("DMDID", None)
            if dmd_section_id is not None:
                # dmd_section_id can contain one id (e.g., "dmdSec_2")
                # or more than one (e.g., "dmdSec_2 dmdSec_3",
                # when a file has both DC and non-DC metadata).
                # Attempt to index only the DC dmdSec if available
                for dmd_section_id_item in dmd_section_id.split():
                    dmd_section_info = ns.xml_find_premis(
                        root,
                        "mets:dmdSec[@ID='{}']/mets:mdWrap[@MDTYPE='DC']/mets:xmlData"
                        .format(dmd_section_id_item),
                    )
                    if dmd_section_info is not None:
                        xml = ElementTree.tostring(dmd_section_info)
                        data = _rename_dict_keys_with_child_dicts(
                            _normalize_dict_values(xmltodict.parse(xml)))
                        indexData["METS"]["dmdSec"] = data
                        break
        indexData["FILEUUID"] = fileUUID
        # Get file path from FLocat and extension
        filePath = ns.xml_find_premis(
            file_, "mets:FLocat").attrib["{http://www.w3.org/1999/xlink}href"]
        indexData["filePath"] = filePath
        _, fileExtension = os.path.splitext(filePath)
        if fileExtension:
            indexData["fileExtension"] = fileExtension[1:].lower()
        # Index data
        _wait_for_cluster_yellow_status(client)
        _try_to_index(client, indexData, "aipfiles", printfn=printfn)
        # Reset fileData['METS']['amdSec'] and fileData['METS']['dmdSec'],
        # since they are updated in the loop above.
        # See http://stackoverflow.com/a/3975388 for explanation
        fileData["METS"]["amdSec"] = {}
        fileData["METS"]["dmdSec"] = {}
    return len(files)
def index_aip_and_files(
    client,
    uuid,
    aip_stored_path,
    mets_staging_path,
    name,
    aip_size,
    aips_in_aic=None,
    identifiers=None,
    encrypted=False,
    printfn=print,
):
    """Index AIP and AIP files with UUID `uuid` at path `path`.

    :param client: The ElasticSearch client.
    :param uuid: The UUID of the AIP we're indexing.
    :param aip_stored_path: path on disk where the AIP is located.
    :param mets_staging_path: path on disk where the AIP METS file is located.
    :param name: AIP name.
    :param aip_size: AIP size.
    :param aips_in_aic: optional number of AIPs stored in AIC.
    :param identifiers: optional additional identifiers (MODS, Islandora, etc.).
    :param encrypted: optional AIP encrypted boolean (defaults to `False`).
    :param printfn: optional print function.
    :return: 0 if succeeded, 1 otherwise.
    """
    # None sentinel avoids the shared-mutable-default-argument pitfall.
    if identifiers is None:
        identifiers = []
    # Stop if METS file is not at staging path.
    error_message = None
    if not os.path.exists(mets_staging_path):
        error_message = "METS file does not exist at: " + mets_staging_path
    if error_message:
        logger.error(error_message)
        printfn(error_message, file=sys.stderr)
        return 1
    printfn("AIP UUID: " + uuid)
    printfn("Indexing AIP ...")
    tree = ElementTree.parse(mets_staging_path)
    _remove_tool_output_from_mets(tree)
    root = tree.getroot()
    # Extract AIC identifier, other specially-indexed information
    aic_identifier = None
    is_part_of = None
    dublincore = ns.xml_find_premis(
        root, "mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore")
    if dublincore is not None:
        aip_type = ns.xml_findtext_premis(
            dublincore, "dc:type") or ns.xml_findtext_premis(
                dublincore, "dcterms:type")
        if aip_type == "Archival Information Collection":
            aic_identifier = ns.xml_findtext_premis(
                dublincore, "dc:identifier") or ns.xml_findtext_premis(
                    dublincore, "dcterms:identifier")
        is_part_of = ns.xml_findtext_premis(dublincore, "dcterms:isPartOf")
    # Convert METS XML to dict
    xml = ElementTree.tostring(root)
    mets_data = _rename_dict_keys_with_child_dicts(
        _normalize_dict_values(xmltodict.parse(xml)))
    # Pull the create time from the METS header.
    # Old METS did not use `metsHdr`.
    created = time.time()
    mets_hdr = ns.xml_find_premis(root, "mets:metsHdr")
    if mets_hdr is not None:
        mets_created_attr = mets_hdr.get("CREATEDATE")
        if mets_created_attr:
            try:
                created = calendar.timegm(
                    time.strptime(mets_created_attr, "%Y-%m-%dT%H:%M:%S"))
            except ValueError:
                printfn("Failed to parse METS CREATEDATE: %s" %
                        (mets_created_attr))
    aip_data = {
        "uuid": uuid,
        "name": name,
        "filePath": aip_stored_path,
        # Size is stored in MB.
        "size": aip_size / (1024 * 1024),
        "mets": mets_data,
        "origin": get_dashboard_uuid(),
        "created": created,
        "AICID": aic_identifier,
        "isPartOf": is_part_of,
        "countAIPsinAIC": aips_in_aic,
        "identifiers": identifiers,
        "transferMetadata": _extract_transfer_metadata(root),
        "encrypted": encrypted,
    }
    _wait_for_cluster_yellow_status(client)
    _try_to_index(client, aip_data, "aips", printfn=printfn)
    printfn("Done.")
    printfn("Indexing AIP files ...")
    files_indexed = _index_aip_files(
        client=client,
        uuid=uuid,
        mets_path=mets_staging_path,
        name=name,
        identifiers=identifiers,
        printfn=printfn,
    )
    printfn("Files indexed: " + str(files_indexed))
    return 0
def update_rights(job, mets, sip_uuid, state):
    """
    Add rightsMDs for updated PREMIS Rights.

    Supersedes rightsMDs in the METS that look deleted from the database,
    then adds rights elements for newly added and updated RightsStatements.

    :param job: job object used for printing status via pyprint.
    :param mets: metsrw METS document to update (modified in place).
    :param sip_uuid: UUID of the SIP the rights apply to.
    :param state: state object passed through to add_rights_elements.
    :returns: the updated METS document.
    """
    # Get original files to add rights to
    original_files = [f for f in mets.all_files() if f.use == "original"]
    # Check for deleted rights - exist in METS but not in DB
    # Cache rightsbasis in DB
    rightsmds_db = {}  # memoize
    for rightsbasis in models.RightsStatement.RIGHTS_BASIS_CHOICES:
        # ORIGINAL RightsStatements are unrelated to the old one.
        rightsmds_db[rightsbasis[0]] = models.RightsStatement.objects.filter(
            metadataappliestoidentifier=sip_uuid,
            metadataappliestotype_id=createmets2.SIPMetadataAppliesToType,
            rightsbasis=rightsbasis[0],
        ).exclude(status=models.METADATA_STATUS_ORIGINAL)
    for fsentry in original_files:
        # Only consider rightsMD subsections of the first amdSec.
        rightsmds = [
            s for s in fsentry.amdsecs[0].subsections if s.subsection == "rightsMD"
        ]
        for r in rightsmds:
            # Don't follow MDRef pointers (see #1083 for more details).
            if isinstance(r.contents, metsrw.metadata.MDRef):
                continue
            if r.status == "superseded":
                continue
            rightsbasis = ns.xml_find_premis(
                r.contents.document, ".//premis:rightsBasis"
            )
            if rightsbasis is None:
                continue
            basis = rightsbasis.text
            if basis == "Other":
                # "Other" rights use otherRightsBasis as the effective basis.
                otherrightsbasis = ns.xml_find_premis(
                    r.contents.document, ".//premis:otherRightsBasis"
                )
                if otherrightsbasis is not None:
                    basis = otherrightsbasis.text
            db_rights = rightsmds_db[basis]
            if (
                not db_rights
            ):  # TODO this may need to be more robust for RightsStatementRightsGranted
                # No matching non-ORIGINAL statement in the DB: mark the
                # METS rightsMD as superseded.
                job.pyprint("Rights", r.id_string, "looks deleted - making superseded")
                r.status = "superseded"
    # Check for newly added rights
    rights_list = models.RightsStatement.objects.filter(
        metadataappliestoidentifier=sip_uuid,
        metadataappliestotype_id=createmets2.SIPMetadataAppliesToType,
        status=models.METADATA_STATUS_ORIGINAL,
    )
    if not rights_list:
        job.pyprint("No new rights added")
    else:
        add_rights_elements(job, rights_list, original_files, state)
    # Check for updated rights
    rights_list = models.RightsStatement.objects.filter(
        metadataappliestoidentifier=sip_uuid,
        metadataappliestotype_id=createmets2.SIPMetadataAppliesToType,
        status=models.METADATA_STATUS_UPDATED,
    )
    if not rights_list:
        job.pyprint("No updated rights found")
    else:
        add_rights_elements(job, rights_list, original_files, state, updated=True)
    return mets