def create_dataverse_tabfile_dmdsec(job, sip_path, tabfile):
    """Return serialized dmdSecs associated with the given tabfile, if any.

    Searches every transfer-level external METS.xml under ``sip_path`` for an
    Item-type file entry whose path ends with ``tabfile``.

    :param job: Job object used for error reporting via ``job.pyprint``.
    :param str sip_path: Path to the SIP to search for METS.xml files.
    :param str tabfile: File name (suffix) of the tabfile to match.
    :return: List of serialized dmdSec elements for the first matching
        tabfile, or an empty list if no match is found.
    """
    logger.info("Create Dataverse tabfile dmdsec %s", sip_path)
    # Retrieve METS.xml from the file system.
    metadata_mets_paths = archivematicaFunctions.find_metadata_files(
        sip_path, "METS.xml", only_transfers=True)
    if not metadata_mets_paths:
        return []
    ret = []
    for metadata_path in metadata_mets_paths:
        try:
            mets = metsrw.METSDocument.fromfile(metadata_path)
        except metsrw.MetsError:
            # BUG FIX: the original caught ``mets.MetsError``, but ``mets``
            # is only bound when ``fromfile`` succeeds — a parse failure
            # raised NameError/UnboundLocalError instead of being handled.
            job.pyprint(
                "Could not parse external METS (Dataverse)",
                metadata_path,
                file=sys.stderr,
            )
            continue
        # Retrieve all Item DMDSecs from the METS.xml.
        for f in mets.all_files():
            if f.type == "Item" and f.path.endswith(tabfile):
                # Found the correct tabfile; return its dmdSecs immediately.
                return [d.serialize() for d in f.dmdsecs]
    return ret
def create_dataverse_sip_dmdsec(job, sip_path):
    """Return SIP-level Dataverse dmdSecs for inclusion in the AIP METS.

    Searches every transfer-level external METS.xml under ``sip_path`` and
    collects the dmdSecs attached to Directory-type entries.

    :param job: Job object used for error reporting via ``job.pyprint``.
    :param str sip_path: Path to the SIP to search for METS.xml files.
    :return: List of serialized dmdSec Elements (possibly empty).
    """
    logger.info("Create dataverse sip dmdsec %s", sip_path)
    # Retrieve METS.xml from the file system.
    metadata_mets_paths = archivematicaFunctions.find_metadata_files(
        sip_path, "METS.xml", only_transfers=True)
    if not metadata_mets_paths:
        return []
    ret = []
    for metadata_path in metadata_mets_paths:
        try:
            mets = metsrw.METSDocument.fromfile(metadata_path)
        except metsrw.MetsError:
            # BUG FIX: the original caught ``mets.MetsError``, but ``mets``
            # is only bound when ``fromfile`` succeeds — a parse failure
            # raised NameError/UnboundLocalError instead of being handled.
            job.pyprint(
                "Could not parse external METS (Dataverse)",
                metadata_path,
                file=sys.stderr,
            )
            continue
        # Retrieve all directory DMDSecs from the METS.xml.
        for f in mets.all_files():
            if f.type == "Directory" and f.dmdsecs:
                # Serialize and accumulate across all METS files.
                ret += [d.serialize() for d in f.dmdsecs]
    return ret
def parseMetadata(SIPPath):
    """Parse all metadata.csv files in SIPPath.

    Looking for metadata.csvs in metadata/ and
    objects/metadata/transfers/<transfer name>/metadata/

    See parseMetadataCSV for details on parsing.

    :param SIPPath: Path to the SIP
    :return: {<filename>: OrderedDict(key: [values]) }
    """
    all_metadata = {}
    metadata_csvs = archivematicaFunctions.find_metadata_files(
        SIPPath, 'metadata.csv')
    for metadataCSVFilePath in metadata_csvs:
        try:
            csv_metadata = parseMetadataCSV(metadataCSVFilePath)
        except Exception:
            # BUG FIX: the original used Python 2-only constructs
            # (``print >> sys.stderr`` and ``dict.iteritems()``), which are
            # syntax/attribute errors under Python 3; the sibling functions
            # in this file are already Python 3. Behavior is preserved.
            print("error parsing: ", metadataCSVFilePath, file=sys.stderr)
            traceback.print_exc(file=sys.stderr)
            sharedVariablesAcrossModules.globalErrorCount += 1
            continue
        # Provide warning if this file already has differing metadata.
        # Not using all_metadata.update(csv_metadata) because of that.
        for entry, values in csv_metadata.items():
            if entry in all_metadata and all_metadata[entry] != values:
                print('Metadata for', entry, 'being updated. Old:',
                      all_metadata[entry], 'New:', values, file=sys.stderr)
            existing = all_metadata.get(entry, collections.OrderedDict())
            existing.update(values)
            all_metadata[entry] = existing
    return all_metadata
def parseMetadata(job, SIPPath, state):
    """Parse every metadata.csv found in the SIP and merge the results.

    metadata.csv files are looked for in metadata/ and in
    objects/metadata/transfers/<transfer name>/metadata/; each one is parsed
    with parseMetadataCSV. Entries appearing in more than one CSV are merged
    key-by-key, with a warning when values differ.

    :param job: Job object used for printing output and errors.
    :param SIPPath: Path to the SIP.
    :param state: Processing state; its error accumulator is incremented on
        CSV parse failures.
    :return: {<filename>: OrderedDict(key: [values]) }
    """
    merged = {}
    for csv_path in archivematicaFunctions.find_metadata_files(
            SIPPath, "metadata.csv"):
        try:
            parsed = parseMetadataCSV(job, csv_path)
        except Exception:
            # Best-effort: record the failure and move on to the next CSV.
            job.pyprint("error parsing: ", csv_path, file=sys.stderr)
            job.print_error(traceback.format_exc())
            state.error_accumulator.error_count += 1
            continue
        # Merge per-entry rather than merged.update(parsed) so that a
        # conflicting re-definition can be detected and warned about.
        for entry, values in parsed.items():
            if entry in merged and merged[entry] != values:
                job.pyprint(
                    "Metadata for",
                    entry,
                    "being updated. Old:",
                    merged[entry],
                    "New:",
                    values,
                    file=sys.stderr,
                )
            current = merged.get(entry, collections.OrderedDict())
            current.update(values)
            merged[entry] = current
    return merged
def parse_archivesspace_ids(sip_path, sip_uuid):
    """Parse an archivesspaceids.csv to pre-populate the matching GUI.

    :param sip_path: Path to the SIP to check for an archivesspaceids.csv
    :param sip_uuid: UUID of the SIP to auto-populate ArchivesSpace IDs for
    :return: 0 on success, 1 on failure
    """
    # Check for archivesspaceids.csv
    csv_paths = archivematicaFunctions.find_metadata_files(
        sip_path, 'archivesspaceids.csv')
    if not csv_paths:
        print('No archivesspaceids.csv files found, exiting')
        return 0

    file_info = parse_archivesspaceids_csv(csv_paths)
    if not file_info:
        print('No information found in archivesspaceids.csv files')
        return 1
    print(file_info)

    # Create client
    client = create_archivesspace_client()
    if not client:
        return 1

    for filename, ref_id in file_info.items():
        # Get file object (for fileUUID, to see if in DIP)
        print(filename, ref_id, '%SIPLocation%' + filename)
        # The file may have been recorded under any of these location
        # prefixes; OR the alternatives together into a single query.
        location_q = None
        for prefix in ('%transferDirectory%', '%transferDirectory%objects/',
                       '%SIPDirectory%', '%SIPDirectory%objects/'):
            clause = Q(originallocation=prefix + filename)
            location_q = clause if location_q is None else location_q | clause
        try:
            file_obj = models.File.objects.get(location_q, sip_id=sip_uuid)
        except models.File.DoesNotExist:
            print(filename, 'not found in database, skipping')
            continue
        except models.File.MultipleObjectsReturned:
            print('Multiple entries for', filename,
                  'found in database, skipping')
            continue
        print('File:', file_obj)

        # Query ref_id to client for resource_id
        matches = client.find_by_id('archival_objects', 'ref_id', ref_id)
        try:
            resource_id = matches[0]['id']
        except IndexError:
            print('ArchivesSpace did not return an ID for', ref_id)
            print('Returned', matches)
            continue
        print('Resource ID:', resource_id)

        # Add to ArchivesSpaceDIPObjectResourcePairing
        models.ArchivesSpaceDIPObjectResourcePairing.objects.create(
            dipuuid=sip_uuid,
            fileuuid=file_obj.uuid,
            resourceid=resource_id,
        )
    # Check if any files were processed?
    return 0
def parse_archivesspace_ids(sip_path, sip_uuid):
    """Parse an archivesspaceids.csv to pre-populate the matching GUI.

    :param sip_path: Path to the SIP to check for an archivesspaceids.csv
    :param sip_uuid: UUID of the SIP to auto-populate ArchivesSpace IDs for
    :return: 0 on success, 1 on failure
    """
    # Check for archivesspaceids.csv
    csv_paths = archivematicaFunctions.find_metadata_files(
        sip_path, "archivesspaceids.csv")
    if not csv_paths:
        logger.info("No archivesspaceids.csv files found, exiting")
        return 0

    file_info = parse_archivesspaceids_csv(csv_paths)
    if not file_info:
        logger.info("No information found in archivesspaceids.csv files")
        return 1
    logger.info("File info: %s", file_info)

    # Create client
    client = create_archivesspace_client()
    if not client:
        return 1

    for filename, ref_id in file_info.items():
        # Get file object (for fileUUID, to see if in DIP)
        logger.debug('Getting file object: filename="%s" ref_id="%s"',
                     filename, ref_id)
        # The file may be recorded under any of these location prefixes;
        # OR the alternatives together into one query.
        location_q = None
        for prefix in ("%transferDirectory%", "%transferDirectory%objects/",
                       "%SIPDirectory%", "%SIPDirectory%objects/"):
            clause = Q(originallocation=prefix + filename)
            location_q = clause if location_q is None else location_q | clause
        try:
            file_obj = models.File.objects.get(location_q, sip_id=sip_uuid)
        except models.File.DoesNotExist:
            logger.error("%s not found in database, skipping", filename)
            continue
        except models.File.MultipleObjectsReturned:
            logger.error("Multiple entries for %s found in database, skipping",
                         filename)
            continue
        logger.debug("File: %s", file_obj)

        # Query ref_id to client for resource_id
        matches = client.find_by_id("archival_objects", "ref_id", ref_id)
        try:
            resource_id = matches[0]["id"]
        except IndexError:
            logger.error("ArchivesSpace did not return an ID for %s", ref_id)
            logger.error("Returned %s", matches)
            continue
        logger.debug("Resource ID: %s", resource_id)

        # Add to ArchivesSpaceDIPObjectResourcePairing
        models.ArchivesSpaceDIPObjectResourcePairing.objects.create(
            dipuuid=sip_uuid, fileuuid=file_obj.uuid, resourceid=resource_id)
    # Check if any files were processed?
    return 0