def create_dataverse_tabfile_dmdsec(job, sip_path, tabfile):
    """
    Return the serialized dmdSecs associated with the given tabfile, if any exist.
    """
    logger.info("Create Dataverse tabfile dmdsec %s", sip_path)
    # Retrieve METS.xml from the file system.
    metadata_mets_paths = archivematicaFunctions.find_metadata_files(
        sip_path, "METS.xml", only_transfers=True)
    if not metadata_mets_paths:
        return []
    for metadata_path in metadata_mets_paths:
        try:
            mets = metsrw.METSDocument.fromfile(metadata_path)
        except metsrw.MetsError:
            job.pyprint(
                "Could not parse external METS (Dataverse)",
                metadata_path,
                file=sys.stderr,
            )
            continue
        # Find the Item whose path matches the tabfile and return its dmdSecs.
        for f in mets.all_files():
            if f.type == "Item" and f.path.endswith(tabfile):
                # Found the correct tabfile
                return [d.serialize() for d in f.dmdsecs]
    return []

def create_dataverse_sip_dmdsec(job, sip_path):
    """
    Return SIP-level Dataverse dmdSecs for inclusion in the AIP METS.

    :param str sip_path: Path to the SIP
    :return: List of dmdSec Elements
    """
    logger.info("Create dataverse sip dmdsec %s", sip_path)
    # Retrieve METS.xml from the file system.
    metadata_mets_paths = archivematicaFunctions.find_metadata_files(
        sip_path, "METS.xml", only_transfers=True)
    if not metadata_mets_paths:
        return []
    ret = []
    for metadata_path in metadata_mets_paths:
        try:
            mets = metsrw.METSDocument.fromfile(metadata_path)
        except metsrw.MetsError:
            job.pyprint(
                "Could not parse external METS (Dataverse)",
                metadata_path,
                file=sys.stderr,
            )
            continue
        # Retrieve all directory DMDSecs from the METS.xml.
        for f in mets.all_files():
            if f.type == "Directory" and f.dmdsecs:
                # Serialize each dmdSec to an Element and accumulate it.
                ret += [d.serialize() for d in f.dmdsecs]
    return ret
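
Both helpers return already-serialized dmdSec elements (lxml Elements, as produced by metsrw's serialize()), so a caller assembling the AIP METS can append them directly. A minimal usage sketch, where aip_mets_root and the tabfile name are hypothetical stand-ins for values the real caller would supply:

# Usage sketch with hypothetical names: attach Dataverse dmdSecs to the AIP
# METS being assembled. `aip_mets_root` is assumed to be an lxml element.
for dmdsec in create_dataverse_sip_dmdsec(job, sip_path):
    aip_mets_root.append(dmdsec)  # SIP-level descriptive metadata

for dmdsec in create_dataverse_tabfile_dmdsec(job, sip_path, "study_052.tab"):
    aip_mets_root.append(dmdsec)  # metadata for one derivative tabfile
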
def parseMetadata(SIPPath):
    """
    Parse all metadata.csv files in SIPPath.

    Looks for metadata.csv files in metadata/ and
    objects/metadata/transfers/<transfer name>/metadata/.

    See parseMetadataCSV for details on parsing.

    :param SIPPath: Path to the SIP
    :return: {<filename>: OrderedDict(key: [values])}
    """
    all_metadata = {}
    metadata_csvs = archivematicaFunctions.find_metadata_files(
        SIPPath, 'metadata.csv')

    for metadataCSVFilePath in metadata_csvs:
        try:
            csv_metadata = parseMetadataCSV(metadataCSVFilePath)
        except Exception:
            print >> sys.stderr, "error parsing: ", metadataCSVFilePath
            traceback.print_exc(file=sys.stderr)
            sharedVariablesAcrossModules.globalErrorCount += 1
            continue
        # Warn if this file already has differing metadata. Entries are
        # merged per key rather than via all_metadata.update(csv_metadata)
        # so values from earlier metadata.csv files are preserved.
        for entry, values in csv_metadata.iteritems():
            if entry in all_metadata and all_metadata[entry] != values:
                print >> sys.stderr, 'Metadata for', entry, \
                    'being updated. Old:', all_metadata[entry], \
                    'New:', values
            existing = all_metadata.get(entry, collections.OrderedDict())
            existing.update(values)
            all_metadata[entry] = existing

    return all_metadata

def parseMetadata(job, SIPPath, state):
    """
    Parse all metadata.csv files in SIPPath.

    Looks for metadata.csv files in metadata/ and
    objects/metadata/transfers/<transfer name>/metadata/.

    See parseMetadataCSV for details on parsing.

    :param job: Job instance used for script output
    :param SIPPath: Path to the SIP
    :param state: Accumulator that tracks parsing errors
    :return: {<filename>: OrderedDict(key: [values])}
    """
    all_metadata = {}
    metadata_csvs = archivematicaFunctions.find_metadata_files(
        SIPPath, "metadata.csv")

    for metadataCSVFilePath in metadata_csvs:
        try:
            csv_metadata = parseMetadataCSV(job, metadataCSVFilePath)
        except Exception:
            job.pyprint("error parsing: ",
                        metadataCSVFilePath,
                        file=sys.stderr)
            job.print_error(traceback.format_exc())
            state.error_accumulator.error_count += 1
            continue
        # Warn if this file already has differing metadata. Entries are
        # merged per key rather than via all_metadata.update(csv_metadata)
        # so values from earlier metadata.csv files are preserved.
        for entry, values in csv_metadata.items():
            if entry in all_metadata and all_metadata[entry] != values:
                job.pyprint(
                    "Metadata for",
                    entry,
                    "being updated. Old:",
                    all_metadata[entry],
                    "New:",
                    values,
                    file=sys.stderr,
                )
            existing = all_metadata.get(entry, collections.OrderedDict())
            existing.update(values)
            all_metadata[entry] = existing

    return all_metadata
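
For reference, parseMetadataCSV (not shown here) is expected to map each row's filename to an OrderedDict of column name to list of values, matching the return shape in the docstring; repeated columns accumulate into the list. A sketch of the structure this function returns, with illustrative paths and Dublin Core fields:

# Sketch of the return shape described in the docstring. The file path and
# field values below are illustrative, not taken from a real transfer.
import collections

all_metadata = {
    "objects/bird.mp3": collections.OrderedDict([
        ("dc.title", ["Birds in Spring"]),
        ("dc.creator", ["J. Doe", "A. N. Other"]),  # repeated column -> list
    ]),
}
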
def parse_archivesspace_ids(sip_path, sip_uuid):
    """
    Parse an archivesspaceids.csv to pre-populate the matching GUI.

    :param sip_path: Path to the SIP to check for an archivesspaceids.csv
    :param sip_uuid: UUID of the SIP to auto-populate ArchivesSpace IDs for
    :return: 0 on success, 1 on failure
    """
    # Check for archivesspaceids.csv
    csv_paths = archivematicaFunctions.find_metadata_files(
        sip_path, 'archivesspaceids.csv')
    if not csv_paths:
        print('No archivesspaceids.csv files found, exiting')
        return 0

    file_info = parse_archivesspaceids_csv(csv_paths)
    if not file_info:
        print('No information found in archivesspaceids.csv files')
        return 1
    print(file_info)

    # Create client
    client = create_archivesspace_client()
    if not client:
        return 1

    for filename, ref_id in file_info.items():
        # Look up the File object (for its UUID, and to confirm it is in the DIP)
        print(filename, ref_id, '%SIPLocation%' + filename)
        try:
            f = models.File.objects.get(
                Q(originallocation='%transferDirectory%' + filename)
                | Q(originallocation='%transferDirectory%objects/' + filename)
                | Q(originallocation='%SIPDirectory%' + filename)
                | Q(originallocation='%SIPDirectory%objects/' + filename),
                sip_id=sip_uuid)
        except models.File.DoesNotExist:
            print(filename, 'not found in database, skipping')
            continue
        except models.File.MultipleObjectsReturned:
            print('Multiple entries for', filename,
                  'found in database, skipping')
            continue
        print('File:', f)

        # Query ref_id to client for resource_id
        resource = client.find_by_id('archival_objects', 'ref_id', ref_id)
        try:
            resource_id = resource[0]['id']
        except IndexError:
            print('ArchivesSpace did not return an ID for', ref_id)
            print('Returned', resource)
            continue
        print('Resource ID:', resource_id)

        # Add to ArchivesSpaceDIPObjectResourcePairing
        models.ArchivesSpaceDIPObjectResourcePairing.objects.create(
            dipuuid=sip_uuid,
            fileuuid=f.uuid,
            resourceid=resource_id,
        )

    # TODO: should this fail if no files were processed?
    return 0
def parse_archivesspace_ids(sip_path, sip_uuid):
    """
    Parse an archivesspaceids.csv to pre-populate the matching GUI.

    :param sip_path: Path to the SIP to check for an archivesspaceids.csv
    :param sip_uuid: UUID of the SIP to auto-populate ArchivesSpace IDs for
    :return: 0 on success, 1 on failure
    """
    # Check for archivesspaceids.csv
    csv_paths = archivematicaFunctions.find_metadata_files(
        sip_path, "archivesspaceids.csv")
    if not csv_paths:
        logger.info("No archivesspaceids.csv files found, exiting")
        return 0

    file_info = parse_archivesspaceids_csv(csv_paths)
    if not file_info:
        logger.info("No information found in archivesspaceids.csv files")
        return 1

    logger.info("File info: %s", file_info)

    # Create client
    client = create_archivesspace_client()
    if not client:
        return 1

    for filename, ref_id in file_info.items():
        # Look up the File object (for its UUID, and to confirm it is in the DIP)
        logger.debug('Getting file object: filename="%s" ref_id="%s"',
                     filename, ref_id)
        try:
            f = models.File.objects.get(
                Q(originallocation="%transferDirectory%" + filename)
                | Q(originallocation="%transferDirectory%objects/" + filename)
                | Q(originallocation="%SIPDirectory%" + filename)
                | Q(originallocation="%SIPDirectory%objects/" + filename),
                sip_id=sip_uuid,
            )
        except models.File.DoesNotExist:
            logger.error("%s not found in database, skipping", filename)
            continue
        except models.File.MultipleObjectsReturned:
            logger.error("Multiple entries for %s found in database, skipping",
                         filename)
            continue
        logger.debug("File: %s", f)

        # Query ref_id to client for resource_id
        resource = client.find_by_id("archival_objects", "ref_id", ref_id)
        try:
            resource_id = resource[0]["id"]
        except IndexError:
            logger.error("ArchivesSpace did not return an ID for %s", ref_id)
            logger.error("Returned %s", resource)
            continue
        logger.debug("Resource ID: %s", resource_id)

        # Add to ArchivesSpaceDIPObjectResourcePairing
        models.ArchivesSpaceDIPObjectResourcePairing.objects.create(
            dipuuid=sip_uuid, fileuuid=f.uuid, resourceid=resource_id)

    # TODO: should this fail if no files were processed?
    return 0
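
The flow above assumes parse_archivesspaceids_csv (not shown) reduces the CSV file(s) to a mapping of original file path to ArchivesSpace ref_id. A hypothetical input and the resulting file_info consumed by the matching loop:

# Hypothetical archivesspaceids.csv rows (file path, ref_id):
#
#   objects/image_001.tif,8c2fab4e9d6a41c3b7f0
#   objects/image_002.tif,1d9e03c7aa2b4f68e5c1
#
# parse_archivesspaceids_csv is assumed to return:
file_info = {
    "objects/image_001.tif": "8c2fab4e9d6a41c3b7f0",
    "objects/image_002.tif": "1d9e03c7aa2b4f68e5c1",
}
# Each ref_id is resolved to an archival object with
# client.find_by_id("archival_objects", "ref_id", ref_id), then paired with
# the file's UUID in ArchivesSpaceDIPObjectResourcePairing.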