Exemplo n.º 1
0
def _extract_transfer_metadata(doc):
    """Collect the ``transfer_metadata`` elements of a METS document as dicts.

    :param doc: element searched for ``transfer_metadata`` nodes located under
        ``mets:amdSec/mets:sourceMD/mets:mdWrap/mets:xmlData``.
    :return: list with one entry per matching element; each entry is the value
        found under the ``transfer_metadata`` key of the element's xmltodict
        parse.
    """
    xpath = "mets:amdSec/mets:sourceMD/mets:mdWrap/mets:xmlData/transfer_metadata"
    extracted = []
    for element in doc.findall(xpath, namespaces=ns.NSMAP):
        # Serialize the element so xmltodict can turn it into a mapping.
        serialized = ElementTree.tostring(element)
        extracted.append(xmltodict.parse(serialized)['transfer_metadata'])
    return extracted
Exemplo n.º 2
0
def _extract_transfer_metadata(doc):
    """Collect the ``transfer_metadata`` elements of a METS document as dicts.

    :param doc: element handed to ``ns.xml_findall_premis`` together with the
        ``mets:amdSec/mets:sourceMD/mets:mdWrap/mets:xmlData/transfer_metadata``
        path.
    :return: list with one entry per element found; each entry is the value
        stored under the ``transfer_metadata`` key of the element's xmltodict
        parse.
    """
    xpath = "mets:amdSec/mets:sourceMD/mets:mdWrap/mets:xmlData/transfer_metadata"
    extracted = []
    for element in ns.xml_findall_premis(doc, xpath):
        # Serialize the element so xmltodict can turn it into a mapping.
        serialized = ElementTree.tostring(element)
        extracted.append(xmltodict.parse(serialized)["transfer_metadata"])
    return extracted
def index_aip(client,
              uuid,
              name,
              filePath,
              pathToMETS,
              size=None,
              aips_in_aic=None,
              identifiers=None):
    """Build an AIP document from its METS file and index it.

    :param client: client handed to ``wait_for_cluster_yellow_status`` and
        ``try_to_index``.
    :param uuid: value stored under the document's ``uuid`` key.
    :param name: value stored under the document's ``name`` key.
    :param filePath: path stored under ``filePath``; its on-disk size is used
        when ``size`` is falsy.
    :param pathToMETS: path of the METS XML file to parse; its modification
        time is stored under ``created``.
    :param size: optional size, divided by 1024 twice before being stored
        (presumably bytes to MiB -- confirm against callers).
    :param aips_in_aic: optional value stored under ``countAIPsinAIC``.
    :param identifiers: optional list stored under ``identifiers``; a new
        empty list is used when omitted.
    """
    # A ``[]`` default is a single list object shared by every call (and by
    # every document handed to the indexer), so create a fresh one per call.
    if identifiers is None:
        identifiers = []

    tree = ElementTree.parse(pathToMETS)

    # TODO add a conditional to toggle this
    remove_tool_output_from_mets(tree)

    root = tree.getroot()
    # Extract AIC identifier, other specially-indexed information
    aic_identifier = None
    is_part_of = None
    dublincore = root.find(
        'mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore',
        namespaces=ns.NSMAP)
    if dublincore is not None:
        # Each value may live in either the dc: or the dcterms: namespace.
        aip_type = dublincore.findtext(
            'dc:type', namespaces=ns.NSMAP) or dublincore.findtext(
                'dcterms:type', namespaces=ns.NSMAP)
        if aip_type == "Archival Information Collection":
            aic_identifier = dublincore.findtext(
                'dc:identifier', namespaces=ns.NSMAP) or dublincore.findtext(
                    'dcterms:identifier', namespaces=ns.NSMAP)
        is_part_of = dublincore.findtext('dcterms:isPartOf',
                                         namespaces=ns.NSMAP)

    # convert METS XML to dict
    xml = ElementTree.tostring(root)
    mets_data = rename_dict_keys_with_child_dicts(
        normalize_dict_values(xmltodict.parse(xml)))

    aipData = {
        'uuid': uuid,
        'name': name,
        'filePath': filePath,
        'size': (size or os.path.getsize(filePath)) / 1024 / 1024,
        'mets': mets_data,
        'origin': get_dashboard_uuid(),
        'created': os.path.getmtime(pathToMETS),
        'AICID': aic_identifier,
        'isPartOf': is_part_of,
        'countAIPsinAIC': aips_in_aic,
        'identifiers': identifiers,
        'transferMetadata': _extract_transfer_metadata(root),
    }
    wait_for_cluster_yellow_status(client)
    try_to_index(client, aipData, 'aips', 'aip')
Exemplo n.º 4
0
def index_aip(client, uuid, name, filePath, pathToMETS, size=None, aips_in_aic=None, identifiers=None, encrypted=False):
    """Build an AIP document from its METS file and index it.

    :param client: client handed to ``wait_for_cluster_yellow_status`` and
        ``try_to_index``.
    :param uuid: value stored under the document's ``uuid`` key.
    :param name: value stored under the document's ``name`` key.
    :param filePath: path stored under ``filePath``; its on-disk size is used
        when ``size`` is falsy.
    :param pathToMETS: path of the METS XML file to parse.
    :param size: optional size, divided by 1024 twice before being stored
        (presumably bytes to MiB -- confirm against callers).
    :param aips_in_aic: optional value stored under ``countAIPsinAIC``.
    :param identifiers: optional list stored under ``identifiers``; a new
        empty list is used when omitted.
    :param encrypted: value stored under ``encrypted`` (defaults to `False`).
    """
    # A ``[]`` default is a single list object shared by every call (and by
    # every document handed to the indexer), so create a fresh one per call.
    if identifiers is None:
        identifiers = []

    tree = ElementTree.parse(pathToMETS)

    # TODO add a conditional to toggle this
    remove_tool_output_from_mets(tree)

    root = tree.getroot()
    # Extract AIC identifier, other specially-indexed information
    aic_identifier = None
    is_part_of = None
    dublincore = root.find('mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore', namespaces=ns.NSMAP)
    if dublincore is not None:
        aip_type = dublincore.findtext('dc:type', namespaces=ns.NSMAP) or dublincore.findtext('dcterms:type', namespaces=ns.NSMAP)
        if aip_type == "Archival Information Collection":
            aic_identifier = dublincore.findtext('dc:identifier', namespaces=ns.NSMAP) or dublincore.findtext('dcterms:identifier', namespaces=ns.NSMAP)
        is_part_of = dublincore.findtext('dcterms:isPartOf', namespaces=ns.NSMAP)

    # convert METS XML to dict
    xml = ElementTree.tostring(root)
    mets_data = rename_dict_keys_with_child_dicts(normalize_dict_values(xmltodict.parse(xml)))

    # Pull the create time from the METS header, falling back to "now" when
    # the header or its CREATEDATE attribute is missing or unparseable.
    created = time.time()
    mets_hdr = root.find("mets:metsHdr", namespaces=ns.NSMAP)
    # ``find`` returns None when the document has no metsHdr element; guard
    # the attribute lookup so such a METS file does not abort indexing.
    mets_created_attr = mets_hdr.get('CREATEDATE') if mets_hdr is not None else None

    if mets_created_attr:
        try:
            created = calendar.timegm(time.strptime(mets_created_attr, '%Y-%m-%dT%H:%M:%S'))
        except ValueError:
            print("Failed to parse METS CREATEDATE: %s" % (mets_created_attr))

    aipData = {
        'uuid': uuid,
        'name': name,
        'filePath': filePath,
        'size': (size or os.path.getsize(filePath)) / 1024 / 1024,
        'mets': mets_data,
        'origin': get_dashboard_uuid(),
        'created': created,
        'AICID': aic_identifier,
        'isPartOf': is_part_of,
        'countAIPsinAIC': aips_in_aic,
        'identifiers': identifiers,
        'transferMetadata': _extract_transfer_metadata(root),
        'encrypted': encrypted
    }
    wait_for_cluster_yellow_status(client)
    try_to_index(client, aipData, 'aips', 'aip')
Exemplo n.º 5
0
def index_mets_file_metadata(client,
                             uuid,
                             metsFilePath,
                             index,
                             type_,
                             sipName,
                             identifiers=None):
    """Index one document per original/metadata file listed in a METS file.

    :param client: client handed to ``wait_for_cluster_yellow_status`` and
        ``try_to_index``.
    :param uuid: value stored under each document's ``AIPUUID`` key.
    :param metsFilePath: path of the METS XML file to parse.
    :param index: index argument forwarded to ``try_to_index``.
    :param type_: type argument forwarded to ``try_to_index``.
    :param sipName: value stored under each document's ``sipName`` key.
    :param identifiers: optional list stored under ``identifiers``; a new
        empty list is used when omitted.
    :return: number of files found in the ``original`` and ``metadata``
        file groups.
    """
    # A ``[]`` default is a single list object shared by every call (and by
    # every document handed to the indexer), so create a fresh one per call.
    if identifiers is None:
        identifiers = []

    # parse XML
    tree = ElementTree.parse(metsFilePath)
    root = tree.getroot()

    # TODO add a conditional to toggle this
    remove_tool_output_from_mets(tree)

    # get SIP-wide dmdSec
    dmdSec = root.findall("mets:dmdSec/mets:mdWrap/mets:xmlData",
                          namespaces=ns.NSMAP)
    dmdSecData = {}
    # NOTE(review): every match overwrites the previous one, so only the last
    # xmlData element ends up indexed -- confirm this is intended.
    for item in dmdSec:
        xml = ElementTree.tostring(item)
        dmdSecData = xmltodict.parse(xml)

    # Extract isPartOf (for AIPs) or identifier (for AICs) from DublinCore
    dublincore = root.find(
        'mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore',
        namespaces=ns.NSMAP)
    aic_identifier = None
    is_part_of = None
    if dublincore is not None:
        aip_type = dublincore.findtext(
            'dc:type', namespaces=ns.NSMAP) or dublincore.findtext(
                'dcterms:type', namespaces=ns.NSMAP)
        if aip_type == "Archival Information Collection":
            aic_identifier = dublincore.findtext(
                'dc:identifier', namespaces=ns.NSMAP) or dublincore.findtext(
                    'dcterms:identifier', namespaces=ns.NSMAP)
        elif aip_type == "Archival Information Package":
            is_part_of = dublincore.findtext('dcterms:isPartOf',
                                             namespaces=ns.NSMAP)

    # establish structure to be indexed for each file item
    fileData = {
        'archivematicaVersion': version.get_version(),
        'AIPUUID': uuid,
        'sipName': sipName,
        'FILEUUID': '',
        'indexedAt': time.time(),
        'filePath': '',
        'fileExtension': '',
        'isPartOf': is_part_of,
        'AICID': aic_identifier,
        'METS': {
            'dmdSec':
            rename_dict_keys_with_child_dicts(
                normalize_dict_values(dmdSecData)),
            'amdSec': {},
        },
        'origin': get_dashboard_uuid(),
        'identifiers': identifiers,
        'transferMetadata': _extract_transfer_metadata(root),
    }

    # Index all files in a fileGrp with USE='original' or USE='metadata'
    original_files = root.findall(
        "mets:fileSec/mets:fileGrp[@USE='original']/mets:file",
        namespaces=ns.NSMAP)
    metadata_files = root.findall(
        "mets:fileSec/mets:fileGrp[@USE='metadata']/mets:file",
        namespaces=ns.NSMAP)
    files = original_files + metadata_files

    # The pattern does not depend on the file, so build it once
    uuid_pattern = re.compile(r'\w{8}-?\w{4}-?\w{4}-?\w{4}-?\w{12}')

    # Build and index one document per file
    for file_ in files:
        # ``dict(fileData)`` is a shallow copy, so give this document its own
        # 'METS' dict; per-file amdSec data then never touches the template.
        indexData = dict(fileData)
        indexData['METS'] = {
            'dmdSec': fileData['METS']['dmdSec'],
            'amdSec': {},
        }

        # Get file UUID.  If an ADMID exists, look in the amdSec for the UUID,
        # otherwise parse it out of the file ID.
        # 'Original' files have ADMIDs, 'Metadata' files don't
        admID = file_.attrib.get('ADMID', None)
        if admID is None:
            # Parse UUID from file ID
            fileUUID = None
            uuids = uuid_pattern.findall(file_.attrib['ID'])
            # Multiple UUIDs may be returned - if they are all identical, use that
            # UUID, otherwise use None.
            # To determine all UUIDs are identical, use the size of the set
            if len(set(uuids)) == 1:
                fileUUID = uuids[0]
        else:
            amdSecInfo = root.find("mets:amdSec[@ID='{}']".format(admID),
                                   namespaces=ns.NSMAP)
            fileUUID = amdSecInfo.findtext(
                "mets:techMD/mets:mdWrap/mets:xmlData/premis:object/premis:objectIdentifier/premis:objectIdentifierValue",
                namespaces=ns.NSMAP)

            # Index amdSec information
            xml = ElementTree.tostring(amdSecInfo)
            indexData['METS']['amdSec'] = rename_dict_keys_with_child_dicts(
                normalize_dict_values(xmltodict.parse(xml)))

        indexData['FILEUUID'] = fileUUID

        # Get file path from FLocat and extension
        filePath = file_.find(
            'mets:FLocat',
            namespaces=ns.NSMAP).attrib['{http://www.w3.org/1999/xlink}href']
        indexData['filePath'] = filePath
        _, fileExtension = os.path.splitext(filePath)
        if fileExtension:
            # Drop the leading dot and normalise the case
            indexData['fileExtension'] = fileExtension[1:].lower()

        # index data
        wait_for_cluster_yellow_status(client)
        try_to_index(client, indexData, index, type_)

    print('Indexed AIP files and corresponding METS XML.')

    return len(files)
Exemplo n.º 6
0
def _index_aip_files(client,
                     uuid,
                     mets_path,
                     name,
                     identifiers=None,
                     printfn=print):
    """Index AIP files from AIP with UUID `uuid` and METS at path `mets_path`.

    :param client: The ElasticSearch client.
    :param uuid: The UUID of the AIP we're indexing.
    :param mets_path: path on disk where the AIP's METS file is located.
    :param name: AIP name.
    :param identifiers: optional additional identifiers (MODS, Islandora, etc.);
        a new empty list is used when omitted.
    :param printfn: optional print function.
    :return: number of files indexed.
    """
    # A ``[]`` default is a single list object shared by every call (and by
    # every document handed to the indexer), so create a fresh one per call.
    if identifiers is None:
        identifiers = []

    # Parse XML
    tree = ElementTree.parse(mets_path)
    root = tree.getroot()

    # TODO: Add a conditional to toggle this
    _remove_tool_output_from_mets(tree)

    # Extract isPartOf (for AIPs) or identifier (for AICs) from DublinCore
    dublincore = root.find(
        'mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore',
        namespaces=ns.NSMAP)
    aic_identifier = None
    is_part_of = None
    if dublincore is not None:
        aip_type = dublincore.findtext(
            'dc:type', namespaces=ns.NSMAP) or dublincore.findtext(
                'dcterms:type', namespaces=ns.NSMAP)
        if aip_type == "Archival Information Collection":
            aic_identifier = dublincore.findtext(
                'dc:identifier', namespaces=ns.NSMAP) or dublincore.findtext(
                    'dcterms:identifier', namespaces=ns.NSMAP)
        elif aip_type == "Archival Information Package":
            is_part_of = dublincore.findtext('dcterms:isPartOf',
                                             namespaces=ns.NSMAP)

    # Establish structure to be indexed for each file item
    fileData = {
        'archivematicaVersion': version.get_version(),
        'AIPUUID': uuid,
        'sipName': name,
        'FILEUUID': '',
        'indexedAt': time.time(),
        'filePath': '',
        'fileExtension': '',
        'isPartOf': is_part_of,
        'AICID': aic_identifier,
        'METS': {
            'dmdSec': {},
            'amdSec': {},
        },
        'origin': get_dashboard_uuid(),
        'identifiers': identifiers,
        'transferMetadata': _extract_transfer_metadata(root),
    }

    # Index all files in a fileGrp with USE='original' or USE='metadata'
    original_files = root.findall(
        "mets:fileSec/mets:fileGrp[@USE='original']/mets:file",
        namespaces=ns.NSMAP)
    metadata_files = root.findall(
        "mets:fileSec/mets:fileGrp[@USE='metadata']/mets:file",
        namespaces=ns.NSMAP)
    files = original_files + metadata_files

    # The pattern does not depend on the file, so build it once
    uuid_pattern = re.compile(r'\w{8}-?\w{4}-?\w{4}-?\w{4}-?\w{12}')

    # Build and index one document per file
    for file_ in files:
        # ``dict(fileData)`` is a shallow copy, so give this document its own
        # 'METS' dict; per-file amdSec/dmdSec data then never touches the
        # template and no reset is needed after indexing.
        indexData = dict(fileData)
        indexData['METS'] = {'dmdSec': {}, 'amdSec': {}}

        # Get file UUID.  If an ADMID exists, look in the amdSec for the UUID,
        # otherwise parse it out of the file ID.
        # 'Original' files have ADMIDs, 'Metadata' files don't
        admID = file_.attrib.get('ADMID', None)
        if admID is None:
            # Parse UUID from file ID
            fileUUID = None
            uuids = uuid_pattern.findall(file_.attrib['ID'])
            # Multiple UUIDs may be returned - if they are all identical, use that
            # UUID, otherwise use None.
            # To determine all UUIDs are identical, use the size of the set
            if len(set(uuids)) == 1:
                fileUUID = uuids[0]
        else:
            amdSecInfo = root.find("mets:amdSec[@ID='{}']".format(admID),
                                   namespaces=ns.NSMAP)
            fileUUID = amdSecInfo.findtext(
                "mets:techMD/mets:mdWrap/mets:xmlData/premis:object/premis:objectIdentifier/premis:objectIdentifierValue",
                namespaces=ns.NSMAP)

            # Index amdSec information
            xml = ElementTree.tostring(amdSecInfo)
            indexData['METS']['amdSec'] = _rename_dict_keys_with_child_dicts(
                _normalize_dict_values(xmltodict.parse(xml)))

        # Get the parent division for the file pointer
        # by searching the physical structural map section (structMap)
        file_id = file_.attrib.get('ID', None)
        file_pointer_division = root.find(
            "mets:structMap[@TYPE='physical']//mets:fptr[@FILEID='{}']/..".
            format(file_id),
            namespaces=ns.NSMAP)
        if file_pointer_division is not None:
            # If the parent division has a DMDID attribute then index
            # its data from the descriptive metadata section (dmdSec)
            dmd_section_id = file_pointer_division.attrib.get('DMDID', None)
            if dmd_section_id is not None:
                # dmd_section_id can contain one id (e.g., "dmdSec_2")
                # or more than one (e.g., "dmdSec_2 dmdSec_3",
                # when a file has both DC and non-DC metadata).
                # Attempt to index only the DC dmdSec if available
                for dmd_section_id_item in dmd_section_id.split():
                    dmd_section_info = root.find(
                        "mets:dmdSec[@ID='{}']/mets:mdWrap[@MDTYPE='DC']/mets:xmlData"
                        .format(dmd_section_id_item),
                        namespaces=ns.NSMAP)
                    if dmd_section_info is not None:
                        xml = ElementTree.tostring(dmd_section_info)
                        data = _rename_dict_keys_with_child_dicts(
                            _normalize_dict_values(xmltodict.parse(xml)))
                        indexData["METS"]["dmdSec"] = data
                        break

        indexData['FILEUUID'] = fileUUID

        # Get file path from FLocat and extension
        filePath = file_.find(
            'mets:FLocat',
            namespaces=ns.NSMAP).attrib['{http://www.w3.org/1999/xlink}href']
        indexData['filePath'] = filePath
        _, fileExtension = os.path.splitext(filePath)
        if fileExtension:
            # Drop the leading dot and normalise the case
            indexData['fileExtension'] = fileExtension[1:].lower()

        # Index data
        _wait_for_cluster_yellow_status(client)
        _try_to_index(client, indexData, 'aipfiles', printfn=printfn)

    return len(files)
Exemplo n.º 7
0
def index_aip_and_files(client,
                        uuid,
                        aip_stored_path,
                        mets_staging_path,
                        name,
                        aip_size,
                        aips_in_aic=None,
                        identifiers=None,
                        encrypted=False,
                        printfn=print):
    """Index AIP and AIP files with UUID `uuid`.

    :param client: The ElasticSearch client.
    :param uuid: The UUID of the AIP we're indexing.
    :param aip_stored_path: path on disk where the AIP is located.
    :param mets_staging_path: path on disk where the AIP METS file is located.
    :param name: AIP name.
    :param aip_size: AIP size.
    :param aips_in_aic: optional number of AIPs stored in AIC.
    :param identifiers: optional additional identifiers (MODS, Islandora, etc.);
        a new empty list is used when omitted.
    :param encrypted: optional AIP encrypted boolean (defaults to `False`).
    :param printfn: optional print function.
    :return: 0 if succeeded, 1 otherwise.
    """
    # A ``[]`` default is a single list object shared by every call (and by
    # every document handed to the indexer), so create a fresh one per call.
    if identifiers is None:
        identifiers = []

    # Stop if the METS file does not exist.
    if not os.path.exists(mets_staging_path):
        error_message = 'METS file does not exist at: ' + mets_staging_path
        logger.error(error_message)
        printfn(error_message, file=sys.stderr)
        return 1
    printfn('AIP UUID: ' + uuid)
    printfn('Indexing AIP ...')
    tree = ElementTree.parse(mets_staging_path)
    _remove_tool_output_from_mets(tree)
    root = tree.getroot()
    # Extract AIC identifier, other specially-indexed information
    aic_identifier = None
    is_part_of = None
    dublincore = root.find(
        'mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore',
        namespaces=ns.NSMAP)
    if dublincore is not None:
        aip_type = dublincore.findtext(
            'dc:type', namespaces=ns.NSMAP) or dublincore.findtext(
                'dcterms:type', namespaces=ns.NSMAP)
        if aip_type == 'Archival Information Collection':
            aic_identifier = dublincore.findtext(
                'dc:identifier', namespaces=ns.NSMAP) or dublincore.findtext(
                    'dcterms:identifier', namespaces=ns.NSMAP)
        is_part_of = dublincore.findtext('dcterms:isPartOf',
                                         namespaces=ns.NSMAP)
    # Convert METS XML to dict
    xml = ElementTree.tostring(root)
    mets_data = _rename_dict_keys_with_child_dicts(
        _normalize_dict_values(xmltodict.parse(xml)))
    # Pull the create time from the METS header, falling back to "now" when
    # the header or its CREATEDATE attribute is missing or unparseable.
    created = time.time()
    mets_hdr = root.find('mets:metsHdr', namespaces=ns.NSMAP)
    # ``find`` returns None when the document has no metsHdr element; guard
    # the attribute lookup so such a METS file does not abort indexing.
    if mets_hdr is not None:
        mets_created_attr = mets_hdr.get('CREATEDATE')
        if mets_created_attr:
            try:
                created = calendar.timegm(
                    time.strptime(mets_created_attr, '%Y-%m-%dT%H:%M:%S'))
            except ValueError:
                printfn('Failed to parse METS CREATEDATE: %s' %
                        (mets_created_attr))
    aip_data = {
        'uuid': uuid,
        'name': name,
        'filePath': aip_stored_path,
        'size': aip_size / (1024 * 1024),
        'mets': mets_data,
        'origin': get_dashboard_uuid(),
        'created': created,
        'AICID': aic_identifier,
        'isPartOf': is_part_of,
        'countAIPsinAIC': aips_in_aic,
        'identifiers': identifiers,
        'transferMetadata': _extract_transfer_metadata(root),
        'encrypted': encrypted
    }
    _wait_for_cluster_yellow_status(client)
    _try_to_index(client, aip_data, 'aips', printfn=printfn)
    printfn('Done.')
    printfn('Indexing AIP files ...')
    files_indexed = _index_aip_files(
        client=client,
        uuid=uuid,
        mets_path=mets_staging_path,
        name=name,
        identifiers=identifiers,
        printfn=printfn,
    )
    printfn('Files indexed: ' + str(files_indexed))
    return 0
Exemplo n.º 8
0
def index_aip_and_files(
    client,
    uuid,
    aip_stored_path,
    mets_staging_path,
    name,
    aip_size,
    aips_in_aic=None,
    identifiers=None,
    encrypted=False,
    printfn=print,
):
    """Index AIP and AIP files with UUID `uuid`.

    :param client: The ElasticSearch client.
    :param uuid: The UUID of the AIP we're indexing.
    :param aip_stored_path: path on disk where the AIP is located.
    :param mets_staging_path: path on disk where the AIP METS file is located.
    :param name: AIP name.
    :param aip_size: AIP size.
    :param aips_in_aic: optional number of AIPs stored in AIC.
    :param identifiers: optional additional identifiers (MODS, Islandora, etc.);
        a new empty list is used when omitted.
    :param encrypted: optional AIP encrypted boolean (defaults to `False`).
    :param printfn: optional print function.
    :return: 0 if succeeded, 1 otherwise.
    """
    # A ``[]`` default is a single list object shared by every call (and by
    # every document handed to the indexer), so create a fresh one per call.
    if identifiers is None:
        identifiers = []

    # Stop if METS file is not at staging path.
    if not os.path.exists(mets_staging_path):
        error_message = "METS file does not exist at: " + mets_staging_path
        logger.error(error_message)
        printfn(error_message, file=sys.stderr)
        return 1
    printfn("AIP UUID: " + uuid)
    printfn("Indexing AIP ...")
    tree = ElementTree.parse(mets_staging_path)
    _remove_tool_output_from_mets(tree)
    root = tree.getroot()
    # Extract AIC identifier, other specially-indexed information
    aic_identifier = None
    is_part_of = None
    dublincore = ns.xml_find_premis(
        root, "mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore")
    if dublincore is not None:
        # Each value may live in either the dc: or the dcterms: namespace.
        aip_type = ns.xml_findtext_premis(dublincore,
                                          "dc:type") or ns.xml_findtext_premis(
                                              dublincore, "dcterms:type")
        if aip_type == "Archival Information Collection":
            aic_identifier = ns.xml_findtext_premis(
                dublincore, "dc:identifier") or ns.xml_findtext_premis(
                    dublincore, "dcterms:identifier")
        is_part_of = ns.xml_findtext_premis(dublincore, "dcterms:isPartOf")

    # Convert METS XML to dict
    xml = ElementTree.tostring(root)
    mets_data = _rename_dict_keys_with_child_dicts(
        _normalize_dict_values(xmltodict.parse(xml)))

    # Pull the create time from the METS header, falling back to "now" when
    # the header or its CREATEDATE attribute is missing or unparseable.
    # Old METS did not use `metsHdr`.
    created = time.time()
    mets_hdr = ns.xml_find_premis(root, "mets:metsHdr")
    if mets_hdr is not None:
        mets_created_attr = mets_hdr.get("CREATEDATE")
        if mets_created_attr:
            try:
                created = calendar.timegm(
                    time.strptime(mets_created_attr, "%Y-%m-%dT%H:%M:%S"))
            except ValueError:
                printfn("Failed to parse METS CREATEDATE: %s" %
                        (mets_created_attr))

    aip_data = {
        "uuid": uuid,
        "name": name,
        "filePath": aip_stored_path,
        "size": aip_size / (1024 * 1024),
        "mets": mets_data,
        "origin": get_dashboard_uuid(),
        "created": created,
        "AICID": aic_identifier,
        "isPartOf": is_part_of,
        "countAIPsinAIC": aips_in_aic,
        "identifiers": identifiers,
        "transferMetadata": _extract_transfer_metadata(root),
        "encrypted": encrypted,
    }
    _wait_for_cluster_yellow_status(client)
    _try_to_index(client, aip_data, "aips", printfn=printfn)
    printfn("Done.")
    printfn("Indexing AIP files ...")
    files_indexed = _index_aip_files(
        client=client,
        uuid=uuid,
        mets_path=mets_staging_path,
        name=name,
        identifiers=identifiers,
        printfn=printfn,
    )
    printfn("Files indexed: " + str(files_indexed))
    return 0