Example #1
def filter_packages(
    package_list,
    statuses=("UPLOADED", "DEL_REQ"),
    package_types=("AIP", "AIC", "transfer", "DIP"),
    pipeline_uuid=None,
    filter_replicas=False,
):
    """Filter packages by status and origin pipeline.

    :param package_list: List of package info returned by Storage
    Service (list).
    :param statuses: Acceptable statuses for filter (tuple). Defaults
    to filtering out deleted packages.
    :param package_types: Acceptable package types for filter (tuple).
    :param pipeline_uuid: Acceptable pipeline UUID for filter (str).
    :param filter_replicas: Option to filter out replicas (bool).

    :returns: Filtered package list.
    """
    if pipeline_uuid is None:
        pipeline_uuid = am.get_dashboard_uuid()
    origin_pipeline = "/api/v2/pipeline/{}/".format(pipeline_uuid)

    if filter_replicas:
        return [
            package
            for package in package_list
            if package["status"] in statuses
            and package["package_type"] in package_types
            and package["origin_pipeline"] == origin_pipeline
            and package["replicated_package"] is None
        ]
    return [
        package
        for package in package_list
        if package["status"] in statuses
        and package["package_type"] in package_types
        and package["origin_pipeline"] == origin_pipeline
    ]
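
A quick usage sketch (not from the original source): the package dicts and the pipeline UUID below are hypothetical stand-ins for Storage Service data, and passing pipeline_uuid explicitly avoids the am.get_dashboard_uuid() lookup:

packages = [
    {"status": "UPLOADED", "package_type": "AIP",
     "origin_pipeline": "/api/v2/pipeline/1b60c346-85a0-4a3c-a88b-0c1ab3e08f47/",
     "replicated_package": None},
    {"status": "DELETED", "package_type": "AIP",
     "origin_pipeline": "/api/v2/pipeline/1b60c346-85a0-4a3c-a88b-0c1ab3e08f47/",
     "replicated_package": None},
]
# Only the first package survives: "DELETED" is not an accepted status.
aips = filter_packages(packages, pipeline_uuid="1b60c346-85a0-4a3c-a88b-0c1ab3e08f47")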
Example #2
def add_arguments(self, parser):
    """Entry point to add custom arguments."""
    parser.add_argument(
        "-d",
        "--delete",
        action="store_true",
        help="Delete AIP-related Elasticsearch data for AIPs with matching"
        " UUIDs before indexing AIP data",
    )
    parser.add_argument(
        "--delete-all",
        action="store_true",
        help="Delete all AIP information in the index before starting.",
    )
    parser.add_argument(
        "-u",
        "--uuid",
        action="store",
        default="",
        help="Specify a single AIP by UUID to process",
    )
    parser.add_argument(
        "--pipeline",
        help="Pipeline UUID to use when filtering packages",
        # Note: this default is evaluated once, when the argument is
        # registered, not each time the command runs.
        default=am.get_dashboard_uuid(),
    )
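
Because default=am.get_dashboard_uuid() runs at argument-registration time, a command that should pick up the dashboard UUID at execution time can defer the lookup instead. A minimal sketch of that pattern, with a hypothetical get_dashboard_uuid() standing in for the real am helper:

import argparse

def get_dashboard_uuid():
    # Hypothetical stand-in for am.get_dashboard_uuid().
    return "00000000-0000-0000-0000-000000000000"

parser = argparse.ArgumentParser()
# Leave the default unset; resolve it when the command actually runs.
parser.add_argument("--pipeline", default=None,
                    help="Pipeline UUID to use when filtering packages")
args = parser.parse_args([])
pipeline_uuid = args.pipeline or get_dashboard_uuid()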
Example #3
def index_aip(client,
              uuid,
              name,
              filePath,
              pathToMETS,
              size=None,
              aips_in_aic=None,
              identifiers=None):
    # Avoid a shared mutable default argument.
    identifiers = identifiers if identifiers is not None else []
    tree = ElementTree.parse(pathToMETS)

    # TODO add a conditional to toggle this
    remove_tool_output_from_mets(tree)

    root = tree.getroot()
    # Extract AIC identifier, other specially-indexed information
    aic_identifier = None
    is_part_of = None
    dublincore = root.find(
        'mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore',
        namespaces=ns.NSMAP)
    if dublincore is not None:
        aip_type = dublincore.findtext(
            'dc:type', namespaces=ns.NSMAP) or dublincore.findtext(
                'dcterms:type', namespaces=ns.NSMAP)
        if aip_type == "Archival Information Collection":
            aic_identifier = dublincore.findtext(
                'dc:identifier', namespaces=ns.NSMAP) or dublincore.findtext(
                    'dcterms:identifier', namespaces=ns.NSMAP)
        is_part_of = dublincore.findtext('dcterms:isPartOf',
                                         namespaces=ns.NSMAP)

    # convert METS XML to dict
    xml = ElementTree.tostring(root)
    mets_data = rename_dict_keys_with_child_dicts(
        normalize_dict_values(xmltodict.parse(xml)))

    aipData = {
        'uuid': uuid,
        'name': name,
        'filePath': filePath,
        'size': (size or os.path.getsize(filePath)) / 1024 / 1024,
        'mets': mets_data,
        'origin': get_dashboard_uuid(),
        'created': os.path.getmtime(pathToMETS),
        'AICID': aic_identifier,
        'isPartOf': is_part_of,
        'countAIPsinAIC': aips_in_aic,
        'identifiers': identifiers,
        'transferMetadata': _extract_transfer_metadata(root),
    }
    wait_for_cluster_yellow_status(client)
    try_to_index(client, aipData, 'aips', 'aip')
Example #4
def index_aip(client, uuid, name, filePath, pathToMETS, size=None,
              aips_in_aic=None, identifiers=None, encrypted=False):
    # Avoid a shared mutable default argument.
    identifiers = identifiers if identifiers is not None else []

    tree = ElementTree.parse(pathToMETS)

    # TODO add a conditional to toggle this
    remove_tool_output_from_mets(tree)

    root = tree.getroot()
    # Extract AIC identifier, other specially-indexed information
    aic_identifier = None
    is_part_of = None
    dublincore = root.find('mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore', namespaces=ns.NSMAP)
    if dublincore is not None:
        aip_type = dublincore.findtext('dc:type', namespaces=ns.NSMAP) or dublincore.findtext('dcterms:type', namespaces=ns.NSMAP)
        if aip_type == "Archival Information Collection":
            aic_identifier = dublincore.findtext('dc:identifier', namespaces=ns.NSMAP) or dublincore.findtext('dcterms:identifier', namespaces=ns.NSMAP)
        is_part_of = dublincore.findtext('dcterms:isPartOf', namespaces=ns.NSMAP)

    # convert METS XML to dict
    xml = ElementTree.tostring(root)
    mets_data = rename_dict_keys_with_child_dicts(normalize_dict_values(xmltodict.parse(xml)))

    # Pull the create time from the METS header
    mets_hdr = root.find("mets:metsHdr", namespaces=ns.NSMAP)
    mets_created_attr = mets_hdr.get('CREATEDATE')

    created = time.time()

    if mets_created_attr:
        try:
            created = calendar.timegm(time.strptime(mets_created_attr, '%Y-%m-%dT%H:%M:%S'))
        except ValueError:
            print("Failed to parse METS CREATEDATE: %s" % (mets_created_attr))

    aipData = {
        'uuid': uuid,
        'name': name,
        'filePath': filePath,
        'size': (size or os.path.getsize(filePath)) / 1024 / 1024,
        'mets': mets_data,
        'origin': get_dashboard_uuid(),
        'created': created,
        'AICID': aic_identifier,
        'isPartOf': is_part_of,
        'countAIPsinAIC': aips_in_aic,
        'identifiers': identifiers,
        'transferMetadata': _extract_transfer_metadata(root),
        'encrypted': encrypted
    }
    wait_for_cluster_yellow_status(client)
    try_to_index(client, aipData, 'aips', 'aip')
Example #5
def add_arguments(self, parser):
    """Entry point to add custom arguments."""
    parser.add_argument("--transfer-backlog-dir",
                        default=self.DEFAULT_TRANSFER_BACKLOG_DIR)
    parser.add_argument("--no-prompt", action="store_true")
    parser.add_argument(
        "--from-storage-service",
        help="Import packages from Storage Service",
        action="store_true",
    )
    parser.add_argument(
        "--pipeline",
        help="Pipeline UUID to use when filtering packages from Storage Service",
        default=am.get_dashboard_uuid(),
    )
Example #6
def write_mets(mets_path, transfer_dir_path, base_path_placeholder,
               transfer_uuid):
    """
    Writes a METS XML file to disk, containing all the data we can find.

    Args:
        mets_path: Output path for the METS XML file
        transfer_dir_path: Location of the files on disk
        base_path_placeholder: The placeholder string for the base path, e.g. 'transferDirectory'
        transfer_uuid: The UUID for the transfer
    """
    transfer_dir_path = os.path.expanduser(transfer_dir_path)
    transfer_dir_path = os.path.normpath(transfer_dir_path)

    db_base_path = r"%{}%".format(base_path_placeholder)

    mets = metsrw.METSDocument()
    mets.objid = str(transfer_uuid)

    dashboard_uuid = get_dashboard_uuid()
    if dashboard_uuid:
        agent = metsrw.Agent(
            "CREATOR",
            type="SOFTWARE",
            name=str(dashboard_uuid),
            notes=["Archivematica dashboard UUID"],
        )
        mets.agents.append(agent)

    try:
        transfer = Transfer.objects.get(uuid=transfer_uuid)
    except Transfer.DoesNotExist:
        logger.info("No record in database for transfer: %s", transfer_uuid)
        raise

    if transfer.accessionid:
        alt_record_id = metsrw.AltRecordID(transfer.accessionid,
                                           type="Accession ID")
        mets.alternate_ids.append(alt_record_id)

    fsentry_tree = FSEntriesTree(transfer_dir_path, db_base_path, transfer)
    fsentry_tree.scan()

    mets.append_file(fsentry_tree.root_node)
    mets.write(mets_path, pretty_print=True)
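
A hypothetical invocation, assuming Django is configured and the transfer row exists in the database (the paths and UUID here are placeholders):

write_mets(
    mets_path="/tmp/METS.4bf16631-8f92-4e28-8b33-7c7d3d2bde6c.xml",
    transfer_dir_path="~/transfers/my-transfer/",
    base_path_placeholder="transferDirectory",
    transfer_uuid="4bf16631-8f92-4e28-8b33-7c7d3d2bde6c",
)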
Example #7
def createMetsHdr(sip_uuid):
    header = etree.Element(ns.metsBNS + "metsHdr",
                           CREATEDATE=getUTCDate().strftime("%Y-%m-%dT%H:%M:%S"))
    agent = etree.SubElement(header, ns.metsBNS + "agent",
                             ROLE="CREATOR",
                             TYPE="OTHER",
                             OTHERTYPE="SOFTWARE")
    name = etree.SubElement(agent, ns.metsBNS + "name")
    name.text = get_dashboard_uuid()
    note = etree.SubElement(agent, ns.metsBNS + "note")
    note.text = "Archivematica dashboard UUID"

    accession_number = getAccessionNumberFromTransfer(sip_uuid)
    if accession_number:
        alt_id = etree.SubElement(header, ns.metsBNS + "altRecordID",
                                  TYPE="Accession number")
        alt_id.text = accession_number

    return header
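
To inspect the result, the header element can be serialized with lxml; a sketch assuming etree above is lxml.etree, that the helpers (getUTCDate, get_dashboard_uuid, getAccessionNumberFromTransfer) are importable, and using a placeholder SIP UUID:

from lxml import etree

header = createMetsHdr("4bf16631-8f92-4e28-8b33-7c7d3d2bde6c")
print(etree.tostring(header, pretty_print=True).decode("utf8"))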
Example #8
def index_transfer_files(client,
                         uuid,
                         pathToTransfer,
                         index,
                         type_,
                         status=''):
    """
    Indexes files in the Transfer with UUID `uuid` at path `pathToTransfer`.

    Returns the number of files indexed.

    client: ElasticSearch client
    uuid: UUID of the Transfer in the DB
    pathToTransfer: path on disk, including the transfer directory and a
        trailing / but not including objects/
    index, type_: index and type in ElasticSearch
    """
    files_indexed = 0
    ingest_date = str(datetime.datetime.today())[0:10]

    # Some files should not be indexed
    # This should match the basename of the file
    ignore_files = [
        'processingMCP.xml',
    ]

    # Get accessionId and name from Transfers table using UUID
    try:
        transfer = Transfer.objects.get(uuid=uuid)
        accession_id = transfer.accessionid
        transfer_name = transfer.currentlocation.split('/')[-2]
    except Transfer.DoesNotExist:
        accession_id = transfer_name = ''

    # Get dashboard UUID
    dashboard_uuid = get_dashboard_uuid()

    for filepath in list_files_in_dir(pathToTransfer):
        if os.path.isfile(filepath):
            # Get file UUID
            file_uuid = ''
            relative_path = filepath.replace(pathToTransfer,
                                             '%transferDirectory%')
            try:
                f = File.objects.get(currentlocation=relative_path,
                                     transfer_id=uuid)
                file_uuid = f.uuid
                formats = _get_file_formats(f)
                bulk_extractor_reports = _list_bulk_extractor_reports(
                    pathToTransfer, file_uuid)
                modification_date = f.modificationtime.strftime('%Y-%m-%d')
            except File.DoesNotExist:
                file_uuid = ''
                formats = []
                bulk_extractor_reports = []
                modification_date = ''

            # Get file path info
            relative_path = relative_path.replace('%transferDirectory%',
                                                  transfer_name + '/')
            file_extension = os.path.splitext(filepath)[1][1:].lower()
            filename = os.path.basename(filepath)
            # Size in megabytes
            size = os.path.getsize(filepath) / (1024 * 1024)
            create_time = os.stat(filepath).st_ctime

            if filename not in ignore_files:
                print('Indexing {} (UUID: {})'.format(relative_path,
                                                      file_uuid))

                # TODO Index Backlog Location UUID?
                indexData = {
                    'filename': filename,
                    'relative_path': relative_path,
                    'fileuuid': file_uuid,
                    'sipuuid': uuid,
                    'accessionid': accession_id,
                    'status': status,
                    'origin': dashboard_uuid,
                    'ingestdate': ingest_date,
                    'created': create_time,
                    'modification_date': modification_date,
                    'size': size,
                    'tags': [],
                    'file_extension': file_extension,
                    'bulk_extractor_reports': bulk_extractor_reports,
                    'format': formats,
                }

                wait_for_cluster_yellow_status(client)
                try_to_index(client, indexData, index, type_)

                files_indexed = files_indexed + 1
            else:
                print('Skipping indexing {}'.format(relative_path))

    if files_indexed > 0:
        client.indices.refresh()

    return files_indexed
Example #9
def index_mets_file_metadata(client,
                             uuid,
                             metsFilePath,
                             index,
                             type_,
                             sipName,
                             identifiers=None):
    # Avoid a shared mutable default argument.
    identifiers = identifiers if identifiers is not None else []
    # parse XML
    tree = ElementTree.parse(metsFilePath)
    root = tree.getroot()

    # TODO add a conditional to toggle this
    remove_tool_output_from_mets(tree)

    # get SIP-wide dmdSec
    dmdSec = root.findall("mets:dmdSec/mets:mdWrap/mets:xmlData",
                          namespaces=ns.NSMAP)
    dmdSecData = {}
    for item in dmdSec:
        xml = ElementTree.tostring(item)
        dmdSecData = xmltodict.parse(xml)

    # Extract isPartOf (for AIPs) or identifier (for AICs) from DublinCore
    dublincore = root.find(
        'mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore',
        namespaces=ns.NSMAP)
    aic_identifier = None
    is_part_of = None
    if dublincore is not None:
        aip_type = dublincore.findtext(
            'dc:type', namespaces=ns.NSMAP) or dublincore.findtext(
                'dcterms:type', namespaces=ns.NSMAP)
        if aip_type == "Archival Information Collection":
            aic_identifier = dublincore.findtext(
                'dc:identifier', namespaces=ns.NSMAP) or dublincore.findtext(
                    'dcterms:identifier', namespaces=ns.NSMAP)
        elif aip_type == "Archival Information Package":
            is_part_of = dublincore.findtext('dcterms:isPartOf',
                                             namespaces=ns.NSMAP)

    # establish structure to be indexed for each file item
    fileData = {
        'archivematicaVersion': version.get_version(),
        'AIPUUID': uuid,
        'sipName': sipName,
        'FILEUUID': '',
        'indexedAt': time.time(),
        'filePath': '',
        'fileExtension': '',
        'isPartOf': is_part_of,
        'AICID': aic_identifier,
        'METS': {
            'dmdSec':
            rename_dict_keys_with_child_dicts(
                normalize_dict_values(dmdSecData)),
            'amdSec': {},
        },
        'origin': get_dashboard_uuid(),
        'identifiers': identifiers,
        'transferMetadata': _extract_transfer_metadata(root),
    }

    # Index all files in a fileGrp with USE='original' or USE='metadata'
    original_files = root.findall(
        "mets:fileSec/mets:fileGrp[@USE='original']/mets:file",
        namespaces=ns.NSMAP)
    metadata_files = root.findall(
        "mets:fileSec/mets:fileGrp[@USE='metadata']/mets:file",
        namespaces=ns.NSMAP)
    files = original_files + metadata_files

    # Index AIC METS file if it exists
    for file_ in files:
        indexData = fileData.copy()  # Shallow copy: duplicates the dict, not its nested values

        # Get file UUID. If an ADMID exists, look in the amdSec for the UUID,
        # otherwise parse it out of the file ID.
        # 'Original' files have ADMIDs, 'Metadata' files don't
        admID = file_.attrib.get('ADMID', None)
        if admID is None:
            # Parse UUID from file ID
            fileUUID = None
            uuid_regex = r'\w{8}-?\w{4}-?\w{4}-?\w{4}-?\w{12}'
            uuids = re.findall(uuid_regex, file_.attrib['ID'])
            # Multiple UUIDs may be returned - if they are all identical, use that
            # UUID, otherwise use None.
            # To determine all UUIDs are identical, use the size of the set
            if len(set(uuids)) == 1:
                fileUUID = uuids[0]
        else:
            amdSecInfo = root.find("mets:amdSec[@ID='{}']".format(admID),
                                   namespaces=ns.NSMAP)
            fileUUID = amdSecInfo.findtext(
                "mets:techMD/mets:mdWrap/mets:xmlData/premis:object/premis:objectIdentifier/premis:objectIdentifierValue",
                namespaces=ns.NSMAP)

            # Index amdSec information
            xml = ElementTree.tostring(amdSecInfo)
            indexData['METS']['amdSec'] = rename_dict_keys_with_child_dicts(
                normalize_dict_values(xmltodict.parse(xml)))

        indexData['FILEUUID'] = fileUUID

        # Get file path from FLocat and extension
        filePath = file_.find(
            'mets:FLocat',
            namespaces=ns.NSMAP).attrib['{http://www.w3.org/1999/xlink}href']
        indexData['filePath'] = filePath
        _, fileExtension = os.path.splitext(filePath)
        if fileExtension:
            indexData['fileExtension'] = fileExtension[1:].lower()

        # index data
        wait_for_cluster_yellow_status(client)
        try_to_index(client, indexData, index, type_)

        # Reset fileData['METS']['amdSec'], since it is updated in the loop
        # above. See http://stackoverflow.com/a/3975388 for explanation
        fileData['METS']['amdSec'] = {}

    print('Indexed AIP files and corresponding METS XML.')

    return len(files)
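
The fallback that parses a UUID out of the file ID relies on every UUID embedded in the ID being identical; a self-contained demonstration of that set-based check (the file ID below is made up):

import re

uuid_regex = r'\w{8}-?\w{4}-?\w{4}-?\w{4}-?\w{12}'
file_id = ('file-2640fd26-d25b-44d5-8bbb-0f08d1f5e3ba'
           '-2640fd26-d25b-44d5-8bbb-0f08d1f5e3ba')
uuids = re.findall(uuid_regex, file_id)
# Both matches are the same UUID, so the set collapses to one element.
file_uuid = uuids[0] if len(set(uuids)) == 1 else None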
Example #10
def _index_aip_files(client,
                     uuid,
                     mets_path,
                     name,
                     identifiers=None,
                     printfn=print):
    """Index AIP files from AIP with UUID `uuid` and METS at path `mets_path`.

    :param client: The ElasticSearch client.
    :param uuid: The UUID of the AIP we're indexing.
    :param mets_path: path on disk where the AIP's METS file is located.
    :param name: AIP name.
    :param identifiers: optional additional identifiers (MODS, Islandora, etc.).
    :param printfn: optional print function.
    :return: number of files indexed.
    """
    # Avoid a shared mutable default argument.
    identifiers = identifiers if identifiers is not None else []
    # Parse XML
    tree = ElementTree.parse(mets_path)
    root = tree.getroot()

    # TODO: Add a conditional to toggle this
    _remove_tool_output_from_mets(tree)

    # Extract isPartOf (for AIPs) or identifier (for AICs) from DublinCore
    dublincore = root.find(
        'mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore',
        namespaces=ns.NSMAP)
    aic_identifier = None
    is_part_of = None
    if dublincore is not None:
        aip_type = dublincore.findtext(
            'dc:type', namespaces=ns.NSMAP) or dublincore.findtext(
                'dcterms:type', namespaces=ns.NSMAP)
        if aip_type == "Archival Information Collection":
            aic_identifier = dublincore.findtext(
                'dc:identifier', namespaces=ns.NSMAP) or dublincore.findtext(
                    'dcterms:identifier', namespaces=ns.NSMAP)
        elif aip_type == "Archival Information Package":
            is_part_of = dublincore.findtext('dcterms:isPartOf',
                                             namespaces=ns.NSMAP)

    # Establish structure to be indexed for each file item
    fileData = {
        'archivematicaVersion': version.get_version(),
        'AIPUUID': uuid,
        'sipName': name,
        'FILEUUID': '',
        'indexedAt': time.time(),
        'filePath': '',
        'fileExtension': '',
        'isPartOf': is_part_of,
        'AICID': aic_identifier,
        'METS': {
            'dmdSec': {},
            'amdSec': {},
        },
        'origin': get_dashboard_uuid(),
        'identifiers': identifiers,
        'transferMetadata': _extract_transfer_metadata(root),
    }

    # Index all files in a fileGrp with USE='original' or USE='metadata'
    original_files = root.findall(
        "mets:fileSec/mets:fileGrp[@USE='original']/mets:file",
        namespaces=ns.NSMAP)
    metadata_files = root.findall(
        "mets:fileSec/mets:fileGrp[@USE='metadata']/mets:file",
        namespaces=ns.NSMAP)
    files = original_files + metadata_files

    # Index AIC METS file if it exists
    for file_ in files:
        indexData = fileData.copy()  # Shallow copy: duplicates the dict, not its nested values

        # Get file UUID. If an ADMID exists, look in the amdSec for the UUID,
        # otherwise parse it out of the file ID.
        # 'Original' files have ADMIDs, 'Metadata' files don't
        admID = file_.attrib.get('ADMID', None)
        if admID is None:
            # Parse UUID from file ID
            fileUUID = None
            uuid_regex = r'\w{8}-?\w{4}-?\w{4}-?\w{4}-?\w{12}'
            uuids = re.findall(uuid_regex, file_.attrib['ID'])
            # Multiple UUIDs may be returned - if they are all identical, use that
            # UUID, otherwise use None.
            # To determine all UUIDs are identical, use the size of the set
            if len(set(uuids)) == 1:
                fileUUID = uuids[0]
        else:
            amdSecInfo = root.find("mets:amdSec[@ID='{}']".format(admID),
                                   namespaces=ns.NSMAP)
            fileUUID = amdSecInfo.findtext(
                "mets:techMD/mets:mdWrap/mets:xmlData/premis:object/premis:objectIdentifier/premis:objectIdentifierValue",
                namespaces=ns.NSMAP)

            # Index amdSec information
            xml = ElementTree.tostring(amdSecInfo)
            indexData['METS']['amdSec'] = _rename_dict_keys_with_child_dicts(
                _normalize_dict_values(xmltodict.parse(xml)))

        # Get the parent division for the file pointer
        # by searching the physical structural map section (structMap)
        file_id = file_.attrib.get('ID', None)
        file_pointer_division = root.find(
            "mets:structMap[@TYPE='physical']//mets:fptr[@FILEID='{}']/..".
            format(file_id),
            namespaces=ns.NSMAP)
        if file_pointer_division is not None:
            # If the parent division has a DMDID attribute then index
            # its data from the descriptive metadata section (dmdSec)
            dmd_section_id = file_pointer_division.attrib.get('DMDID', None)
            if dmd_section_id is not None:
                # dmd_section_id can contain one id (e.g., "dmdSec_2")
                # or more than one (e.g., "dmdSec_2 dmdSec_3",
                # when a file has both DC and non-DC metadata).
                # Attempt to index only the DC dmdSec if available
                for dmd_section_id_item in dmd_section_id.split():
                    dmd_section_info = root.find(
                        "mets:dmdSec[@ID='{}']/mets:mdWrap[@MDTYPE='DC']/mets:xmlData"
                        .format(dmd_section_id_item),
                        namespaces=ns.NSMAP)
                    if dmd_section_info is not None:
                        xml = ElementTree.tostring(dmd_section_info)
                        data = _rename_dict_keys_with_child_dicts(
                            _normalize_dict_values(xmltodict.parse(xml)))
                        indexData["METS"]["dmdSec"] = data
                        break

        indexData['FILEUUID'] = fileUUID

        # Get file path from FLocat and extension
        filePath = file_.find(
            'mets:FLocat',
            namespaces=ns.NSMAP).attrib['{http://www.w3.org/1999/xlink}href']
        indexData['filePath'] = filePath
        _, fileExtension = os.path.splitext(filePath)
        if fileExtension:
            indexData['fileExtension'] = fileExtension[1:].lower()

        # Index data
        _wait_for_cluster_yellow_status(client)
        _try_to_index(client, indexData, 'aipfiles', printfn=printfn)

        # Reset fileData['METS']['amdSec'] and fileData['METS']['dmdSec'],
        # since they are updated in the loop above.
        # See http://stackoverflow.com/a/3975388 for explanation
        fileData['METS']['amdSec'] = {}
        fileData['METS']['dmdSec'] = {}

    return len(files)
Example #11
def index_aip_and_files(client,
                        uuid,
                        aip_stored_path,
                        mets_staging_path,
                        name,
                        aip_size,
                        aips_in_aic=None,
                        identifiers=None,
                        encrypted=False,
                        printfn=print):
    """Index AIP and AIP files with UUID `uuid` at path `path`.

    :param client: The ElasticSearch client.
    :param uuid: The UUID of the AIP we're indexing.
    :param aip_stored_path: path on disk where the AIP is located.
    :param mets_staging_path: path on disk where the AIP METS file is located.
    :param name: AIP name.
    :param aip_size: AIP size.
    :param aips_in_aic: optional number of AIPs stored in AIC.
    :param identifiers: optional additional identifiers (MODS, Islandora, etc.).
    :param encrypted: optional AIP encrypted boolean (defaults to `False`).
    :param printfn: optional print function.
    :return: 0 if it succeeded, 1 otherwise.
    """
    identifiers = identifiers if identifiers is not None else []
    # Stop if the METS file does not exist.
    error_message = None
    if not os.path.exists(mets_staging_path):
        error_message = 'METS file does not exist at: ' + mets_staging_path
    if error_message:
        logger.error(error_message)
        printfn(error_message, file=sys.stderr)
        return 1
    printfn('AIP UUID: ' + uuid)
    printfn('Indexing AIP ...')
    tree = ElementTree.parse(mets_staging_path)
    _remove_tool_output_from_mets(tree)
    root = tree.getroot()
    # Extract AIC identifier, other specially-indexed information
    aic_identifier = None
    is_part_of = None
    dublincore = root.find(
        'mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore',
        namespaces=ns.NSMAP)
    if dublincore is not None:
        aip_type = dublincore.findtext(
            'dc:type', namespaces=ns.NSMAP) or dublincore.findtext(
                'dcterms:type', namespaces=ns.NSMAP)
        if aip_type == 'Archival Information Collection':
            aic_identifier = dublincore.findtext(
                'dc:identifier', namespaces=ns.NSMAP) or dublincore.findtext(
                    'dcterms:identifier', namespaces=ns.NSMAP)
        is_part_of = dublincore.findtext('dcterms:isPartOf',
                                         namespaces=ns.NSMAP)
    # Convert METS XML to dict
    xml = ElementTree.tostring(root)
    mets_data = _rename_dict_keys_with_child_dicts(
        _normalize_dict_values(xmltodict.parse(xml)))
    # Pull the create time from the METS header
    mets_hdr = root.find('mets:metsHdr', namespaces=ns.NSMAP)
    mets_created_attr = mets_hdr.get('CREATEDATE')
    created = time.time()
    if mets_created_attr:
        try:
            created = calendar.timegm(
                time.strptime(mets_created_attr, '%Y-%m-%dT%H:%M:%S'))
        except ValueError:
            printfn('Failed to parse METS CREATEDATE: %s' %
                    (mets_created_attr))
    aip_data = {
        'uuid': uuid,
        'name': name,
        'filePath': aip_stored_path,
        'size': aip_size / (1024 * 1024),
        'mets': mets_data,
        'origin': get_dashboard_uuid(),
        'created': created,
        'AICID': aic_identifier,
        'isPartOf': is_part_of,
        'countAIPsinAIC': aips_in_aic,
        'identifiers': identifiers,
        'transferMetadata': _extract_transfer_metadata(root),
        'encrypted': encrypted
    }
    _wait_for_cluster_yellow_status(client)
    _try_to_index(client, aip_data, 'aips', printfn=printfn)
    printfn('Done.')
    printfn('Indexing AIP files ...')
    files_indexed = _index_aip_files(
        client=client,
        uuid=uuid,
        mets_path=mets_staging_path,
        name=name,
        identifiers=identifiers,
        printfn=printfn,
    )
    printfn('Files indexed: ' + str(files_indexed))
    return 0
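
A hypothetical call, assuming an elasticsearch-py client (constructor details vary by client version) and placeholder paths:

from elasticsearch import Elasticsearch

client = Elasticsearch("http://localhost:9200")
result = index_aip_and_files(
    client=client,
    uuid="4bf16631-8f92-4e28-8b33-7c7d3d2bde6c",
    aip_stored_path="/var/archivematica/aips/my_aip.7z",
    mets_staging_path="/tmp/METS.4bf16631-8f92-4e28-8b33-7c7d3d2bde6c.xml",
    name="my_aip",
    aip_size=1048576,
)
# result is 0 on success, 1 if the METS file was missing.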
Example #12
def _index_transfer_files(client,
                          uuid,
                          path,
                          ingest_date,
                          status="",
                          printfn=print):
    """Indexes files in the Transfer with UUID `uuid` at path `path`.

    :param client: ElasticSearch client.
    :param uuid: UUID of the Transfer in the DB.
    :param path: path on disk, including the transfer directory and a
                 trailing / but not including objects/.
    :param ingest_date: ingest date string (YYYY-MM-DD).
    :param status: optional Transfer status.
    :param printfn: optional print function.
    :return: number of files indexed.
    """
    files_indexed = 0

    # Some files should not be indexed.
    # This should match the basename of the file.
    ignore_files = ["processingMCP.xml"]

    # Get accessionId and name from Transfers table using UUID
    try:
        transfer = Transfer.objects.get(uuid=uuid)
        accession_id = transfer.accessionid
        transfer_name = transfer.currentlocation.split("/")[-2]
    except Transfer.DoesNotExist:
        accession_id = transfer_name = ""

    # Get dashboard UUID
    dashboard_uuid = get_dashboard_uuid()

    for filepath in _list_files_in_dir(path):
        if os.path.isfile(filepath):
            # We need to account for the possibility of dealing with a BagIt
            # transfer package - the new default in Archivematica.
            # The BagIt is created when the package is sent to backlog, hence
            # the locations in the database do not reflect the BagIt paths.
            # Strip the "data/" prefix when looking up the file entry.
            # Note: str.lstrip() removes a set of characters, not a prefix,
            # so it would also eat leading letters of the filename.
            relative_db_path = os.path.relpath(filepath, path)
            if relative_db_path.startswith("data/"):
                relative_db_path = relative_db_path[len("data/"):]
            currentlocation = "%transferDirectory%" + relative_db_path
            try:
                f = File.objects.get(currentlocation=currentlocation,
                                     transfer_id=uuid)
                file_uuid = f.uuid
                formats = _get_file_formats(f)
                bulk_extractor_reports = _list_bulk_extractor_reports(
                    path, file_uuid)
                # Default first so the variable is always bound before it is
                # indexed below, even when the modification time is unset.
                modification_date = ""
                if f.modificationtime is not None:
                    modification_date = f.modificationtime.strftime("%Y-%m-%d")
            except File.DoesNotExist:
                file_uuid, modification_date = "", ""
                formats = []
                bulk_extractor_reports = []

            # Get file path info
            relative_path = filepath.replace(path, transfer_name + "/")
            file_extension = os.path.splitext(filepath)[1][1:].lower()
            filename = os.path.basename(filepath)
            # Size in megabytes
            size = os.path.getsize(filepath) / (1024 * 1024)
            create_time = os.stat(filepath).st_ctime

            if filename not in ignore_files:
                printfn("Indexing {} (UUID: {})".format(
                    relative_path, file_uuid))

                # TODO: Index Backlog Location UUID?
                indexData = {
                    "filename": filename,
                    "relative_path": relative_path,
                    "fileuuid": file_uuid,
                    "sipuuid": uuid,
                    "accessionid": accession_id,
                    "status": status,
                    "origin": dashboard_uuid,
                    "ingestdate": ingest_date,
                    "created": create_time,
                    "modification_date": modification_date,
                    "size": size,
                    "tags": [],
                    "file_extension": file_extension,
                    "bulk_extractor_reports": bulk_extractor_reports,
                    "format": formats,
                }

                _wait_for_cluster_yellow_status(client)
                _try_to_index(client,
                              indexData,
                              "transferfiles",
                              printfn=printfn)

                files_indexed = files_indexed + 1
            else:
                printfn("Skipping indexing {}".format(relative_path))

    return files_indexed
Example #13
def index_aip_and_files(
    client,
    uuid,
    aip_stored_path,
    mets_staging_path,
    name,
    aip_size,
    aips_in_aic=None,
    identifiers=None,
    encrypted=False,
    printfn=print,
):
    """Index AIP and AIP files with UUID `uuid` at path `path`.

    :param client: The ElasticSearch client.
    :param uuid: The UUID of the AIP we're indexing.
    :param aip_stored_path: path on disk where the AIP is located.
    :param mets_staging_path: path on disk where the AIP METS file is located.
    :param name: AIP name.
    :param aip_size: AIP size.
    :param aips_in_aic: optional number of AIPs stored in AIC.
    :param identifiers: optional additional identifiers (MODS, Islandora, etc.).
    :param encrypted: optional AIP encrypted boolean (defaults to `False`).
    :param printfn: optional print function.
    :return: 0 if it succeeded, 1 otherwise.
    """
    identifiers = identifiers if identifiers is not None else []
    # Stop if METS file is not at staging path.
    error_message = None
    if not os.path.exists(mets_staging_path):
        error_message = "METS file does not exist at: " + mets_staging_path
    if error_message:
        logger.error(error_message)
        printfn(error_message, file=sys.stderr)
        return 1
    printfn("AIP UUID: " + uuid)
    printfn("Indexing AIP ...")
    tree = ElementTree.parse(mets_staging_path)
    _remove_tool_output_from_mets(tree)
    root = tree.getroot()
    # Extract AIC identifier, other specially-indexed information
    aic_identifier = None
    is_part_of = None
    dublincore = ns.xml_find_premis(
        root, "mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore")
    if dublincore is not None:
        aip_type = ns.xml_findtext_premis(dublincore,
                                          "dc:type") or ns.xml_findtext_premis(
                                              dublincore, "dcterms:type")
        if aip_type == "Archival Information Collection":
            aic_identifier = ns.xml_findtext_premis(
                dublincore, "dc:identifier") or ns.xml_findtext_premis(
                    dublincore, "dcterms:identifier")
        is_part_of = ns.xml_findtext_premis(dublincore, "dcterms:isPartOf")

    # Convert METS XML to dict
    xml = ElementTree.tostring(root)
    mets_data = _rename_dict_keys_with_child_dicts(
        _normalize_dict_values(xmltodict.parse(xml)))

    # Pull the create time from the METS header.
    # Old METS did not use `metsHdr`.
    created = time.time()
    mets_hdr = ns.xml_find_premis(root, "mets:metsHdr")
    if mets_hdr is not None:
        mets_created_attr = mets_hdr.get("CREATEDATE")
        if mets_created_attr:
            try:
                created = calendar.timegm(
                    time.strptime(mets_created_attr, "%Y-%m-%dT%H:%M:%S"))
            except ValueError:
                printfn("Failed to parse METS CREATEDATE: %s" %
                        (mets_created_attr))

    aip_data = {
        "uuid": uuid,
        "name": name,
        "filePath": aip_stored_path,
        "size": aip_size / (1024 * 1024),
        "mets": mets_data,
        "origin": get_dashboard_uuid(),
        "created": created,
        "AICID": aic_identifier,
        "isPartOf": is_part_of,
        "countAIPsinAIC": aips_in_aic,
        "identifiers": identifiers,
        "transferMetadata": _extract_transfer_metadata(root),
        "encrypted": encrypted,
    }
    _wait_for_cluster_yellow_status(client)
    _try_to_index(client, aip_data, "aips", printfn=printfn)
    printfn("Done.")
    printfn("Indexing AIP files ...")
    files_indexed = _index_aip_files(
        client=client,
        uuid=uuid,
        mets_path=mets_staging_path,
        name=name,
        identifiers=identifiers,
        printfn=printfn,
    )
    printfn("Files indexed: " + str(files_indexed))
    return 0