def populate_data_from_storage_service(self, es_client, pipeline_uuid):
    """Populate indices and/or database from Storage Service.

    :param es_client: Elasticsearch client.
    :param pipeline_uuid: UUID of origin pipeline for transfers to reindex.

    :returns: None
    """
    transfers = storageService.get_file_info(package_type="transfer")
    filtered_transfers = storageService.filter_packages(
        transfers, pipeline_uuid=pipeline_uuid)
    processed = 0
    for transfer in filtered_transfers:
        transfer_uuid = transfer["uuid"]
        temp_backlog_dir = tempfile.mkdtemp()
        try:
            local_package = storageService.download_package(
                transfer_uuid, temp_backlog_dir)
        except storageService.Error:
            self.error(
                "Transfer {} not indexed. Unable to download from Storage Service."
                .format(transfer_uuid))
            continue
        # Transfers are downloaded as .tar files, so we extract files
        # before indexing.
        try:
            extract_package(local_package, temp_backlog_dir)
        except CalledProcessError as err:
            self.error(
                "Transfer {0} not indexed. File extraction from tar failed: {1}."
                .format(transfer_uuid, err))
            continue
        local_package_without_extension = am.package_name_from_path(
            local_package)
        transfer_indexed = False
        for entry in scandir(temp_backlog_dir):
            if entry.is_dir() and entry.name == local_package_without_extension:
                transfer_path = entry.path
                self.info(
                    "Importing transfer {} from temporarily downloaded copy."
                    .format(transfer_uuid))
                _import_self_describing_transfer(
                    self,
                    es_client,
                    self.stdout,
                    Path(transfer_path),
                    transfer_uuid,
                    transfer["size"],
                )
                transfer_indexed = True
        shutil.rmtree(temp_backlog_dir)
        if transfer_indexed:
            processed += 1
        else:
            self.error(
                "Transfer {} not indexed. Unable to find files extracted from tar."
                .format(transfer_uuid))
    self.success("{} transfers indexed!".format(processed))
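# Hypothetical wiring for the method above, sketched as the Django management
# command it appears to belong to. The --pipeline option name and the
# BaseCommand subclass are assumptions for illustration; the
# elasticSearchFunctions / mcpclient_settings helpers follow the pattern used
# elsewhere in this section.
from django.core.management.base import BaseCommand


class Command(BaseCommand):
    def add_arguments(self, parser):
        parser.add_argument("--pipeline", required=True,
                            help="UUID of the origin pipeline")

    def handle(self, *args, **options):
        # Set up the ES client the same way the indexing scripts below do.
        elasticSearchFunctions.setup_reading_from_conf(mcpclient_settings)
        es_client = elasticSearchFunctions.get_client()
        self.populate_data_from_storage_service(es_client, options["pipeline"])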
def dspace_handle_to_archivesspace(job, sip_uuid):
    """Fetch the DSpace handle from the Storage Service and send to ArchivesSpace."""
    # Get association to ArchivesSpace if it exists
    try:
        digital_object = models.ArchivesSpaceDigitalObject.objects.get(sip_id=sip_uuid)
    except models.ArchivesSpaceDigitalObject.DoesNotExist:
        job.pyprint('SIP', sip_uuid, 'not associated with an ArchivesSpace component')
        return NO_ACTION
    job.pyprint('Digital Object', digital_object.remoteid, 'for SIP',
                digital_object.sip_id, 'found')
    logger.info('Digital Object %s for SIP %s found',
                digital_object.remoteid, digital_object.sip_id)

    # Get dspace handle from SS
    file_info = storage_service.get_file_info(uuid=sip_uuid)[0]
    try:
        handle = file_info['misc_attributes']['handle']
    except KeyError:
        job.pyprint('AIP has no DSpace handle stored')
        return NO_ACTION
    job.pyprint('DSpace handle:', handle)
    logger.info('DSpace handle: %s', handle)

    # POST Dspace handle to ArchivesSpace
    # Get ArchivesSpace config
    config = models.DashboardSetting.objects.get_dict('upload-archivesspace_v0.0')
    archivesspace_url = config["base_url"]

    # Log in
    url = archivesspace_url + '/users/' + config['user'] + '/login'
    params = {'password': config['passwd']}
    logger.debug('Log in to ArchivesSpace URL: %s', url)
    response = requests.post(
        url, params=params,
        timeout=mcpclient_settings.AGENTARCHIVES_CLIENT_TIMEOUT)
    logger.debug('Response: %s %s', response, response.content)
    session_id = response.json()['session']
    headers = {'X-ArchivesSpace-Session': session_id}

    # Get Digital Object from ArchivesSpace
    url = archivesspace_url + digital_object.remoteid
    logger.debug('Get Digital Object info URL: %s', url)
    response = requests.get(
        url, headers=headers,
        timeout=mcpclient_settings.AGENTARCHIVES_CLIENT_TIMEOUT)
    logger.debug('Response: %s %s', response, response.content)
    body = response.json()

    # Update
    url = archivesspace_url + digital_object.remoteid
    file_version = {
        "file_uri": handle,
        "use_statement": config['use_statement'],
        "xlink_show_attribute": config['xlink_show'],
        "xlink_actuate_attribute": config['xlink_actuate'],
    }
    body['file_versions'].append(file_version)
    logger.debug('Modified Digital Object: %s', body)
    response = requests.post(
        url, headers=headers, json=body,
        timeout=mcpclient_settings.AGENTARCHIVES_CLIENT_TIMEOUT)
    job.pyprint('Update response:', response, response.content)
    logger.debug('Response: %s %s', response, response.content)
    if response.status_code != 200:
        job.pyprint('Error updating', digital_object.remoteid)
        return ERROR
    return COMPLETED
def dspace_handle_to_archivesspace(sip_uuid):
    """Fetch the DSpace handle from the Storage Service and send to ArchivesSpace."""
    # Get association to ArchivesSpace if it exists
    try:
        digital_object = models.ArchivesSpaceDigitalObject.objects.get(sip_id=sip_uuid)
    except models.ArchivesSpaceDigitalObject.DoesNotExist:
        print('SIP', sip_uuid, 'not associated with an ArchivesSpace component')
        return NO_ACTION
    print('Digital Object', digital_object.remoteid, 'for SIP',
          digital_object.sip_id, 'found')
    logger.info('Digital Object %s for SIP %s found',
                digital_object.remoteid, digital_object.sip_id)

    # Get dspace handle from SS
    file_info = storage_service.get_file_info(uuid=sip_uuid)[0]
    try:
        handle = file_info['misc_attributes']['handle']
    except KeyError:
        print('AIP has no DSpace handle stored')
        return NO_ACTION
    print('DSpace handle:', handle)
    logger.info('DSpace handle: %s', handle)

    # POST Dspace handle to ArchivesSpace
    # Get ArchivesSpace config
    config = admin_models.ArchivesSpaceConfig.objects.all()[0]
    archivesspace_url = 'http://' + config.host + ':' + str(config.port)

    # Log in
    url = archivesspace_url + '/users/' + config.user + '/login'
    params = {'password': config.passwd}
    logger.debug('Log in to ArchivesSpace URL: %s', url)
    response = requests.post(url, params=params)
    logger.debug('Response: %s %s', response, response.content)
    session_id = response.json()['session']
    headers = {'X-ArchivesSpace-Session': session_id}

    # Get Digital Object from ArchivesSpace
    url = archivesspace_url + digital_object.remoteid
    logger.debug('Get Digital Object info URL: %s', url)
    response = requests.get(url, headers=headers)
    logger.debug('Response: %s %s', response, response.content)
    body = response.json()

    # Update
    url = archivesspace_url + digital_object.remoteid
    file_version = {
        "file_uri": handle,
        "use_statement": config.use_statement,
        "xlink_show_attribute": config.xlink_show,
        "xlink_actuate_attribute": config.xlink_actuate,
    }
    body['file_versions'].append(file_version)
    logger.debug('Modified Digital Object: %s', body)
    response = requests.post(url, headers=headers, json=body)
    print('Update response:', response, response.content)
    logger.debug('Response: %s %s', response, response.content)
    if response.status_code != 200:
        print('Error updating', digital_object.remoteid)
        return ERROR
    return COMPLETED
def index_aip(job):
    """Write AIP information to ElasticSearch."""
    sip_uuid = job.args[1]  # %SIPUUID%
    sip_name = job.args[2]  # %SIPName%
    sip_staging_path = job.args[3]  # %SIPDirectory%
    sip_type = job.args[4]  # %SIPType%

    if "aips" not in mcpclient_settings.SEARCH_ENABLED:
        logger.info("Skipping indexing: AIPs indexing is currently disabled.")
        return 0

    elasticSearchFunctions.setup_reading_from_conf(mcpclient_settings)
    client = elasticSearchFunctions.get_client()

    aip_info = storage_service.get_file_info(uuid=sip_uuid)
    job.pyprint("AIP info:", aip_info)
    aip_info = aip_info[0]

    mets_staging_path = os.path.join(sip_staging_path, "METS.{}.xml".format(sip_uuid))

    identifiers = get_identifiers(job, sip_staging_path)

    # If this is an AIC, find the number of AIPs stored in it and index that
    aips_in_aic = None
    if sip_type == "AIC":
        try:
            uv = UnitVariable.objects.get(
                unittype="SIP", unituuid=sip_uuid, variable="AIPsinAIC")
            aips_in_aic = uv.variablevalue
        except UnitVariable.DoesNotExist:
            pass

    # Delete ES index before creating new one if reingesting
    if "REIN" in sip_type:
        job.pyprint(
            "Deleting outdated entry for AIP and AIP files with UUID",
            sip_uuid,
            "from archival storage",
        )
        elasticSearchFunctions.delete_aip(client, sip_uuid)
        elasticSearchFunctions.delete_aip_files(client, sip_uuid)

    job.pyprint("Indexing AIP and AIP files")
    # Even though we treat MODS identifiers as SIP-level, we need to index them
    # here because the archival storage tab actually searches on the
    # aips/aipfile index.
    ret = elasticSearchFunctions.index_aip_and_files(
        client=client,
        uuid=sip_uuid,
        aip_stored_path=aip_info["current_full_path"],
        mets_staging_path=mets_staging_path,
        name=sip_name,
        aip_size=aip_info["size"],
        aips_in_aic=aips_in_aic,
        identifiers=identifiers,
        encrypted=aip_info["encrypted"],
        printfn=job.pyprint,
    )
    if ret == 1:
        job.pyprint("Error indexing AIP and AIP files", file=sys.stderr)
    return ret
def aips_pending_deletion():
    aip_uuids = []
    try:
        aips = storage_service.get_file_info(status='DEL_REQ')
    except Exception as e:
        # TODO this should be messages.warning, but we need 'request' here
        logger.warning(
            "Error retrieving AIPs pending deletion: is the storage server "
            "running? Error: {}".format(e))
    else:
        for aip in aips:
            aip_uuids.append(aip['uuid'])
    return aip_uuids
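# Hypothetical caller for aips_pending_deletion() above: annotate a list of
# AIP dicts (the list-of-dicts shape is assumed for illustration) so a
# template can flag packages awaiting deletion approval.
def annotate_pending_deletion(aips):
    pending = set(aips_pending_deletion())
    for aip in aips:
        aip["pending_deletion"] = aip["uuid"] in pending
    return aips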
def check_and_remove_deleted_transfers(es_client):
    """
    Check the storage service to see if transfers marked in ES as 'pending
    deletion' have been deleted yet. If so, remove the transfer and its files
    from ES. This is a bit of a kludge (that we do elsewhere, e.g. in the
    storage tab), but it appears necessary as the storage service doesn't
    talk directly to ES.

    :return: None
    """
    query = {
        'query': {
            'bool': {
                'must': {
                    'match': {
                        'pending_deletion': True
                    }
                }
            }
        }
    }
    deletion_pending_results = es_client.search(
        body=query, index='transfers', doc_type='transfer',
        fields='uuid,status')
    for hit in deletion_pending_results['hits']['hits']:
        transfer_uuid = hit['fields']['uuid'][0]
        api_results = storage_service.get_file_info(uuid=transfer_uuid)
        try:
            status = api_results[0]['status']
        except IndexError:
            logger.info('Transfer not found in storage service: {}'.format(
                transfer_uuid))
            continue
        if status == 'DELETED':
            elasticSearchFunctions.remove_backlog_transfer_files(
                es_client, transfer_uuid)
            elasticSearchFunctions.remove_backlog_transfer(
                es_client, transfer_uuid)
def check_and_remove_deleted_transfers(es_client):
    """
    Check the storage service to see if transfers marked in ES as 'pending
    deletion' have been deleted yet. If so, remove the transfer and its files
    from ES. This is a bit of a kludge (that we do elsewhere, e.g. in the
    storage tab), but it appears necessary as the storage service doesn't
    talk directly to ES.

    :return: None
    """
    query = {
        "query": {
            "bool": {
                "must": {
                    "match": {
                        "pending_deletion": True
                    }
                }
            }
        }
    }
    deletion_pending_results = es_client.search(
        body=query, index="transfers", _source="uuid,status")
    for hit in deletion_pending_results["hits"]["hits"]:
        transfer_uuid = hit["_source"]["uuid"]
        api_results = storage_service.get_file_info(uuid=transfer_uuid)
        try:
            status = api_results[0]["status"]
        except IndexError:
            logger.info("Transfer not found in storage service: {}".format(
                transfer_uuid))
            continue
        if status == "DELETED":
            elasticSearchFunctions.remove_backlog_transfer_files(
                es_client, transfer_uuid)
            elasticSearchFunctions.remove_backlog_transfer(
                es_client, transfer_uuid)
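# A minimal sketch of a periodic invocation of the cleanup above, reusing the
# get_client() setup pattern from the indexing scripts in this section. The
# scheduling itself (cron, management command, etc.) is left out; this is not
# the original caller.
def run_backlog_cleanup():
    es_client = elasticSearchFunctions.get_client()
    check_and_remove_deleted_transfers(es_client)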
def handle(self, *args, **options):
    # Ignore elasticsearch-py logging events unless they're errors.
    logging.getLogger("elasticsearch").setLevel(logging.ERROR)
    logging.getLogger("archivematica.common").setLevel(logging.ERROR)

    # Create temporary directory for downloaded METS files.
    temp_dir = tempfile.mkdtemp()

    pipeline_uuid = options["pipeline"]
    delete_all = options["delete_all"]
    delete_before_reindexing = False
    if options["delete"]:
        delete_before_reindexing = True

    if options["uuid"]:
        aips_to_index = storageService.get_file_info(uuid=options["uuid"])
        # If we're indexing only one AIP, don't delete the indices.
        delete_all = False
    else:
        # For bulk operations, index all AIPs and AICs associated
        # with the pipeline that are not deleted or replicas.
        packages = storageService.get_file_info()
        aips_to_index = storageService.filter_packages(
            packages,
            package_types=PACKAGE_TYPES_TO_INDEX,
            pipeline_uuid=pipeline_uuid,
            filter_replicas=True,
        )
    aips_to_index_count = len(aips_to_index)

    # If there's nothing to index, log error and quit.
    if not aips_to_index_count:
        self.error("No AIPs found to index. Quitting.")
        sys.exit(1)

    # Set up es_client and delete indices if required.
    es_client = setup_es_for_aip_reindexing(self, delete_all)
    self.info("Rebuilding 'aips' and 'aipfiles' indices")

    # Index packages.
    packages_not_indexed = []
    aip_indexed_count = 0
    for aip in aips_to_index:
        is_aic = False
        if aip["package_type"] == "AIC":
            is_aic = True
        index_success = self.process_package(
            es_client, aip, temp_dir, delete_before_reindexing, is_aic=is_aic
        )
        if index_success:
            aip_indexed_count += 1
        else:
            packages_not_indexed.append(aip["uuid"])

    # Clean up and report on packages indexed.
    self.info("Cleaning up")
    shutil.rmtree(temp_dir)
    if packages_not_indexed:
        self.error(
            "Indexing complete. Indexed {count} of {total} AIPs/AICs. "
            "Packages not indexed: {uuids}.".format(
                count=aip_indexed_count,
                total=aips_to_index_count,
                uuids=", ".join(packages_not_indexed),
            )
        )
    else:
        pluralized_aips_aics_term = (
            "AIP/AIC" if aip_indexed_count == 1 else "AIPs/AICs"
        )
        self.success(
            "Indexing complete. Successfully indexed {count} {term}.".format(
                count=aip_indexed_count, term=pluralized_aips_aics_term
            )
        )
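# Hypothetical: driving the reindex handle() above from code via Django's
# call_command. The command name and the pipeline value are placeholders, not
# confirmed names from this codebase; option keys mirror the options dict
# read by handle().
from django.core.management import call_command

call_command(
    "rebuild_aip_index_from_storage_service",  # placeholder command name
    pipeline="<pipeline-uuid>",                # placeholder UUID
    delete=True,
)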
def list_display(request):
    if 'aips' not in settings.SEARCH_ENABLED:
        return render(request, 'archival_storage/list.html')
    current_page_number = int(request.GET.get('page', 1))
    logger.debug('Current page: %s', current_page_number)

    # get count of AIP files
    es_client = elasticSearchFunctions.get_client()
    aip_indexed_file_count = aip_file_count(es_client)

    # get AIPs
    order_by = request.GET.get('order_by', 'name_unanalyzed')
    sort_by = request.GET.get('sort_by', 'up')
    if sort_by == 'down':
        sort_direction = 'desc'
    else:
        sort_direction = 'asc'
    sort_specification = order_by + ':' + sort_direction
    sort_params = 'order_by=' + order_by + '&sort_by=' + sort_by

    # get list of UUIDs of AIPs that are deleted or pending deletion
    aips_deleted_or_pending_deletion = []
    should_haves = [
        {'match': {'status': 'DEL_REQ'}},
        {'match': {'status': 'DELETED'}},
    ]
    query = {"query": {"bool": {"should": should_haves}}}
    deleted_aip_results = es_client.search(
        body=query,
        index='aips',
        doc_type='aip',
        fields='uuid,status'
    )
    for deleted_aip in deleted_aip_results['hits']['hits']:
        aips_deleted_or_pending_deletion.append(deleted_aip['fields']['uuid'][0])

    # Fetch results and paginate
    def es_pager(page, page_size):
        """
        Fetch one page of normalized entries from Elasticsearch.

        :param page: 1-indexed page to fetch
        :param page_size: Number of entries on a page
        :return: List of dicts for each entry, where keys and values have
                 been cleaned up
        """
        start = (page - 1) * page_size
        results = es_client.search(
            index='aips',
            doc_type='aip',
            body=elasticSearchFunctions.MATCH_ALL_QUERY,
            fields='origin,uuid,filePath,created,name,size,encrypted',
            sort=sort_specification,
            size=page_size,
            from_=start,
        )
        # normalize results - each of the fields contains a single value,
        # but is returned from the ES API as a single-length array
        # e.g. {"fields": {"uuid": ["abcd"], "name": ["aip"] ...}}
        return [elasticSearchFunctions.normalize_results_dict(d)
                for d in results['hits']['hits']]

    items_per_page = 10
    count = es_client.count(
        index='aips', doc_type='aip',
        body=elasticSearchFunctions.MATCH_ALL_QUERY)['count']
    results = LazyPagedSequence(es_pager, page_size=items_per_page, length=count)

    # Paginate
    page = helpers.pager(results, items_per_page, current_page_number)

    # process deletion, etc., and format results
    aips = []
    for aip in page.object_list:
        # If an AIP was deleted or is pending deletion, react if status changed
        if aip['uuid'] in aips_deleted_or_pending_deletion:
            # check with storage server to see current status
            api_results = storage_service.get_file_info(uuid=aip['uuid'])
            try:
                aip_status = api_results[0]['status']
            except IndexError:
                # Storage service does not know about this AIP
                # TODO what should happen here?
                logger.info("AIP not found in storage service: {}".format(aip))
                continue
            # delete AIP metadata in ElasticSearch if AIP has been deleted
            # from the storage server
            # TODO: handle this asynchronously
            if aip_status == 'DELETED':
                elasticSearchFunctions.delete_aip(es_client, aip['uuid'])
                elasticSearchFunctions.delete_aip_files(es_client, aip['uuid'])
            elif aip_status != 'DEL_REQ':
                # update the status in ElasticSearch for this AIP
                elasticSearchFunctions.mark_aip_stored(es_client, aip['uuid'])
        else:
            aip_status = 'UPLOADED'

        # Tweak AIP presentation and add to display array
        if aip_status != 'DELETED':
            aip['status'] = AIP_STATUS_DESCRIPTIONS[aip_status]
            try:
                size = '{0:.2f} MB'.format(float(aip['size']))
            except (TypeError, ValueError):
                size = 'Removed'
            aip['size'] = size
            aip['href'] = aip['filePath'].replace(AIPSTOREPATH + '/', "AIPsStore/")
            aip['date'] = aip['created']
            aips.append(aip)

    total_size = total_size_of_aips(es_client)
    # Find out which AIPs are encrypted

    return render(request, 'archival_storage/list.html', {
        'total_size': total_size,
        'aip_indexed_file_count': aip_indexed_file_count,
        'aips': aips,
        'page': page,
        'search_params': sort_params,
    })
def store_aip(aip_destination_uri, aip_path, sip_uuid, sip_name, sip_type):
    """
    Stores an AIP with the storage service.

    aip_destination_uri = storage service destination URI, should be of
        purpose AIP Store (AS)
    aip_path = Full absolute path to the AIP's current location on the local
        filesystem
    sip_uuid = UUID of the SIP, which will become the UUID of the AIP
    sip_name = SIP name. Not used directly, but part of the AIP name

    Example inputs:
    storeAIP.py
        "/api/v1/location/9c2b5bb7-abd6-477b-88e0-57107219dace/"
        "/var/archivematica/sharedDirectory/currentlyProcessing/ep6-0737708e-9b99-471a-b331-283e2244164f/ep6-0737708e-9b99-471a-b331-283e2244164f.7z"
        "0737708e-9b99-471a-b331-283e2244164f"
        "ep6"
    """
    # FIXME Assume current Location is the one set up by default until location
    # is passed in properly, or use Agent to make sure is correct CP
    current_location = storage_service.get_location(purpose="CP")[0]

    # If ``aip_path`` does not exist, this may be a DIP that was not uploaded.
    # In that case, it will be in the uploadDIP/ directory instead of the
    # uploadedDIPs/ directory.
    if not os.path.exists(aip_path):
        aip_path = get_upload_dip_path(aip_path)

    # Make aip_path relative to the Location
    shared_path = os.path.join(current_location['path'], '')  # Ensure ends with /
    relative_aip_path = aip_path.replace(shared_path, '')

    # Get the package type: AIC or AIP
    if 'SIP' in sip_type or 'AIP' in sip_type:  # Also matches AIP-REIN
        package_type = "AIP"
    elif 'AIC' in sip_type:  # Also matches AIC-REIN
        package_type = 'AIC'
    elif 'DIP' in sip_type:
        package_type = 'DIP'

    # Uncompressed directory AIPs must be terminated in a /,
    # otherwise the storage service will place the directory
    # inside another directory of the same name.
    current_path = os.path.basename(aip_path)
    if os.path.isdir(aip_path) and not aip_path.endswith('/'):
        relative_aip_path = relative_aip_path + '/'

    # DIPs cannot share the AIP UUID, as the storage service depends on
    # having a unique UUID; assign a new one before uploading.
    # TODO allow mapping the AIP UUID to the DIP UUID for retrieval.
    related_package_uuid = None
    if sip_type == 'DIP':
        uuid = str(uuid4())
        print('Checking if DIP {} parent AIP has been created...'.format(uuid))
        # Set related package UUID, so a relationship to the parent AIP can be
        # created if the AIP has been stored. If the AIP hasn't yet been
        # stored, take note of the DIP's UUID so the relationship can later
        # be created when the AIP is stored.
        try:
            storage_service.get_file_info(uuid=sip_uuid)[0]  # Check existence
            related_package_uuid = sip_uuid
            print('Parent AIP exists so relationship can be created.')
        except IndexError:
            UnitVariable.objects.create(unittype='SIP',
                                        unituuid=sip_uuid,
                                        variable='relatedPackage',
                                        variablevalue=uuid)
            print('Noting DIP UUID {} related to AIP so relationship can be '
                  'created when AIP is stored.'.format(uuid))
    else:
        uuid = sip_uuid
        related_package = get_object_or_None(UnitVariable,
                                             unituuid=sip_uuid,
                                             variable='relatedPackage')
        related_package_uuid = (related_package.variablevalue
                                if related_package is not None else None)

    # If AIP is a directory, calculate size recursively
    if os.path.isdir(aip_path):
        size = 0
        for dirpath, _, filenames in os.walk(aip_path):
            for filename in filenames:
                file_path = os.path.join(dirpath, filename)
                size += os.path.getsize(file_path)
    else:
        size = os.path.getsize(aip_path)

    # Get the AIP subtype from any DC type attribute supplied by the user for
    # the AIP. If found, this will replace 'Archival Information Package' in
    # ``<mets:div TYPE='Archival Information Package'>`` in the pointer file.
    sip_metadata_uuid = '3e48343d-e2d2-4956-aaa3-b54d26eb9761'
    try:
        dc = DublinCore.objects.get(metadataappliestotype_id=sip_metadata_uuid,
                                    metadataappliestoidentifier=uuid)
    except DublinCore.DoesNotExist:
        aip_subtype = 'Archival Information Package'
    else:
        aip_subtype = dc.type

    # Store the AIP
    (new_file, error_msg) = storage_service.create_file(
        uuid=uuid,
        origin_location=current_location['resource_uri'],
        origin_path=relative_aip_path,
        current_location=aip_destination_uri,
        current_path=current_path,
        package_type=package_type,
        aip_subtype=aip_subtype,
        size=size,
        update='REIN' in sip_type,
        related_package_uuid=related_package_uuid,
        events=get_events_from_db(uuid),
        agents=get_agents_from_db(uuid))

    if new_file is not None and new_file.get('status', '') != "FAIL":
        message = "Storage service created {}: {}".format(sip_type, new_file)
        LOGGER.info(message)
        print(message)
        sys.exit(0)
    else:
        print("{} creation failed. See Storage Service logs for more details"
              .format(sip_type), file=sys.stderr)
        print(error_msg or "Package status: Failed", file=sys.stderr)
        LOGGER.warning(
            "{} unable to be created: {}. See logs for more details.".format(
                sip_type, error_msg))
        sys.exit(1)
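# The recursive size calculation inside store_aip is self-contained; pulled
# out as a hypothetical standalone helper (not part of the original module),
# it reads:
import os


def directory_size(path):
    """Sum the sizes of all files below path, mirroring the os.walk loop above."""
    total = 0
    for dirpath, _, filenames in os.walk(path):
        for filename in filenames:
            total += os.path.getsize(os.path.join(dirpath, filename))
    return total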
def index_aip():
    """Write AIP information to ElasticSearch."""
    sip_uuid = sys.argv[1]  # %SIPUUID%
    sip_name = sys.argv[2]  # %SIPName%
    sip_path = sys.argv[3]  # %SIPDirectory%
    sip_type = sys.argv[4]  # %SIPType%

    # Check if ElasticSearch is enabled
    client_config_path = '/etc/archivematica/MCPClient/clientConfig.conf'
    config = ConfigParser.SafeConfigParser()
    config.read(client_config_path)
    elastic_search_disabled = False
    try:
        elastic_search_disabled = config.getboolean(
            'MCPClient', "disableElasticsearchIndexing")
    except ConfigParser.NoOptionError:
        pass
    if elastic_search_disabled:
        print('Skipping indexing: indexing is currently disabled in',
              client_config_path)
        return 0

    print('SIP UUID:', sip_uuid)
    aip_info = storage_service.get_file_info(uuid=sip_uuid)
    print('AIP info:', aip_info)
    aip_info = aip_info[0]

    mets_name = 'METS.{}.xml'.format(sip_uuid)
    mets_path = os.path.join(sip_path, mets_name)

    mods_paths = list_mods(sip_path)
    identifiers = []
    for mods in mods_paths:
        identifiers.extend(extract_identifiers_from_mods(mods))

    # If this is an AIC, find the number of AIPs stored in it and index that
    aips_in_aic = None
    if sip_type == "AIC":
        try:
            uv = UnitVariable.objects.get(unittype="SIP",
                                          unituuid=sip_uuid,
                                          variable="AIPsinAIC")
            aips_in_aic = uv.variablevalue
        except UnitVariable.DoesNotExist:
            pass

    print('Indexing AIP info')
    # Delete ES index before creating new one if reingesting
    if 'REIN' in sip_type:
        print('Deleting outdated entry for AIP and AIP files with UUID',
              sip_uuid, 'from archival storage')
        elasticSearchFunctions.delete_aip(sip_uuid)
        elasticSearchFunctions.connect_and_delete_aip_files(sip_uuid)

    # Index AIP
    elasticSearchFunctions.connect_and_index_aip(
        sip_uuid,
        sip_name,
        aip_info['current_full_path'],
        mets_path,
        size=aip_info['size'],
        aips_in_aic=aips_in_aic,
        identifiers=identifiers)

    # Index AIP files
    print('Indexing AIP files')
    # Even though we treat MODS identifiers as SIP-level, we need to index them
    # here because the archival storage tab actually searches on the
    # aips/aipfile index.
    exitCode = elasticSearchFunctions.connect_and_index_files(
        index='aips',
        type='aipfile',
        uuid=sip_uuid,
        pathToArchive=sip_path,
        identifiers=identifiers,
        sipName=sip_name,
    )
    if exitCode == 1:
        print('Error indexing AIP files', file=sys.stderr)
        return 1
    return 0
def index_aip():
    """Write AIP information to ElasticSearch."""
    sip_uuid = sys.argv[1]  # %SIPUUID%
    sip_name = sys.argv[2]  # %SIPName%
    sip_path = sys.argv[3]  # %SIPDirectory%
    sip_type = sys.argv[4]  # %SIPType%

    if not mcpclient_settings.SEARCH_ENABLED:
        logger.info('Skipping indexing: indexing is currently disabled.')
        return 0

    elasticSearchFunctions.setup_reading_from_conf(mcpclient_settings)
    client = elasticSearchFunctions.get_client()

    print('SIP UUID:', sip_uuid)
    aip_info = storage_service.get_file_info(uuid=sip_uuid)
    print('AIP info:', aip_info)
    aip_info = aip_info[0]

    mets_name = 'METS.{}.xml'.format(sip_uuid)
    mets_path = os.path.join(sip_path, mets_name)

    identifiers = get_identifiers(sip_path)

    # If this is an AIC, find the number of AIPs stored in it and index that
    aips_in_aic = None
    if sip_type == "AIC":
        try:
            uv = UnitVariable.objects.get(unittype="SIP",
                                          unituuid=sip_uuid,
                                          variable="AIPsinAIC")
            aips_in_aic = uv.variablevalue
        except UnitVariable.DoesNotExist:
            pass

    print('Indexing AIP info')
    # Delete ES index before creating new one if reingesting
    if 'REIN' in sip_type:
        print('Deleting outdated entry for AIP and AIP files with UUID',
              sip_uuid, 'from archival storage')
        elasticSearchFunctions.delete_aip(client, sip_uuid)
        elasticSearchFunctions.delete_aip_files(client, sip_uuid)

    # Index AIP
    elasticSearchFunctions.index_aip(
        client,
        sip_uuid,
        sip_name,
        aip_info['current_full_path'],
        mets_path,
        size=aip_info['size'],
        aips_in_aic=aips_in_aic,
        identifiers=identifiers,
        encrypted=aip_info['encrypted'])

    # Index AIP files
    print('Indexing AIP files')
    # Even though we treat MODS identifiers as SIP-level, we need to index them
    # here because the archival storage tab actually searches on the
    # aips/aipfile index.
    exitCode = elasticSearchFunctions.index_files(
        client,
        index='aips',
        type_='aipfile',
        uuid=sip_uuid,
        pathToArchive=sip_path,
        identifiers=identifiers,
        sipName=sip_name,
    )
    if exitCode == 1:
        print('Error indexing AIP files', file=sys.stderr)
        return 1
    return 0
def index_from_aipstore(uuid):
    # Check that the UUID exists in the AIPstore
    file_info = storage_service.get_file_info(uuid=uuid)
    if len(file_info) != 1:
        print("Error: number of packages returned from aipstore: {}. Must be 1"
              .format(len(file_info)))
        return -1

    # Check that the package_type is "AIP"
    print("file info: {}".format(file_info))
    if file_info[0]['package_type'] != 'AIP':
        print("Error: package is not AIP: {}".format(
            file_info[0]['package_type']))
        return -2

    # Get AIP file name from file info
    basename = os.path.basename(file_info[0]['current_path'])
    filename, file_extension = os.path.splitext(basename)

    # Get AIP download URL
    aip_download_url = storage_service.download_file_url(file_uuid=uuid)
    print("AIP download URL: {}".format(aip_download_url))

    # Create a temp directory for processing
    tempdir = tempfile.mkdtemp(prefix='aiptmp', dir=TMP_DIR_BASE)
    print("Created: {}".format(tempdir))

    # Download file to temp directory
    urllib.urlretrieve(aip_download_url, os.path.join(tempdir, basename))
    print("aip downloaded to directory")

    # Expand AIP files
    command_string = "atool --extract-to=. {}".format(basename)
    print("will execute: {}".format(command_string))
    p = subprocess.Popen(shlex.split(command_string),
                         cwd=tempdir,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    output = p.communicate()  # (stdout, stderr) tuple
    if p.returncode == 0:
        print('Successfully extracted AIP')
        print('\n'.join(output))
    else:
        print('Failed to extract AIP')
        print('\n'.join(output))

    # Delete downloaded file now that we have it expanded
    os.remove(os.path.join(tempdir, basename))

    # Get AIP path to pass to the client script
    dirlist = glob.glob(os.path.join(tempdir, "*"))
    if len(dirlist) != 1:
        print("Error: {} must have only one directory".format(tempdir))
        return -4
    if not os.path.isdir(dirlist[0]):
        print("Error: {} must be a directory".format(dirlist[0]))
        return -4

    # Populate the 4 variables needed to call the AIP index script
    sip_uuid = uuid
    sip_name = filename[:-37]  # strip uuid and dashes
    sip_path = os.path.join(dirlist[0], "data")  # METS etc. inside the data/ directory of the AIP
    sip_type = "REIN"  # set as reingest so that existing index entries are removed beforehand

    command_string = "./indexAIP.py {} {} {} {}".format(
        sip_uuid, sip_name, sip_path, sip_type)
    print("will execute: {}".format(command_string))
    p = subprocess.Popen(
        shlex.split(command_string),
        cwd="/usr/lib/archivematica/MCPClient/clientScripts",
        env={
            "DJANGO_SETTINGS_MODULE": "settings.common",
            "PYTHONPATH": "/usr/share/archivematica/dashboard:/usr/lib/archivematica/archivematicaCommon",
        },
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
    output = p.communicate()  # (stdout, stderr) tuple
    if p.returncode == 0:
        print('Successfully indexed AIP {0}'.format(sip_uuid))
        print('\n'.join(output))
    else:
        print('Failed to index AIP {0}'.format(sip_uuid))
        print('\n'.join(output))

    # Delete temporary processing directory
    shutil.rmtree(tempdir)
def processAIPThenDeleteMETSFile(path, temp_dir, es_client,
                                 delete_existing_data=False):
    archive_file = os.path.basename(path)

    # Regex match the UUID - AIP might end with .7z, .tar.bz2, or
    # something else.
    match = re.search(
        r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}",
        archive_file)
    if match is not None:
        aip_uuid = match.group()
    else:
        return -1

    print("Processing AIP", aip_uuid)

    if delete_existing_data is True:
        print("Deleting AIP", aip_uuid, "from aips/aip and aips/aipfile.")
        elasticSearchFunctions.delete_aip(es_client, aip_uuid)
        elasticSearchFunctions.delete_aip_files(es_client, aip_uuid)

    # AIP filenames are <name>-<uuid><extension>
    # Index of match end is right before the extension
    subdir = archive_file[:match.end()]
    aip_name = subdir[:-37]
    mets_file = "METS." + aip_uuid + ".xml"
    mets_file_relative_path = os.path.join("data", mets_file)
    if os.path.isfile(path):
        mets_file_relative_path = os.path.join(subdir, mets_file_relative_path)
    path_to_mets = extract_file(
        archive_path=path,
        destination_dir=temp_dir,
        relative_path=mets_file_relative_path,
    )

    # If AIC, need to extract number of AIPs in AIC to index as well
    aips_in_aic = None
    root = etree.parse(path_to_mets)
    try:
        aip_type = ns.xml_find_premis(
            root,
            "mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore/dcterms:type"
        ).text
    except AttributeError:
        pass
    else:
        if aip_type == "Archival Information Collection":
            aips_in_aic = get_aips_in_aic(root, path, temp_dir)

    aip_info = storage_service.get_file_info(uuid=aip_uuid)
    if not aip_info:
        print("Information not found in Storage Service for AIP UUID: ",
              aip_uuid)
        return 1

    return elasticSearchFunctions.index_aip_and_files(
        client=es_client,
        uuid=aip_uuid,
        aip_stored_path=path,
        mets_staging_path=path_to_mets,
        name=aip_name,
        aip_size=aip_info[0]["size"],
        aips_in_aic=aips_in_aic,
        identifiers=[],  # TODO get these
    )
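# Quick illustration of the filename convention the regex above relies on,
# using the example AIP name from the store_aip docstring earlier in this
# section: a UUID plus its leading hyphen is 37 characters, so slicing it off
# the matched prefix yields the bare AIP name.
import re

archive_file = "ep6-0737708e-9b99-471a-b331-283e2244164f.7z"
match = re.search(
    r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}",
    archive_file)
subdir = archive_file[:match.end()]  # "ep6-0737708e-9b99-471a-b331-283e2244164f"
aip_name = subdir[:-37]              # "ep6"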
def dspace_handle_to_archivesspace(job, sip_uuid):
    """Fetch the DSpace handle from the Storage Service and send to ArchivesSpace."""
    # Get association to ArchivesSpace if it exists
    try:
        digital_object = models.ArchivesSpaceDigitalObject.objects.get(
            sip_id=sip_uuid)
    except models.ArchivesSpaceDigitalObject.DoesNotExist:
        job.pyprint("SIP", sip_uuid,
                    "not associated with an ArchivesSpace component")
        return NO_ACTION
    job.pyprint(
        "Digital Object",
        digital_object.remoteid,
        "for SIP",
        digital_object.sip_id,
        "found",
    )
    logger.info(
        "Digital Object %s for SIP %s found",
        digital_object.remoteid,
        digital_object.sip_id,
    )

    # Get dspace handle from SS
    file_info = storage_service.get_file_info(uuid=sip_uuid)[0]
    try:
        handle = file_info["misc_attributes"]["handle"]
    except KeyError:
        job.pyprint("AIP has no DSpace handle stored")
        return NO_ACTION
    job.pyprint("DSpace handle:", handle)
    logger.info("DSpace handle: %s", handle)

    # POST Dspace handle to ArchivesSpace
    # Get ArchivesSpace config
    config = models.DashboardSetting.objects.get_dict(
        "upload-archivesspace_v0.0")
    archivesspace_url = config["base_url"]

    # Log in
    url = archivesspace_url + "/users/" + config["user"] + "/login"
    params = {"password": config["passwd"]}
    logger.debug("Log in to ArchivesSpace URL: %s", url)
    response = requests.post(
        url, params=params,
        timeout=mcpclient_settings.AGENTARCHIVES_CLIENT_TIMEOUT)
    logger.debug("Response: %s %s", response, response.content)
    session_id = response.json()["session"]
    headers = {"X-ArchivesSpace-Session": session_id}

    # Get Digital Object from ArchivesSpace
    url = archivesspace_url + digital_object.remoteid
    logger.debug("Get Digital Object info URL: %s", url)
    response = requests.get(
        url, headers=headers,
        timeout=mcpclient_settings.AGENTARCHIVES_CLIENT_TIMEOUT)
    logger.debug("Response: %s %s", response, response.content)
    body = response.json()

    # Update
    url = archivesspace_url + digital_object.remoteid
    file_version = {
        "file_uri": handle,
        "use_statement": config["use_statement"],
        "xlink_show_attribute": config["xlink_show"],
        "xlink_actuate_attribute": config["xlink_actuate"],
    }
    body["file_versions"].append(file_version)
    logger.debug("Modified Digital Object: %s", body)
    response = requests.post(
        url,
        headers=headers,
        json=body,
        timeout=mcpclient_settings.AGENTARCHIVES_CLIENT_TIMEOUT,
    )
    job.pyprint("Update response:", response, response.content)
    logger.debug("Response: %s %s", response, response.content)
    if response.status_code != 200:
        job.pyprint("Error updating", digital_object.remoteid)
        return ERROR
    return COMPLETED
def processAIPThenDeleteMETSFile(path, temp_dir, es_client,
                                 delete_existing_data=False):
    archive_file = os.path.basename(path)

    # Regex match the UUID - AIP might end with .7z, .tar.bz2, or
    # something else.
    match = re.search(
        r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}",
        archive_file)
    if match is not None:
        aip_uuid = match.group()
    else:
        return -1

    print('Processing AIP', aip_uuid)

    if delete_existing_data is True:
        print('Deleting AIP', aip_uuid, 'from aips/aip and aips/aipfile.')
        elasticSearchFunctions.delete_aip(es_client, aip_uuid)
        elasticSearchFunctions.delete_aip_files(es_client, aip_uuid)

    # AIP filenames are <name>-<uuid><extension>
    # Index of match end is right before the extension
    subdir = archive_file[:match.end()]
    aip_name = subdir[:-37]
    mets_file = "METS." + aip_uuid + ".xml"
    mets_file_relative_path = os.path.join("data", mets_file)
    if os.path.isfile(path):
        mets_file_relative_path = os.path.join(subdir, mets_file_relative_path)
    path_to_mets = extract_file(
        archive_path=path,
        destination_dir=temp_dir,
        relative_path=mets_file_relative_path)

    # If AIC, need to extract number of AIPs in AIC to index as well
    aips_in_aic = None
    root = etree.parse(path_to_mets)
    try:
        aip_type = root.find(
            "m:dmdSec/m:mdWrap/m:xmlData/dc:dublincore/dc:type",
            namespaces=NSMAP).text
    except AttributeError:
        pass
    else:
        if aip_type == "Archival Information Collection":
            aips_in_aic = get_aips_in_aic(root, path, temp_dir)

    aip_info = storage_service.get_file_info(uuid=aip_uuid)
    if aip_info:
        elasticSearchFunctions.index_aip(
            client=es_client,
            uuid=aip_uuid,
            name=aip_name,
            filePath=path,
            pathToMETS=path_to_mets,
            aips_in_aic=aips_in_aic,
            identifiers=[],  # TODO get these
            size=aip_info[0]['size'],
        )
        elasticSearchFunctions.index_mets_file_metadata(
            client=es_client,
            uuid=aip_uuid,
            metsFilePath=path_to_mets,
            index='aips',
            type_='aipfile',
            sipName=aip_name,
            identifiers=[],  # TODO get these
        )
def list_display(request):
    if "aips" not in settings.SEARCH_ENABLED:
        return render(request, "archival_storage/list.html")
    current_page_number = int(request.GET.get("page", 1))
    logger.debug("Current page: %s", current_page_number)

    # get count of AIP files
    es_client = elasticSearchFunctions.get_client()
    aip_indexed_file_count = aip_file_count(es_client)

    # get AIPs
    order_by = request.GET.get("order_by", "name")
    sort_by = request.GET.get("sort_by", "up")
    sort_params = "order_by=" + order_by + "&sort_by=" + sort_by
    # use raw subfield to sort by name
    if order_by == "name":
        order_by = order_by + ".raw"
    # change sort_by param to ES sort directions
    if sort_by == "down":
        sort_by = "desc"
    else:
        sort_by = "asc"
    sort_specification = order_by + ":" + sort_by

    # get list of UUIDs of AIPs that are deleted or pending deletion
    aips_deleted_or_pending_deletion = []
    should_haves = [
        {"match": {"status": "DEL_REQ"}},
        {"match": {"status": "DELETED"}},
    ]
    query = {"query": {"bool": {"should": should_haves}}}
    deleted_aip_results = es_client.search(
        body=query, index="aips", _source="uuid,status")
    for deleted_aip in deleted_aip_results["hits"]["hits"]:
        aips_deleted_or_pending_deletion.append(deleted_aip["_source"]["uuid"])

    # Fetch results and paginate
    def es_pager(page, page_size):
        """
        Fetch one page of normalized entries from Elasticsearch.

        :param page: 1-indexed page to fetch
        :param page_size: Number of entries on a page
        :return: List of dicts for each entry, where keys and values have
                 been cleaned up
        """
        start = (page - 1) * page_size
        results = es_client.search(
            index="aips",
            body={"query": {"match_all": {}}},
            _source="origin,uuid,filePath,created,name,size,encrypted",
            sort=sort_specification,
            size=page_size,
            from_=start,
        )
        return [d["_source"] for d in results["hits"]["hits"]]

    items_per_page = 10
    count = es_client.count(
        index="aips", body={"query": {"match_all": {}}})["count"]
    results = LazyPagedSequence(es_pager, page_size=items_per_page, length=count)

    # Paginate
    page = helpers.pager(results, items_per_page, current_page_number)

    # process deletion, etc., and format results
    aips = []
    for aip in page.object_list:
        # If an AIP was deleted or is pending deletion, react if status changed
        if aip["uuid"] in aips_deleted_or_pending_deletion:
            # check with storage server to see current status
            api_results = storage_service.get_file_info(uuid=aip["uuid"])
            try:
                aip_status = api_results[0]["status"]
            except IndexError:
                # Storage service does not know about this AIP
                # TODO what should happen here?
                logger.info("AIP not found in storage service: {}".format(aip))
                continue
            # delete AIP metadata in ElasticSearch if AIP has been deleted
            # from the storage server
            # TODO: handle this asynchronously
            if aip_status == "DELETED":
                elasticSearchFunctions.delete_aip(es_client, aip["uuid"])
                elasticSearchFunctions.delete_aip_files(es_client, aip["uuid"])
            elif aip_status != "DEL_REQ":
                # update the status in ElasticSearch for this AIP
                elasticSearchFunctions.mark_aip_stored(es_client, aip["uuid"])
        else:
            aip_status = "UPLOADED"

        # Tweak AIP presentation and add to display array
        if aip_status != "DELETED":
            aip["status"] = AIP_STATUS_DESCRIPTIONS[aip_status]
            try:
                size = "{0:.2f} MB".format(float(aip["size"]))
            except (TypeError, ValueError):
                size = "Removed"
            aip["size"] = size
            aip["href"] = aip["filePath"].replace(AIPSTOREPATH + "/", "AIPsStore/")
            aip["date"] = aip["created"]
            aips.append(aip)

    total_size = total_size_of_aips(es_client)
    # Find out which AIPs are encrypted

    return render(
        request,
        "archival_storage/list.html",
        {
            "total_size": total_size,
            "aip_indexed_file_count": aip_indexed_file_count,
            "aips": aips,
            "page": page,
            "search_params": sort_params,
        },
    )
def store_aip(job, aip_destination_uri, aip_path, sip_uuid, sip_name, sip_type):
    """
    Stores an AIP with the storage service.

    aip_destination_uri = storage service destination URI, should be of
        purpose AIP Store (AS)
    aip_path = Full absolute path to the AIP's current location on the local
        filesystem
    sip_uuid = UUID of the SIP, which will become the UUID of the AIP
    sip_name = SIP name. Not used directly, but part of the AIP name

    Example inputs:
    storeAIP.py
        "/api/v1/location/9c2b5bb7-abd6-477b-88e0-57107219dace/"
        "/var/archivematica/sharedDirectory/currentlyProcessing/ep6-0737708e-9b99-471a-b331-283e2244164f/ep6-0737708e-9b99-471a-b331-283e2244164f.7z"
        "0737708e-9b99-471a-b331-283e2244164f"
        "ep6"
    """
    # FIXME Assume current Location is the one set up by default until location
    # is passed in properly, or use Agent to make sure is correct CP
    current_location = storage_service.get_location(purpose="CP")[0]

    # If ``aip_path`` does not exist, this may be a DIP that was not uploaded.
    # In that case, it will be in the uploadDIP/ directory instead of the
    # uploadedDIPs/ directory.
    if not os.path.exists(aip_path):
        aip_path = get_upload_dip_path(aip_path)

    # Make aip_path relative to the Location
    shared_path = os.path.join(current_location["path"], "")  # Ensure ends with /
    relative_aip_path = aip_path.replace(shared_path, "")

    # Get the package type: AIC or AIP
    if "SIP" in sip_type or "AIP" in sip_type:  # Also matches AIP-REIN
        package_type = "AIP"
    elif "AIC" in sip_type:  # Also matches AIC-REIN
        package_type = "AIC"
    elif "DIP" in sip_type:
        package_type = "DIP"

    # Uncompressed directory AIPs must be terminated in a /,
    # otherwise the storage service will place the directory
    # inside another directory of the same name.
    current_path = os.path.basename(aip_path)
    if os.path.isdir(aip_path) and not aip_path.endswith("/"):
        relative_aip_path = relative_aip_path + "/"

    # DIPs cannot share the AIP UUID, as the storage service depends on
    # having a unique UUID; assign a new one before uploading.
    # TODO allow mapping the AIP UUID to the DIP UUID for retrieval.
    related_package_uuid = None
    if sip_type == "DIP":
        uuid = str(uuid4())
        job.pyprint(
            "Checking if DIP {} parent AIP has been created...".format(uuid))
        # Set related package UUID, so a relationship to the parent AIP can be
        # created if the AIP has been stored. If the AIP hasn't yet been
        # stored, take note of the DIP's UUID so the relationship can later
        # be created when the AIP is stored.
        try:
            storage_service.get_file_info(uuid=sip_uuid)[0]  # Check existence
            related_package_uuid = sip_uuid
            job.pyprint("Parent AIP exists so relationship can be created.")
        except IndexError:
            UnitVariable.objects.create(
                unittype="SIP",
                unituuid=sip_uuid,
                variable="relatedPackage",
                variablevalue=uuid,
            )
            job.pyprint(
                "Noting DIP UUID {} related to AIP so relationship can be "
                "created when AIP is stored.".format(uuid))
    else:
        uuid = sip_uuid
        try:
            related_package = UnitVariable.objects.get(
                unituuid=sip_uuid, variable="relatedPackage")
        except UnitVariable.DoesNotExist:
            pass
        else:
            related_package_uuid = related_package.variablevalue

    # If AIP is a directory, calculate size recursively
    if os.path.isdir(aip_path):
        size = 0
        for dirpath, _, filenames in os.walk(aip_path):
            for filename in filenames:
                file_path = os.path.join(dirpath, filename)
                size += os.path.getsize(file_path)
    else:
        size = os.path.getsize(aip_path)

    # Get the AIP subtype from any DC type attribute supplied by the user for
    # the AIP. If found, this will replace 'Archival Information Package' in
    # ``<mets:div TYPE='Archival Information Package'>`` in the pointer file.
    sip_metadata_uuid = "3e48343d-e2d2-4956-aaa3-b54d26eb9761"
    try:
        dc = DublinCore.objects.get(metadataappliestotype_id=sip_metadata_uuid,
                                    metadataappliestoidentifier=uuid)
    except DublinCore.DoesNotExist:
        aip_subtype = "Archival Information Package"
    else:
        aip_subtype = dc.type

    # Store the AIP
    try:
        new_file = _create_file(
            uuid,
            current_location,
            relative_aip_path,
            aip_destination_uri,
            current_path,
            package_type,
            aip_subtype,
            size,
            sip_type,
            related_package_uuid,
        )
    except StorageServiceCreateFileError as err:
        errmsg = "{} creation failed: {}.".format(sip_type, err)
        logger.warning(errmsg)
        raise Exception(errmsg + " See logs for more details.")

    message = "Storage Service created {}:\n{}".format(sip_type, pformat(new_file))
    logger.info(message)
    job.pyprint(message)

    # Once the DIP is stored, remove it from the uploadDIP watched directory
    # as it will no longer need to be referenced from there by the user or
    # the system.
    rmtree_upload_dip_transitory_loc(package_type, aip_path)
    return 0