def handle(self, *args, **options):
    """Entry point of the rebuild_transfer_backlog command.

    Asks for confirmation, checks that the transfer backlog directory
    exists, connects to Elasticsearch and then deletes, recreates and
    repopulates the index.

    :param options: Command options; ``no_prompt`` and
        ``transfer_backlog_dir`` are read here.
    :raises CommandError: If the backlog directory does not exist or the
        Elasticsearch node cannot be queried.
    """
    if not self.confirm(options['no_prompt']):
        sys.exit(0)

    transfer_backlog_dir = self.prepdir(options['transfer_backlog_dir'])
    if not os.path.exists(transfer_backlog_dir):
        # Interpolate the path here: an exception given two positional
        # arguments renders as a tuple and the ``%s`` is never filled in.
        raise CommandError(
            'Directory does not exist: %s' % transfer_backlog_dir)
    self.success('Rebuilding "transfers" index from {}.'.format(
        transfer_backlog_dir))

    # Connect to Elasticsearch.
    elasticSearchFunctions.setup_reading_from_conf(django_settings)
    es_client = elasticSearchFunctions.get_client()
    try:
        es_info = es_client.info()
    except Exception as err:
        raise CommandError("Unable to connect to Elasticsearch: %s" % err)
    else:
        self.success('Connected to Elasticsearch node {} (v{}).'.format(
            es_info['name'], es_info['version']['number']))

    self.delete_index(es_client)
    self.create_index(es_client)
    self.populate_index(es_client, transfer_backlog_dir)
    self.success('Indexing complete!')
def setup_es_for_aip_reindexing(cmd, delete_all=False):
    """Setup for reindexing AIPs.

    :param cmd: Command object.
    :param delete_all: Optional arg to delete AIP indices.
    :returns: ES client.
    :raises CommandError: If the AIPs index is not listed in
        ``SEARCH_ENABLED`` or the Elasticsearch client cannot be set up.
    """
    if es.AIPS_INDEX not in django_settings.SEARCH_ENABLED:
        raise CommandError(
            "The AIPs indexes are not enabled. Please, make sure to "
            "set the *_SEARCH_ENABLED environment variables to `true` "
            "to enable the AIPs and Transfers indexes, or to `aips` "
            "to only enable the AIPs indexes.")

    try:
        es.setup_reading_from_conf(django_settings)
        es_client = es.get_client()
    except ElasticsearchException as err:
        # ``%s`` is a printf-style placeholder, so it has to be filled
        # with ``%``; ``str.format`` would leave it untouched and drop
        # the underlying error from the message.
        raise CommandError("Unable to connect to Elasticsearch: %s" % err)

    if delete_all:
        cmd.info("Deleting all AIPs in the 'aips' and 'aipfiles' indices")
        time.sleep(3)  # Time for the user to panic and kill the process.
        indices = [es.AIPS_INDEX, es.AIP_FILES_INDEX]
        es_client.indices.delete(",".join(indices), ignore=404)
        es.create_indexes_if_needed(es_client, indices)

    return es_client
def call(jobs):
    """Index each job's transfer and its files in Elasticsearch.

    All jobs are handled inside a single database transaction. For every
    job, ``args[1]`` is the transfer path, ``args[2]`` the transfer UUID
    and ``args[3]`` an optional status (empty string when absent). The
    job status is set to the value returned by the indexing call, or to 0
    when transfers indexing is disabled.
    """
    with transaction.atomic():
        for job in jobs:
            with job.JobContext(logger=logger):
                indexing_enabled = (
                    'transfers' in mcpclient_settings.SEARCH_ENABLED)
                if not indexing_enabled:
                    logger.info(
                        'Skipping indexing: Transfers indexing is currently disabled.'
                    )
                    job.set_status(0)
                else:
                    path = job.args[1]
                    uuid = job.args[2]
                    # The status argument is optional.
                    try:
                        status = job.args[3]
                    except IndexError:
                        status = ''

                    elasticSearchFunctions.setup_reading_from_conf(
                        mcpclient_settings)
                    es_client = elasticSearchFunctions.get_client()
                    exit_code = elasticSearchFunctions.index_transfer_and_files(
                        es_client,
                        uuid,
                        path,
                        status=status,
                        printfn=job.pyprint,
                    )
                    job.set_status(exit_code)
def post_store_hook(job, sip_uuid):
    """Run the follow-up work after an AIP has been stored successfully.

    Marks the arranged files of this SIP as being in an AIP, asks the
    Storage Service to delete every component transfer whose files have
    all been arranged into AIPs (also removing it from Elasticsearch when
    transfers indexing is enabled), then runs the DSpace handle step and
    the post-store callback.

    :param job: Job object; its ``pyprint`` is used for progress output.
    :param sip_uuid: UUID of the stored SIP.
    """
    client = None
    update_es = "transfers" in mcpclient_settings.SEARCH_ENABLED
    if not update_es:
        logger.info(
            "Skipping indexing: Transfers indexing is currently disabled.")
    else:
        elasticSearchFunctions.setup_reading_from_conf(mcpclient_settings)
        client = elasticSearchFunctions.get_client()

    # SIP ARRANGEMENT

    # Flag every arranged file belonging to this SIP as stored in an AIP.
    sip_files = models.File.objects.filter(sip=sip_uuid)
    file_uuids = sip_files.values_list("uuid", flat=True)
    models.SIPArrange.objects.filter(file_uuid__in=file_uuids).update(
        aip_created=True)

    # Collect the transfers those files came from and check each of them.
    # TODO Storage service should index AIPs, knows when to update ES
    arranged_for_sip = models.SIPArrange.objects.filter(
        file_uuid__in=file_uuids)
    transfer_uuids = set(
        arranged_for_sip.values_list("transfer_uuid", flat=True))

    for transfer_uuid in transfer_uuids:
        job.pyprint("Checking if transfer", transfer_uuid,
                    "is fully stored...")
        stored = models.SIPArrange.objects.filter(
            transfer_uuid=transfer_uuid).filter(aip_created=True)
        arranged_uuids = set(stored.values_list("file_uuid", flat=True))
        in_backlog = models.File.objects.filter(transfer=transfer_uuid)
        backlog_uuids = set(in_backlog.values_list("uuid", flat=True))

        # Only act once every backlog file has been arranged.
        if arranged_uuids != backlog_uuids:
            continue

        job.pyprint(
            "Transfer",
            transfer_uuid,
            "fully stored, sending delete request to storage service, deleting from transfer backlog",
        )
        # Submit a deletion request to the Storage Service, then drop the
        # transfer from Elasticsearch if indexing is on.
        storage_service.request_file_deletion(
            uuid=transfer_uuid,
            user_id=0,
            user_email="archivematica system",
            reason_for_deletion="All files in Transfer are now in AIPs.",
        )
        if update_es:
            elasticSearchFunctions.remove_sip_transfer_files(
                client, transfer_uuid)

    # DSPACE HANDLE TO ARCHIVESSPACE
    dspace_handle_to_archivesspace(job, sip_uuid)

    # POST-STORE CALLBACK
    storage_service.post_store_aip_callback(sip_uuid)
def _index_transfer(job, transfer_id, transfer_path, size):
    """Index the transfer and its files in Elasticsearch.

    Does nothing beyond logging when ``transfers`` is not listed in the
    ``SEARCH_ENABLED`` setting.
    """
    indexing_enabled = "transfers" in mcpclient_settings.SEARCH_ENABLED
    if indexing_enabled:
        elasticSearchFunctions.setup_reading_from_conf(mcpclient_settings)
        es_client = elasticSearchFunctions.get_client()
        elasticSearchFunctions.index_transfer_and_files(
            es_client,
            transfer_id,
            transfer_path,
            size,
            printfn=job.pyprint,
        )
    else:
        logger.info(
            "Skipping indexing: Transfers indexing is currently disabled.")
def index_aip(job):
    """Write AIP information to ElasticSearch.

    Reads the SIP UUID, name, staging directory and type from
    ``job.args[1:5]``.

    :returns: 0 when AIPs indexing is disabled, otherwise the value
        returned by ``elasticSearchFunctions.index_aip_and_files``.
    """
    sip_uuid = job.args[1]  # %SIPUUID%
    sip_name = job.args[2]  # %SIPName%
    sip_staging_path = job.args[3]  # %SIPDirectory%
    sip_type = job.args[4]  # %SIPType%

    indexing_enabled = "aips" in mcpclient_settings.SEARCH_ENABLED
    if not indexing_enabled:
        logger.info("Skipping indexing: AIPs indexing is currently disabled.")
        return 0

    elasticSearchFunctions.setup_reading_from_conf(mcpclient_settings)
    es_client = elasticSearchFunctions.get_client()

    # Only the first entry of the Storage Service response is used.
    file_info = storage_service.get_file_info(uuid=sip_uuid)
    job.pyprint("AIP info:", file_info)
    aip_info = file_info[0]

    mets_name = "METS.{}.xml".format(sip_uuid)
    mets_staging_path = os.path.join(sip_staging_path, mets_name)
    identifiers = get_identifiers(job, sip_staging_path)

    # For an AIC, look up how many AIPs it holds so that can be indexed.
    aips_in_aic = None
    if sip_type == "AIC":
        try:
            aips_in_aic = UnitVariable.objects.get(
                unittype="SIP", unituuid=sip_uuid, variable="AIPsinAIC"
            ).variablevalue
        except UnitVariable.DoesNotExist:
            pass

    # On reingest, drop the outdated entries before indexing again.
    if "REIN" in sip_type:
        job.pyprint(
            "Deleting outdated entry for AIP and AIP files with UUID",
            sip_uuid,
            "from archival storage",
        )
        elasticSearchFunctions.delete_aip(es_client, sip_uuid)
        elasticSearchFunctions.delete_aip_files(es_client, sip_uuid)

    job.pyprint("Indexing AIP and AIP files")
    # Even though we treat MODS identifiers as SIP-level, we need to index
    # them here because the archival storage tab actually searches on the
    # aips/aipfile index.
    index_kwargs = dict(
        client=es_client,
        uuid=sip_uuid,
        aip_stored_path=aip_info["current_full_path"],
        mets_staging_path=mets_staging_path,
        name=sip_name,
        aip_size=aip_info["size"],
        aips_in_aic=aips_in_aic,
        identifiers=identifiers,
        encrypted=aip_info["encrypted"],
        printfn=job.pyprint,
    )
    result = elasticSearchFunctions.index_aip_and_files(**index_kwargs)

    if result == 1:
        job.pyprint("Error indexing AIP and AIP files", file=sys.stderr)
    return result
def handle(self, *args, **options):
    """Entry point of the rebuild_transfer_backlog command.

    Recreates the transfers and transfer files indexes and repopulates
    them either from the packages known to the Storage Service or from
    the files under the transfer backlog directory.

    :param options: Command options; ``no_prompt``,
        ``transfer_backlog_dir``, ``from_storage_service`` and
        ``pipeline`` are read here.
    :raises CommandError: If the backlog directory does not exist (when
        rebuilding from files) or the Elasticsearch node cannot be
        queried.
    """
    # Check that the `transfers` part of the search is enabled
    if es.TRANSFERS_INDEX not in django_settings.SEARCH_ENABLED:
        print(
            "The Transfers indexes are not enabled. Please, make sure to "
            "set the *_SEARCH_ENABLED environment variables to `true` "
            "to enable the Transfers and AIPs indexes, or to `transfers` "
            "to only enable the Transfers indexes.")
        sys.exit(1)

    if not self.confirm(options["no_prompt"]):
        sys.exit(0)

    # Ignore elasticsearch-py logging events unless they're errors.
    logging.getLogger("elasticsearch").setLevel(logging.ERROR)
    logging.getLogger("archivematica.common").setLevel(logging.ERROR)

    transfer_backlog_dir = self.prepdir(options["transfer_backlog_dir"])

    if options["from_storage_service"]:
        self.info(
            'Rebuilding "transfers" index from packages in Storage Service.'
        )
    else:
        if not os.path.exists(transfer_backlog_dir):
            # Interpolate the path here: an exception given two
            # positional arguments renders as a tuple and the ``%s`` is
            # never filled in.
            raise CommandError(
                "Directory does not exist: %s" % transfer_backlog_dir)
        self.info('Rebuilding "transfers" index from {}.'.format(
            transfer_backlog_dir))

    # Connect to Elasticsearch.
    es.setup_reading_from_conf(django_settings)
    es_client = es.get_client()
    try:
        es_info = es_client.info()
    except Exception as err:
        raise CommandError("Unable to connect to Elasticsearch: %s" % err)
    else:
        self.info("Connected to Elasticsearch node {} (v{}).".format(
            es_info["name"], es_info["version"]["number"]))

    indexes = [es.TRANSFERS_INDEX, es.TRANSFER_FILES_INDEX]
    self.delete_indexes(es_client, indexes)
    self.create_indexes(es_client, indexes)

    if options["from_storage_service"]:
        pipeline_uuid = options["pipeline"]
        self.populate_data_from_storage_service(es_client, pipeline_uuid)
    else:
        self.populate_data_from_files(es_client, transfer_backlog_dir)
def call(jobs):
    """Remove the indexed files of each job's AIP from Elasticsearch.

    For every job, ``args[1]`` is the AIP UUID. When ``aips`` is not
    listed in the ``SEARCH_ENABLED`` setting the job is skipped with a
    log message.
    """
    for job in jobs:
        with job.JobContext(logger=logger):
            aip_uuid = job.args[1]

            if "aips" not in mcpclient_settings.SEARCH_ENABLED:
                logger.info("Skipping. AIPs indexing is currently disabled.")
                # Move on to the next job rather than returning, so every
                # job in the batch still runs inside its own JobContext.
                continue

            elasticSearchFunctions.setup_reading_from_conf(mcpclient_settings)
            client = elasticSearchFunctions.get_client()

            logger.info("Removing indexed files for AIP %s...", aip_uuid)
            elasticSearchFunctions.delete_aip_files(client, aip_uuid)
def handle(self, *args, **options):
    """Entry point of the rebuild_transfer_backlog command.

    Asks for confirmation, checks that the transfer backlog directory
    exists, connects to Elasticsearch and then deletes, recreates and
    repopulates the ``transfers`` and ``transferfiles`` indexes.

    :param options: Command options; ``no_prompt`` and
        ``transfer_backlog_dir`` are read here.
    :raises CommandError: If the backlog directory does not exist or the
        Elasticsearch node cannot be queried.
    """
    # Check that the `transfers` part of the search is enabled
    if 'transfers' not in django_settings.SEARCH_ENABLED:
        print(
            "The Transfers indexes are not enabled. Please, make sure to "
            "set the *_SEARCH_ENABLED environment variables to `true` "
            "to enable the Transfers and AIPs indexes, or to `transfers` "
            "to only enable the Transfers indexes.")
        sys.exit(1)

    if not self.confirm(options['no_prompt']):
        sys.exit(0)

    transfer_backlog_dir = self.prepdir(options['transfer_backlog_dir'])
    if not os.path.exists(transfer_backlog_dir):
        # Interpolate the path here: an exception given two positional
        # arguments renders as a tuple and the ``%s`` is never filled in.
        raise CommandError(
            'Directory does not exist: %s' % transfer_backlog_dir)
    self.success('Rebuilding "transfers" index from {}.'.format(
        transfer_backlog_dir))

    # Connect to Elasticsearch.
    elasticSearchFunctions.setup_reading_from_conf(django_settings)
    es_client = elasticSearchFunctions.get_client()
    try:
        es_info = es_client.info()
    except Exception as err:
        raise CommandError("Unable to connect to Elasticsearch: %s" % err)
    else:
        self.success('Connected to Elasticsearch node {} (v{}).'.format(
            es_info['name'], es_info['version']['number']))

    indexes = ['transfers', 'transferfiles']
    self.delete_indexes(es_client, indexes)
    self.create_indexes(es_client, indexes)
    self.populate_indexes(es_client, transfer_backlog_dir)
    self.success('Indexing complete!')
# -*- coding: utf-8 -*- from __future__ import absolute_import import django from django.conf import settings from django.core.wsgi import get_wsgi_application django.setup() import elasticSearchFunctions application = get_wsgi_application() # Set up Elasticsearch client elasticSearchFunctions.setup_reading_from_conf(settings)
# archivematicaCommon
from custom_handlers import get_script_logger
import elasticSearchFunctions

from django.conf import settings as mcpclient_settings

logger = get_script_logger(
    'archivematica.mcp.client.elasticSearchIndexProcessTransfer')

if __name__ == '__main__':
    # Nothing to index when search is switched off: exit successfully.
    if not mcpclient_settings.SEARCH_ENABLED:
        logger.info('Skipping indexing: indexing is currently disabled.')
        sys.exit(0)

    # Positional arguments: transfer path, transfer UUID and an optional
    # status that defaults to the empty string.
    transfer_path = sys.argv[1]
    transfer_uuid = sys.argv[2]
    status = sys.argv[3] if len(sys.argv) > 3 else ''

    elasticSearchFunctions.setup_reading_from_conf(mcpclient_settings)
    client = elasticSearchFunctions.get_client()

    # The value returned by the indexing call becomes the exit status.
    exit_code = elasticSearchFunctions.index_files(
        client,
        'transfers',
        'transferfile',
        transfer_uuid,
        transfer_path,
        status=status,
    )
    sys.exit(exit_code)
def handle(self, *args, **options):
    """Rebuild the AIP indexes from the AIPs found under ``rootdir``.

    Walks ``options["rootdir"]`` and indexes every directory
    (uncompressed AIP) or file (compressed AIP) whose name carries a UUID
    suffix, optionally restricted to the AIP given in ``options["uuid"]``.
    With ``options["delete_all"]`` the ``aips`` and ``aipfiles`` indexes
    are deleted and recreated first.

    Exits with status 1 when the AIPs index is not enabled, when
    ``rootdir`` is not a directory, or when Elasticsearch cannot be
    queried.
    """
    # Check that the `aips` part of the search is enabled
    if "aips" not in django_settings.SEARCH_ENABLED:
        print("The AIPs indexes are not enabled. Please, make sure to "
              "set the *_SEARCH_ENABLED environment variables to `true` "
              "to enable the AIPs and Transfers indexes, or to `aips` "
              "to only enable the AIPs indexes.")
        sys.exit(1)

    # Check root directory exists
    if not os.path.isdir(options["rootdir"]):
        print("AIP store location doesn't exist.")
        sys.exit(1)

    # Verify ES is accessible
    elasticSearchFunctions.setup_reading_from_conf(django_settings)
    es_client = elasticSearchFunctions.get_client()

    try:
        es_client.info()
    except Exception:
        print("Error: Elasticsearch may not be running.")
        sys.exit(1)

    # Delete existing data also clears AIPS not found in the
    # provided directory
    if options["delete_all"]:
        print("Deleting all AIPs in the AIP index")
        time.sleep(3)  # Time for the user to panic and kill the process
        indexes = ["aips", "aipfiles"]
        es_client.indices.delete(",".join(indexes), ignore=404)
        elasticSearchFunctions.create_indexes_if_needed(es_client, indexes)

    if not options["uuid"]:
        print("Rebuilding AIPS index from AIPS in", options["rootdir"])
    else:
        print("Rebuilding AIP UUID", options["uuid"])

    temp_dir = tempfile.mkdtemp()
    count = 0
    name_regex = r"-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"
    dir_regex = r"-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$"

    # The temporary directory is removed even if indexing raises.
    try:
        for root, directories, files in scandir.walk(options["rootdir"]):
            # Ignore top-level directories inside ``rootdir`` that are not
            # hex, e.g. we walk ``0771`` but we're ignoring
            # ``transferBacklog``.
            if root == options["rootdir"]:
                directories[:] = [
                    d for d in directories if is_hex(d) and len(d) == 4
                ]

            # Uncompressed AIPs. Iterate over a snapshot: matched entries
            # are removed from ``directories`` inside the loop, and
            # removing from the list being iterated would skip the next
            # sibling.
            for directory in list(directories):
                # Check if dir name matches AIP name format
                if not re.search(dir_regex, directory):
                    continue
                # If running on a single AIP, skip all others
                if (options["uuid"]
                        and options["uuid"].lower() not in directory.lower()):
                    continue
                ret = processAIPThenDeleteMETSFile(
                    path=os.path.join(root, directory),
                    temp_dir=temp_dir,
                    es_client=es_client,
                    delete_existing_data=options["delete"],
                )
                # Don't recurse into this directory. ``list.remove``
                # returns None, so the list is pruned in place and the
                # name is not rebound.
                directories.remove(directory)
                # Update count on successful index
                if ret == 0:
                    count += 1

            # Compressed AIPs
            for filename in files:
                # Check if filename matches AIP name format
                if not re.search(name_regex, filename):
                    continue
                # If running on a single AIP, skip all others
                if (options["uuid"]
                        and options["uuid"].lower() not in filename.lower()):
                    continue
                ret = processAIPThenDeleteMETSFile(
                    path=os.path.join(root, filename),
                    temp_dir=temp_dir,
                    es_client=es_client,
                    delete_existing_data=options["delete"],
                )
                # Update count on successful index
                if ret == 0:
                    count += 1
    finally:
        print("Cleaning up")
        shutil.rmtree(temp_dir)

    print("Indexing complete. Indexed", count, "AIP(s).")
def handle(self, *args, **options):
    """Update the mappings of the AIPs and AIP files indexes.

    Adds keyword/integer field mappings to the AIPs index, adds keyword
    fields and a ``filePath`` text field with a ``raw`` keyword subfield
    to the AIP files index, then runs an update-by-query over the AIP
    files index.

    Exits with status 1 when the AIPs index is not enabled or when the
    Elasticsearch client cannot be set up.
    """
    # Check that the AIPs index is enabled before proceeding.
    aips_enabled = es.AIPS_INDEX in settings.SEARCH_ENABLED
    if not aips_enabled:
        self.error(
            "The AIPs indexes are not enabled. Please, make sure to "
            "set the *_SEARCH_ENABLED environment variables to `true` "
            "to enable the AIPs and Transfers indexes, or to `aips` "
            "to only enable the AIPs indexes.")
        sys.exit(1)

    try:
        es.setup_reading_from_conf(settings)
        es_client = es.get_client()
    except ElasticsearchException:
        self.error("Error: Elasticsearch may not be running.")
        sys.exit(1)

    # Update the AIPs index mappings.
    aips_properties = {
        es.ES_FIELD_ACCESSION_IDS: {"type": "keyword"},
        es.ES_FIELD_STATUS: {"type": "keyword"},
        es.ES_FIELD_FILECOUNT: {"type": "integer"},
        es.ES_FIELD_LOCATION: {"type": "keyword"},
    }
    es_client.indices.put_mapping(
        index=es.AIPS_INDEX,
        doc_type=es.DOC_TYPE,
        body={"properties": aips_properties},
    )

    # Update the AIP files index mapping.
    file_path_mapping = {
        "type": "text",
        "analyzer": "file_path_and_name",
        "fields": {"raw": {"type": "keyword"}},
    }
    aip_files_properties = {
        "accessionid": {"type": "keyword"},
        es.ES_FIELD_STATUS: {"type": "keyword"},
        "filePath": file_path_mapping,
    }
    es_client.indices.put_mapping(
        index=es.AIP_FILES_INDEX,
        doc_type=es.DOC_TYPE,
        body={"properties": aip_files_properties},
    )

    # Perform an update by query on the aipfiles index to populate
    # the filePath.raw subfield from existing text values. No query is
    # given so that all documents are updated.
    es_client.update_by_query(es.AIP_FILES_INDEX)
def index_aip():
    """Write AIP information to ElasticSearch.

    Reads the SIP UUID, name, directory and type from ``sys.argv[1:5]``.

    :returns: 0 when indexing is disabled or completes, 1 when indexing
        the AIP files reports an error.
    """
    sip_uuid = sys.argv[1]  # %SIPUUID%
    sip_name = sys.argv[2]  # %SIPName%
    sip_path = sys.argv[3]  # %SIPDirectory%
    sip_type = sys.argv[4]  # %SIPType%

    if not mcpclient_settings.SEARCH_ENABLED:
        logger.info('Skipping indexing: indexing is currently disabled.')
        return 0

    elasticSearchFunctions.setup_reading_from_conf(mcpclient_settings)
    es_client = elasticSearchFunctions.get_client()

    print('SIP UUID:', sip_uuid)
    # Only the first entry of the Storage Service response is used.
    file_info = storage_service.get_file_info(uuid=sip_uuid)
    print('AIP info:', file_info)
    aip_info = file_info[0]

    mets_path = os.path.join(sip_path, 'METS.{}.xml'.format(sip_uuid))
    identifiers = get_identifiers(sip_path)

    # For an AIC, look up how many AIPs it holds so that can be indexed.
    aips_in_aic = None
    if sip_type == "AIC":
        try:
            aips_in_aic = UnitVariable.objects.get(
                unittype="SIP", unituuid=sip_uuid, variable="AIPsinAIC"
            ).variablevalue
        except UnitVariable.DoesNotExist:
            pass

    print('Indexing AIP info')

    # On reingest, drop the outdated entries before indexing again.
    if 'REIN' in sip_type:
        print('Deleting outdated entry for AIP and AIP files with UUID',
              sip_uuid, 'from archival storage')
        elasticSearchFunctions.delete_aip(es_client, sip_uuid)
        elasticSearchFunctions.delete_aip_files(es_client, sip_uuid)

    # Index AIP
    elasticSearchFunctions.index_aip(
        es_client,
        sip_uuid,
        sip_name,
        aip_info['current_full_path'],
        mets_path,
        size=aip_info['size'],
        aips_in_aic=aips_in_aic,
        identifiers=identifiers,
        encrypted=aip_info['encrypted'],
    )

    # Index AIP files
    print('Indexing AIP files')
    # Even though we treat MODS identifiers as SIP-level, we need to index
    # them here because the archival storage tab actually searches on the
    # aips/aipfile index.
    files_status = elasticSearchFunctions.index_files(
        es_client,
        index='aips',
        type_='aipfile',
        uuid=sip_uuid,
        pathToArchive=sip_path,
        identifiers=identifiers,
        sipName=sip_name,
    )
    if files_status != 1:
        return 0

    print('Error indexing AIP files', file=sys.stderr)
    return 1
def handle(self, *args, **options):
    """Rebuild the AIP index from the AIPs found under ``rootdir``.

    Walks ``options['rootdir']`` and processes every directory
    (uncompressed AIP) or file (compressed AIP) whose name carries a UUID
    suffix, optionally restricted to the AIP given in ``options['uuid']``.
    With ``options['delete_all']`` the ``aips`` index is deleted and the
    indexes are recreated first.

    Exits with status 1 when ``rootdir`` is not a directory or when
    Elasticsearch cannot be queried.
    """
    # Check root directory exists
    if not os.path.isdir(options['rootdir']):
        print("AIP store location doesn't exist.")
        sys.exit(1)

    # Verify ES is accessible
    elasticSearchFunctions.setup_reading_from_conf(django_settings)
    es_client = elasticSearchFunctions.get_client()

    try:
        es_client.info()
    except Exception:
        print("Error: Elasticsearch may not be running.")
        sys.exit(1)

    # Delete existing data also clears AIPS not found in the
    # provided directory
    if options['delete_all']:
        print('Deleting all AIPs in the AIP index')
        time.sleep(3)  # Time for the user to panic and kill the process
        es_client.indices.delete('aips', ignore=404)
        elasticSearchFunctions.create_indexes_if_needed(es_client)

    if not options['uuid']:
        print("Rebuilding AIPS index from AIPS in", options['rootdir'])
    else:
        print("Rebuilding AIP UUID", options['uuid'])

    temp_dir = tempfile.mkdtemp()
    count = 0
    name_regex = \
        r"-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"
    dir_regex = \
        r"-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$"

    # The temporary directory is removed even if indexing raises.
    try:
        for root, directories, files in os.walk(options['rootdir']):
            # Ignore top-level directories inside ``rootdir`` that are not
            # hex, e.g. we walk ``0771`` but we're ignoring
            # ``transferBacklog``.
            if root == options['rootdir']:
                directories[:] = [
                    d for d in directories if is_hex(d) and len(d) == 4
                ]

            # Uncompressed AIPs. Iterate over a snapshot: matched entries
            # are removed from ``directories`` inside the loop, and
            # removing from the list being iterated would skip the next
            # sibling.
            for directory in list(directories):
                # Check if dir name matches AIP name format
                if not re.search(dir_regex, directory):
                    continue
                # If running on a single AIP, skip all others
                if options['uuid'] and \
                        options['uuid'].lower() not in directory.lower():
                    continue
                count += 1
                processAIPThenDeleteMETSFile(
                    path=os.path.join(root, directory),
                    temp_dir=temp_dir,
                    es_client=es_client,
                    delete_existing_data=options['delete'],
                )
                # Don't recurse into this directory. ``list.remove``
                # returns None, so the list is pruned in place and the
                # name is not rebound.
                directories.remove(directory)

            # Compressed AIPs
            for filename in files:
                # Check if filename matches AIP name format
                if not re.search(name_regex, filename):
                    continue
                # If running on a single AIP, skip all others
                if options['uuid'] and \
                        options['uuid'].lower() not in filename.lower():
                    continue
                count += 1
                processAIPThenDeleteMETSFile(
                    path=os.path.join(root, filename),
                    temp_dir=temp_dir,
                    es_client=es_client,
                    delete_existing_data=options['delete'],
                )
    finally:
        print("Cleaning up")
        shutil.rmtree(temp_dir)

    print("Indexing complete. Indexed", count, "AIPs")