def handle(self, *args, **options):
        """Entry point of the rebuild_transfer_backlog command."""
        if not self.confirm(options['no_prompt']):
            sys.exit(0)

        transfer_backlog_dir = self.prepdir(options['transfer_backlog_dir'])
        if not os.path.exists(transfer_backlog_dir):
            raise CommandError('Directory does not exist: %s' %
                               transfer_backlog_dir)
        self.success('Rebuilding "transfers" index from {}.'.format(
            transfer_backlog_dir))

        # Connect to Elasticsearch.
        elasticSearchFunctions.setup_reading_from_conf(django_settings)
        es_client = elasticSearchFunctions.get_client()
        try:
            es_info = es_client.info()
        except Exception as err:
            raise CommandError("Unable to connect to Elasticsearch: %s" % err)
        else:
            self.success('Connected to Elasticsearch node {} (v{}).'.format(
                es_info['name'], es_info['version']['number']))

        self.delete_index(es_client)
        self.create_index(es_client)
        self.populate_index(es_client, transfer_backlog_dir)
        self.success('Indexing complete!')
Example #2
def setup_es_for_aip_reindexing(cmd, delete_all=False):
    """Setup for reindexing AIPs.

    :param cmd: Command object.
    :param delete_all: Optional arg to delete AIP indices.

    :returns: ES client.
    """
    if es.AIPS_INDEX not in django_settings.SEARCH_ENABLED:
        raise CommandError(
            "The AIPs indexes are not enabled. Please, make sure to "
            "set the *_SEARCH_ENABLED environment variables to `true` "
            "to enable the AIPs and Transfers indexes, or to `aips` "
            "to only enable the AIPs indexes.")

    try:
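        # Configure the client module from Django settings and grab the
        # shared Elasticsearch client; connection failures surface as
        # ElasticsearchException and are re-raised as CommandError.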
        es.setup_reading_from_conf(django_settings)
        es_client = es.get_client()
    except ElasticsearchException as err:
        raise CommandError(
            "Unable to connect to Elasticsearch: %s" % err)

    if delete_all:
        cmd.info("Deleting all AIPs in the 'aips' and 'aipfiles' indices")
        time.sleep(3)  # Time for the user to panic and kill the process.
        indices = [es.AIPS_INDEX, es.AIP_FILES_INDEX]
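        # ``ignore=404`` turns the delete into a no-op when an index is
        # already absent.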
        es_client.indices.delete(",".join(indices), ignore=404)
        es.create_indexes_if_needed(es_client, indices)

    return es_client
Example #3
def call(jobs):
    with transaction.atomic():
        for job in jobs:
            with job.JobContext(logger=logger):
                if 'transfers' not in mcpclient_settings.SEARCH_ENABLED:
                    logger.info(
                        'Skipping indexing: Transfers indexing is currently disabled.'
                    )
                    job.set_status(0)
                    continue

                transfer_path = job.args[1]
                transfer_uuid = job.args[2]
                try:
                    status = job.args[3]
                except IndexError:
                    status = ''

                elasticSearchFunctions.setup_reading_from_conf(
                    mcpclient_settings)
                client = elasticSearchFunctions.get_client()
                job.set_status(
                    elasticSearchFunctions.index_transfer_and_files(
                        client,
                        transfer_uuid,
                        transfer_path,
                        status=status,
                        printfn=job.pyprint,
                    ))
Example #4
def post_store_hook(job, sip_uuid):
    """
    Hook for doing any work after an AIP is stored successfully.
    """
    update_es = "transfers" in mcpclient_settings.SEARCH_ENABLED
    if update_es:
        elasticSearchFunctions.setup_reading_from_conf(mcpclient_settings)
        client = elasticSearchFunctions.get_client()
    else:
        logger.info(
            "Skipping indexing: Transfers indexing is currently disabled.")

    # SIP ARRANGEMENT

    # Mark files in this SIP as in an AIP (aip_created)
    file_uuids = models.File.objects.filter(sip=sip_uuid).values_list(
        "uuid", flat=True)
    models.SIPArrange.objects.filter(file_uuid__in=file_uuids).update(
        aip_created=True)

    # Check if any of the component transfers are completely stored
    # TODO Storage service should index AIPs, knows when to update ES
    transfer_uuids = set(
        models.SIPArrange.objects.filter(file_uuid__in=file_uuids).values_list(
            "transfer_uuid", flat=True))
    for transfer_uuid in transfer_uuids:
        job.pyprint("Checking if transfer", transfer_uuid,
                    "is fully stored...")
        arranged_uuids = set(
            models.SIPArrange.objects.filter(
                transfer_uuid=transfer_uuid).filter(
                    aip_created=True).values_list("file_uuid", flat=True))
        backlog_uuids = set(
            models.File.objects.filter(transfer=transfer_uuid).values_list(
                "uuid", flat=True))
        # If all backlog UUIDs have been arranged
        if arranged_uuids == backlog_uuids:
            job.pyprint(
                "Transfer",
                transfer_uuid,
                "fully stored, sending delete request to storage service, deleting from transfer backlog",
            )
            # Submit delete req to SS (not actually delete), remove from ES
            storage_service.request_file_deletion(
                uuid=transfer_uuid,
                user_id=0,
                user_email="archivematica system",
                reason_for_deletion="All files in Transfer are now in AIPs.",
            )
            if update_es:
                elasticSearchFunctions.remove_sip_transfer_files(
                    client, transfer_uuid)

    # DSPACE HANDLE TO ARCHIVESSPACE
    dspace_handle_to_archivesspace(job, sip_uuid)

    # POST-STORE CALLBACK
    storage_service.post_store_aip_callback(sip_uuid)
Example #5
def _index_transfer(job, transfer_id, transfer_path, size):
    """Index the transfer and its files in Elasticsearch."""
    if "transfers" not in mcpclient_settings.SEARCH_ENABLED:
        logger.info("Skipping indexing:" " Transfers indexing is currently disabled.")
        return
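    # Configure and fetch the shared Elasticsearch client, then index the
    # transfer together with all of its files.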
    elasticSearchFunctions.setup_reading_from_conf(mcpclient_settings)
    client = elasticSearchFunctions.get_client()
    elasticSearchFunctions.index_transfer_and_files(
        client, transfer_id, transfer_path, size, printfn=job.pyprint
    )
Example #6
def index_aip(job):
    """Write AIP information to ElasticSearch. """
    sip_uuid = job.args[1]  # %SIPUUID%
    sip_name = job.args[2]  # %SIPName%
    sip_staging_path = job.args[3]  # %SIPDirectory%
    sip_type = job.args[4]  # %SIPType%
    if "aips" not in mcpclient_settings.SEARCH_ENABLED:
        logger.info("Skipping indexing: AIPs indexing is currently disabled.")
        return 0
    elasticSearchFunctions.setup_reading_from_conf(mcpclient_settings)
    client = elasticSearchFunctions.get_client()
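    # get_file_info returns a list of matches; this AIP's record is
    # expected to be the first element.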
    aip_info = storage_service.get_file_info(uuid=sip_uuid)
    job.pyprint("AIP info:", aip_info)
    aip_info = aip_info[0]
    mets_staging_path = os.path.join(sip_staging_path,
                                     "METS.{}.xml".format(sip_uuid))
    identifiers = get_identifiers(job, sip_staging_path)
    # If this is an AIC, find the number of AIPs stored in it and index that
    aips_in_aic = None
    if sip_type == "AIC":
        try:
            uv = UnitVariable.objects.get(unittype="SIP",
                                          unituuid=sip_uuid,
                                          variable="AIPsinAIC")
            aips_in_aic = uv.variablevalue
        except UnitVariable.DoesNotExist:
            pass
    # Delete ES index before creating new one if reingesting
    if "REIN" in sip_type:
        job.pyprint(
            "Deleting outdated entry for AIP and AIP files with UUID",
            sip_uuid,
            "from archival storage",
        )
        elasticSearchFunctions.delete_aip(client, sip_uuid)
        elasticSearchFunctions.delete_aip_files(client, sip_uuid)
    job.pyprint("Indexing AIP and AIP files")
    # Even though we treat MODS identifiers as SIP-level, we need to index them
    # here because the archival storage tab actually searches on the
    # aips/aipfile index.
    ret = elasticSearchFunctions.index_aip_and_files(
        client=client,
        uuid=sip_uuid,
        aip_stored_path=aip_info["current_full_path"],
        mets_staging_path=mets_staging_path,
        name=sip_name,
        aip_size=aip_info["size"],
        aips_in_aic=aips_in_aic,
        identifiers=identifiers,
        encrypted=aip_info["encrypted"],
        printfn=job.pyprint,
    )
    if ret == 1:
        job.pyprint("Error indexing AIP and AIP files", file=sys.stderr)
    return ret
Example #7
    def handle(self, *args, **options):
        """Entry point of the rebuild_transfer_backlog command."""
        # Check that the `transfers` part of the search is enabled
        if es.TRANSFERS_INDEX not in django_settings.SEARCH_ENABLED:
            print(
                "The Transfers indexes are not enabled. Please, make sure to "
                "set the *_SEARCH_ENABLED environment variables to `true` "
                "to enable the Transfers and AIPs indexes, or to `transfers` "
                "to only enable the Transfers indexes.")
            sys.exit(1)

        if not self.confirm(options["no_prompt"]):
            sys.exit(0)

        # Ignore elasticsearch-py logging events unless they're errors.
        logging.getLogger("elasticsearch").setLevel(logging.ERROR)
        logging.getLogger("archivematica.common").setLevel(logging.ERROR)

        transfer_backlog_dir = self.prepdir(options["transfer_backlog_dir"])
        if options["from_storage_service"]:
            self.info(
                'Rebuilding "transfers" index from packages in Storage Service.'
            )
        else:
            if not os.path.exists(transfer_backlog_dir):
                raise CommandError("Directory does not exist: %s",
                                   transfer_backlog_dir)
            self.info('Rebuilding "transfers" index from {}.'.format(
                transfer_backlog_dir))

        # Connect to Elasticsearch.
        es.setup_reading_from_conf(django_settings)
        es_client = es.get_client()
        try:
            es_info = es_client.info()
        except Exception as err:
            raise CommandError("Unable to connect to Elasticsearch: %s" % err)
        else:
            self.info("Connected to Elasticsearch node {} (v{}).".format(
                es_info["name"], es_info["version"]["number"]))

        indexes = [es.TRANSFERS_INDEX, es.TRANSFER_FILES_INDEX]
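        # Drop and recreate both transfer indices so they are rebuilt
        # from scratch.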
        self.delete_indexes(es_client, indexes)
        self.create_indexes(es_client, indexes)

        if options["from_storage_service"]:
            pipeline_uuid = options["pipeline"]
            self.populate_data_from_storage_service(es_client, pipeline_uuid)
        else:
            self.populate_data_from_files(es_client, transfer_backlog_dir)
Example #8
def call(jobs):
    for job in jobs:
        with job.JobContext(logger=logger):
            aip_uuid = job.args[1]

            if "aips" not in mcpclient_settings.SEARCH_ENABLED:
                logger.info("Skipping. AIPs indexing is currently disabled.")
                return

            elasticSearchFunctions.setup_reading_from_conf(mcpclient_settings)
            client = elasticSearchFunctions.get_client()

            logger.info("Removing indexed files for AIP %s...", aip_uuid)
            elasticSearchFunctions.delete_aip_files(client, aip_uuid)
Example #9
    def handle(self, *args, **options):
        """Entry point of the rebuild_transfer_backlog command."""
        # Check that the `transfers` part of the search is enabled
        if 'transfers' not in django_settings.SEARCH_ENABLED:
            print(
                "The Transfers indexes are not enabled. Please, make sure to "
                "set the *_SEARCH_ENABLED environment variables to `true` "
                "to enable the Transfers and AIPs indexes, or to `transfers` "
                "to only enable the Transfers indexes.")
            sys.exit(1)

        if not self.confirm(options['no_prompt']):
            sys.exit(0)

        transfer_backlog_dir = self.prepdir(options['transfer_backlog_dir'])
        if not os.path.exists(transfer_backlog_dir):
            raise CommandError('Directory does not exist: %s' %
                               transfer_backlog_dir)
        self.success('Rebuilding "transfers" index from {}.'.format(
            transfer_backlog_dir))

        # Connect to Elasticsearch.
        elasticSearchFunctions.setup_reading_from_conf(django_settings)
        es_client = elasticSearchFunctions.get_client()
        try:
            es_info = es_client.info()
        except Exception as err:
            raise CommandError("Unable to connect to Elasticsearch: %s" % err)
        else:
            self.success('Connected to Elasticsearch node {} (v{}).'.format(
                es_info['name'], es_info['version']['number']))

        indexes = ['transfers', 'transferfiles']
        self.delete_indexes(es_client, indexes)
        self.create_indexes(es_client, indexes)
        self.populate_indexes(es_client, transfer_backlog_dir)
        self.success('Indexing complete!')
Example #10
# -*- coding: utf-8 -*-
from __future__ import absolute_import

import django
from django.conf import settings
from django.core.wsgi import get_wsgi_application

django.setup()
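# Modules from archivematicaCommon expect a configured Django environment,
# which is presumably why this import happens after django.setup().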
import elasticSearchFunctions

application = get_wsgi_application()

# Set up Elasticsearch client
elasticSearchFunctions.setup_reading_from_conf(settings)
Example #11
import sys

# archivematicaCommon
from custom_handlers import get_script_logger
import elasticSearchFunctions

from django.conf import settings as mcpclient_settings

logger = get_script_logger(
    'archivematica.mcp.client.elasticSearchIndexProcessTransfer')

if __name__ == '__main__':
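    # Exit early (and successfully) when indexing is disabled entirely.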
    if not mcpclient_settings.SEARCH_ENABLED:
        logger.info('Skipping indexing: indexing is currently disabled.')
        sys.exit(0)

    transfer_path = sys.argv[1]
    transfer_uuid = sys.argv[2]
    try:
        status = sys.argv[3]
    except IndexError:
        status = ''

    elasticSearchFunctions.setup_reading_from_conf(mcpclient_settings)
    client = elasticSearchFunctions.get_client()
    sys.exit(
        elasticSearchFunctions.index_files(client,
                                           'transfers',
                                           'transferfile',
                                           transfer_uuid,
                                           transfer_path,
                                           status=status))
Example #12
    def handle(self, *args, **options):
        # Check that the `aips` part of the search is enabled
        if "aips" not in django_settings.SEARCH_ENABLED:
            print("The AIPs indexes are not enabled. Please, make sure to "
                  "set the *_SEARCH_ENABLED environment variables to `true` "
                  "to enable the AIPs and Transfers indexes, or to `aips` "
                  "to only enable the AIPs indexes.")
            sys.exit(1)

        # Check root directory exists
        if not os.path.isdir(options["rootdir"]):
            print("AIP store location doesn't exist.")
            sys.exit(1)

        # Verify ES is accessible
        elasticSearchFunctions.setup_reading_from_conf(django_settings)
        es_client = elasticSearchFunctions.get_client()

        try:
            es_client.info()
        except Exception:
            print("Error: Elasticsearch may not be running.")
            sys.exit(1)

        # Deleting existing data also clears AIPs not found in the
        # provided directory.
        if options["delete_all"]:
            print("Deleting all AIPs in the AIP index")
            time.sleep(3)  # Time for the user to panic and kill the process
            indexes = ["aips", "aipfiles"]
            es_client.indices.delete(",".join(indexes), ignore=404)
            elasticSearchFunctions.create_indexes_if_needed(es_client, indexes)

        if not options["uuid"]:
            print("Rebuilding AIPS index from AIPS in", options["rootdir"])
        else:
            print("Rebuilding AIP UUID", options["uuid"])

        temp_dir = tempfile.mkdtemp()
        count = 0
        name_regex = r"-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"
        dir_regex = r"-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$"

        for root, directories, files in scandir.walk(options["rootdir"]):
            # Ignore top-level directories inside ``rootdir`` that are not hex,
            # e.g. we walk ``0771`` but we're ignoring ``transferBacklog``.
            if root == options["rootdir"]:
                directories[:] = [
                    d for d in directories if is_hex(d) and len(d) == 4
                ]

            # Uncompressed AIPs
            for directory in list(directories):
                # Check if dir name matches AIP name format
                match = re.search(dir_regex, directory)
                if not match:
                    continue
                # If running on a single AIP, skip all others
                if options["uuid"] and options["uuid"].lower(
                ) not in directory.lower():
                    continue
                ret = processAIPThenDeleteMETSFile(
                    path=os.path.join(root, directory),
                    temp_dir=temp_dir,
                    es_client=es_client,
                    delete_existing_data=options["delete"],
                )
                # Don't recurse into this directory: in-place removal prunes
                # the walk (``list.remove`` returns None, so never reassign).
                directories.remove(directory)
                # Update count on successful index
                if ret == 0:
                    count += 1

            # Compressed AIPs
            for filename in files:
                # Check if filename matches AIP name format
                match = re.search(name_regex, filename)
                if not match:
                    continue
                # If running on a single AIP, skip all others
                if options["uuid"] and options["uuid"].lower(
                ) not in filename.lower():
                    continue
                ret = processAIPThenDeleteMETSFile(
                    path=os.path.join(root, filename),
                    temp_dir=temp_dir,
                    es_client=es_client,
                    delete_existing_data=options["delete"],
                )
                # Update count on successful index
                if ret == 0:
                    count += 1

        print("Cleaning up")

        shutil.rmtree(temp_dir)

        print("Indexing complete. Indexed", count, "AIP(s).")
Example #13
    def handle(self, *args, **options):
        # Check that the AIPs index is enabled before proceeding.
        if es.AIPS_INDEX not in settings.SEARCH_ENABLED:
            self.error(
                "The AIPs indexes are not enabled. Please, make sure to "
                "set the *_SEARCH_ENABLED environment variables to `true` "
                "to enable the AIPs and Transfers indexes, or to `aips` "
                "to only enable the AIPs indexes.")
            sys.exit(1)

        try:
            es.setup_reading_from_conf(settings)
            es_client = es.get_client()
        except ElasticsearchException:
            self.error("Error: Elasticsearch may not be running.")
            sys.exit(1)

        # Update the AIPs index mappings.
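        # ``keyword`` fields are indexed unanalyzed, which makes them
        # suitable for exact matching and aggregations.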
        es_client.indices.put_mapping(
            index=es.AIPS_INDEX,
            doc_type=es.DOC_TYPE,
            body={
                "properties": {
                    es.ES_FIELD_ACCESSION_IDS: {
                        "type": "keyword"
                    },
                    es.ES_FIELD_STATUS: {
                        "type": "keyword"
                    },
                    es.ES_FIELD_FILECOUNT: {
                        "type": "integer"
                    },
                    es.ES_FIELD_LOCATION: {
                        "type": "keyword"
                    },
                }
            },
        )

        # Update the AIP files index mapping.
        es_client.indices.put_mapping(
            index=es.AIP_FILES_INDEX,
            doc_type=es.DOC_TYPE,
            body={
                "properties": {
                    "accessionid": {
                        "type": "keyword"
                    },
                    es.ES_FIELD_STATUS: {
                        "type": "keyword"
                    },
                    "filePath": {
                        "type": "text",
                        "analyzer": "file_path_and_name",
                        "fields": {
                            "raw": {
                                "type": "keyword"
                            }
                        },
                    },
                }
            },
        )

        # Perform an update by query on the aipfiles index to populate
        # the filePath.raw subfield from existing text values. We do
        # not specify a query to ensure that all documents are updated.
        es_client.update_by_query(es.AIP_FILES_INDEX)
Example #14
def index_aip():
    """ Write AIP information to ElasticSearch. """
    sip_uuid = sys.argv[1]  # %SIPUUID%
    sip_name = sys.argv[2]  # %SIPName%
    sip_path = sys.argv[3]  # %SIPDirectory%
    sip_type = sys.argv[4]  # %SIPType%

    if not mcpclient_settings.SEARCH_ENABLED:
        logger.info('Skipping indexing: indexing is currently disabled.')
        return 0

    elasticSearchFunctions.setup_reading_from_conf(mcpclient_settings)
    client = elasticSearchFunctions.get_client()

    print('SIP UUID:', sip_uuid)
    aip_info = storage_service.get_file_info(uuid=sip_uuid)
    print('AIP info:', aip_info)
    aip_info = aip_info[0]

    mets_name = 'METS.{}.xml'.format(sip_uuid)
    mets_path = os.path.join(sip_path, mets_name)

    identifiers = get_identifiers(sip_path)

    # If this is an AIC, find the number of AIPs stored in it and index that
    aips_in_aic = None
    if sip_type == "AIC":
        try:
            uv = UnitVariable.objects.get(unittype="SIP",
                                          unituuid=sip_uuid,
                                          variable="AIPsinAIC")
            aips_in_aic = uv.variablevalue
        except UnitVariable.DoesNotExist:
            pass

    print('Indexing AIP info')
    # Delete ES index before creating new one if reingesting
    if 'REIN' in sip_type:
        print('Deleting outdated entry for AIP and AIP files with UUID',
              sip_uuid, 'from archival storage')
        elasticSearchFunctions.delete_aip(client, sip_uuid)
        elasticSearchFunctions.delete_aip_files(client, sip_uuid)

    # Index AIP
    elasticSearchFunctions.index_aip(client,
                                     sip_uuid,
                                     sip_name,
                                     aip_info['current_full_path'],
                                     mets_path,
                                     size=aip_info['size'],
                                     aips_in_aic=aips_in_aic,
                                     identifiers=identifiers,
                                     encrypted=aip_info['encrypted'])

    # Index AIP files
    print('Indexing AIP files')
    # Even though we treat MODS identifiers as SIP-level, we need to index them
    # here because the archival storage tab actually searches on the
    # aips/aipfile index.
    exitCode = elasticSearchFunctions.index_files(
        client,
        index='aips',
        type_='aipfile',
        uuid=sip_uuid,
        pathToArchive=sip_path,
        identifiers=identifiers,
        sipName=sip_name,
    )
    if exitCode == 1:
        print('Error indexing AIP files', file=sys.stderr)
        return 1

    return 0
Example #15
    def handle(self, *args, **options):
        # Check root directory exists
        if not os.path.isdir(options['rootdir']):
            print("AIP store location doesn't exist.")
            sys.exit(1)

        # Verify ES is accessible
        elasticSearchFunctions.setup_reading_from_conf(django_settings)
        es_client = elasticSearchFunctions.get_client()

        try:
            es_client.info()
        except Exception:
            print("Error: Elasticsearch may not be running.")
            sys.exit(1)

        # Deleting existing data also clears AIPs not found in the
        # provided directory.
        if options['delete_all']:
            print('Deleting all AIPs in the AIP index')
            time.sleep(3)  # Time for the user to panic and kill the process
            es_client.indices.delete('aips', ignore=404)
            elasticSearchFunctions.create_indexes_if_needed(es_client)

        if not options['uuid']:
            print("Rebuilding AIPS index from AIPS in", options['rootdir'])
        else:
            print("Rebuilding AIP UUID", options['uuid'])

        temp_dir = tempfile.mkdtemp()
        count = 0
        name_regex = \
            r"-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"
        dir_regex = \
            r"-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$"

        for root, directories, files in os.walk(options['rootdir']):
            # Ignore top-level directories inside ``rootdir`` that are not hex,
            # e.g. we walk ``0771`` but we're ignoring ``transferBacklog``.
            if root == options['rootdir']:
                directories[:] = [
                    d for d in directories if is_hex(d) and len(d) == 4
                ]

            # Uncompressed AIPs
            for directory in list(directories):
                # Check if dir name matches AIP name format
                match = re.search(dir_regex, directory)
                if not match:
                    continue
                # If running on a single AIP, skip all others
                if options['uuid'] and \
                        options['uuid'].lower() not in directory.lower():
                    continue
                count += 1
                processAIPThenDeleteMETSFile(
                    path=os.path.join(root, directory),
                    temp_dir=temp_dir,
                    es_client=es_client,
                    delete_existing_data=options['delete'],
                )
                # Don't recurse into this directory: in-place removal prunes
                # the walk (``list.remove`` returns None, so never reassign).
                directories.remove(directory)

            # Compressed AIPs
            for filename in files:
                # Check if filename matches AIP name format
                match = re.search(name_regex, filename)
                if not match:
                    continue
                # If running on a single AIP, skip all others
                if options['uuid'] and \
                        options['uuid'].lower() not in filename.lower():
                    continue
                count += 1
                processAIPThenDeleteMETSFile(
                    path=os.path.join(root, filename),
                    temp_dir=temp_dir,
                    es_client=es_client,
                    delete_existing_data=options['delete'],
                )

        print("Cleaning up")

        shutil.rmtree(temp_dir)

        print("Indexing complete. Indexed", count, "AIPs")